library(haven)
#df <- read_sas("C:/Users/Momo/Desktop/R - Learning/Depression project/nhanes.sas7bdat")
df <- read_sas("/Users/thien/Desktop/R-dir/Depressive Disorder project/nhanes.sas7bdat")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.3.1 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Filter the dataframe
df1 <- df %>%
filter(DPQ010 != 7 & DPQ010 != 9 & !is.na(DPQ010))
df2 <- df1 %>%
filter(DPQ020 != 7 & DPQ020 != 9 & !is.na(DPQ020))
df3 <- df2 %>%
filter(DPQ030 != 7 & DPQ030 != 9 & !is.na(DPQ030))
df4 <- df3 %>%
filter(DPQ040 != 7 & DPQ040 != 9 & !is.na(DPQ040))
df5 <- df4 %>%
filter(DPQ050 != 7 & DPQ050 != 9 & !is.na(DPQ050))
df6 <- df5 %>%
filter(DPQ060 != 7 & DPQ060 != 9 & !is.na(DPQ060))
df7 <- df6 %>%
filter(DPQ070 != 7 & DPQ070 != 9 & !is.na(DPQ070))
df8 <- df7 %>%
filter(DPQ080 != 7 & DPQ080 != 9 & !is.na(DPQ080))
df9 <- df8 %>%
filter(DPQ090 != 7 & DPQ090 != 9 & !is.na(DPQ090))
# Create depression variable:
depress <- df9
depress$dp = depress$DPQ010 + depress$DPQ020 + depress$DPQ030 + depress$DPQ040 + depress$DPQ050 +
depress$DPQ060 + depress$DPQ070 + depress$DPQ080 + depress$DPQ090
# Create the new variable 'dp10'
depress <- depress %>%
mutate(dp10 = ifelse(dp <= 9, 0, 1))
# Display frequency table for dp10
table(depress$dp10)
##
## 0 1
## 4861 511
# Smoked at least 100 cigarettes in life
table(depress$SMQ020)
##
## 1 2 9
## 2253 3117 2
# Do you now smoke cigarettes
table(depress$SMQ040)
##
## 1 2 3
## 850 206 1197
# How long since quit smoking cigarettes
table(depress$SMQ050Q)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 46 61 61 51 54 47 37 30 19 52 6 30 19
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## 28 59 9 13 22 9 76 13 18 19 14 51 13
## 27 28 29 30 31 32 33 34 35 36 37 38 39
## 6 16 3 79 6 10 15 7 25 4 8 4 11
## 40 41 42 43 44 45 46 47 48 49 66666 99999
## 38 6 8 4 13 17 4 3 3 1 40 9
# Create a new variable 'smoking_status'
depress$smoking_status <- NA
# Define smoking status categories based on SMQ020, SMQ040, and SMQ050Q
depress$smoking_status[depress$SMQ020 == 2] <- "Never" # Never smokers
depress$smoking_status[depress$SMQ040 %in% c(1, 2)] <- "Current" # Current smokers
#depress$smoking_status[depress$SMQ040 == 1 & depress$SMQ040 == 2] <- "Current" # Current smokers
#depress$smoking_status[depress$SMQ020 == 1 & depress$SMQ040 == 3] <- "Former" # Former smokers
depress$smoking_status[is.na(depress$smoking_status)] <- "Former"
# Replace missing values (NA) with "Former" for those who have quit but have missing values in SMQ040
#depress$smoking_status[is.na(depress$smoking_status) & !is.na(depress$SMQ050Q)] <- "Former"
# Replace 66666 and 99999 codes in SMQ050Q with "Former" category
#depress$smoking_status[depress$SMQ050Q %in% c(66666, 77777, 99999)] <- "Former"
# Convert 'smoking_status' to factor with specified levels
depress$smoking_status <- factor(depress$smoking_status, levels = c("Current", "Former", "Never"))
# Display the table of smoking status
table(depress$smoking_status)
##
## Current Former Never
## 1056 1199 3117
# Had at least 12 alcohol drinks/lifetime?
table(depress$ALQ110)
##
## 1 2 9
## 682 914 3
# Had at least 12 alcohol drinks/1 yr?
table(depress$ALQ101)
##
## 1 2 9
## 3771 1591 8
# How often drink alcohol over past 12 mos
table(depress$ALQ120Q)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 867 1042 884 521 302 263 141 193 22 4 70 2 24 1 6 22
## 16 17 18 20 24 25 28 30 35 40 45 52 61 80 90 96
## 2 1 1 24 4 4 2 20 1 2 1 1 1 2 1 1
## 100 112 120 160 180 182 200 300 305 350 365 999
## 2 1 1 1 1 1 1 1 1 1 6 4
# Create a new variable 'drinking_status'
depress$drinking_status <- NA
# Define categories based on ALQ101, ALQ110, and ALQ120Q
depress$drinking_status[depress$ALQ101 == 2 & depress$ALQ110 == 2] <- "Never" # Never drinkers
depress$drinking_status[depress$ALQ101 == 1] <- "Current" # Current drinkers
# depress$drinking_status[depress$ALQ110 == 1] <- "Former" # Former drinkers
depress$drinking_status[is.na(depress$drinking_status)] <- "Former"
# Replace missing values (NA) with "Former" for those who have quit but have missing values in ALQ110
# depress$drinking_status[is.na(depress$drinking_status) & !is.na(depress$ALQ120Q)] <- "Former"
# Replace 999 codes in ALQ120Q with "Former" category
# depress$drinking_status[depress$ALQ120Q == 999] <- "Former"
# Convert 'drinking_status' to factor with specified levels
depress$drinking_status <- factor(depress$drinking_status, levels = c("Current", "Former", "Never"))
# Display the table of drinking status
table(depress$drinking_status)
##
## Current Former Never
## 3771 687 914
depress$PA <- NA
# Define categories based on PAQ605, PAQ620, PAQ635, PAQ650, and PAQ665
depress$PA[depress$PAQ605 == 1 | depress$PAQ650 == 1] <- "Vigorous" # Vigorous activity
depress$PA[depress$PAQ620 == 1 | depress$PAQ635 == 1 | depress$PAQ665 == 1] <- "Moderate" # Moderate activity
# For remaining participants, assign "Mild" category
depress$PA[is.na(depress$PA)] <- "Mild"
# Convert 'PA' to factor with specified levels
depress$PA <- factor(depress$PA, levels = c("Mild", "Moderate", "Vigorous"))
# Display frequency table for the PA variable
table(depress$PA)
##
## Mild Moderate Vigorous
## 1320 3713 339
depress$Fa_inc <- NA
# Define categories
depress$Fa_inc[depress$INDFMIN2 == 1 | depress$INDFMIN2 == 2 | depress$INDFMIN2 == 3 | depress$INDFMIN2 == 4 | depress$INDFMIN2 == 13] <- "Very-Low"
depress$Fa_inc[depress$INDFMIN2 == 5 | depress$INDFMIN2 == 6 | depress$INDFMIN2 == 7 | depress$INDFMIN2 == 8 | depress$INDFMIN2 == 9 | depress$INDFMIN2 == 10 | depress$INDFMIN2 == 12] <- "Low-Medium"
depress$Fa_inc[depress$INDFMIN2 == 14 ] <- "Medium-High"
depress$Fa_inc[depress$INDFMIN2 == 15 ] <- "High"
depress$Fa_inc[depress$INDFMIN2 == 77 | depress$INDFMIN2 == 99] <- NA
depress$Fa_inc <- factor(depress$Fa_inc, levels = c("Very-Low", "Low-Medium", "Medium-High", "High"))
table(depress$Fa_inc)
##
## Very-Low Low-Medium Medium-High High
## 1210 2586 442 919
depress <- depress %>%
mutate(SBP = (BPXSY1 + BPXSY2) / 2,
DBP = (BPXDI1 + BPXDI2) / 2)
hypertension <- depress %>%
mutate(hyper = case_when(
SBP >= 140 | DBP >= 90 | BPQ040A == 1 | BPQ050A == 1 ~ 1,
TRUE ~ 0))
lipid <- hypertension %>%
mutate(dyslip = case_when(
LBXTC >= 200 | LBXTR >= 150 | LBDLDL >= 100 | LBDHDD <= 40 | BPQ090D == 1 ~ 1,
TRUE ~ 0))
diab <- lipid %>%
mutate(db = case_when(
LBXGLU >= 126 | LBXGLT >= 200 | LBXGH >= 6.5 | DIQ070 == 1 ~ 1,
TRUE ~ 0
))
subset_df <- subset(diab, select = c(SEQN, RIDAGEYR, RIAGENDR, RIDRETH3, DMDEDUC2,
DMDMARTL, SBP, DBP, BMXBMI, Fa_inc,
INDFMPIR, LBXSBU, LBXSCR, LBXSASSI,
LBXSATSI, LBXAPB, LBDHDD, LBDLDL, LBXTR,
LBXTC, LBXGLU, LBXGLT, drinking_status,
smoking_status, PA, LBXCOT, LBXHCT,
URXCOTT, URXHCTT, db, dyslip, hyper, dp10))
# Rename all variables in one codes
df_rename <- subset_df %>% rename(age = RIDAGEYR, sex = RIAGENDR, ethnic = RIDRETH3,
educ = DMDEDUC2, masts = DMDMARTL, BMI = BMXBMI,
PIR = INDFMPIR,
BUN = LBXSBU, Crea = LBXSCR, AST = LBXSASSI, ALT = LBXSATSI,
apoB = LBXAPB, HDLc = LBDHDD, LDLc = LBDLDL,
TG = LBXTR, TC = LBXTC, Glu = LBXGLU, tolGlu = LBXGLT,
sCOT = LBXCOT, sHCOT = LBXHCT, uCOT = URXCOTT, uHCOT = URXHCTT)
# Filter out values ‘7, 77’ and ‘9, 99’ from categorical variables column
# filtered_data <- df_rename %>%
# filter(educ %in% c('1', '2', '3', '4', '5'),
# masts %in% c('1', '2', '3', '4', '5', '6')) # I think we should dont filter out, just change them into NA
# Replace values '7' and '77' in educ column with NA
df_rename$educ[df_rename$educ %in% c('7', '9')] <- NA
# Replace values '9' and '99' in masts column with NA
df_rename$masts[df_rename$masts %in% c('77', '99')] <- NA
###########################################################################
# Define and Apply the MDRD Formula (modification of diet in renal disease)
calculate_GFR <- function(creatinine, age, gender) {
gender_multiplier <- ifelse(gender == "female", 0.742, 1)
eGFR <- 186 * (creatinine^(-1.154)) * (age^(-0.203)) * gender_multiplier
return(eGFR)
}
# 'sex' column contains 1 for male and 2 for female
df_rename$eGFR <- mapply(calculate_GFR,
creatinine = df_rename$Crea,
age = df_rename$age,
gender = ifelse(df_rename$sex == 2, "female", "male"))
# Viewing the first few entries of GFR values
head(df_rename$eGFR)
## [1] 63.19639 108.63640 62.06049 83.05920 93.97984 65.96047
# Getting a summary of GFR values
summary(df_rename$eGFR)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.181 75.131 90.452 91.827 106.662 254.554 225
str(df_rename)
## tibble [5,372 × 34] (S3: tbl_df/tbl/data.frame)
## $ SEQN : num [1:5372] 73557 73558 73559 73561 73562 ...
## ..- attr(*, "label")= chr "Respondent sequence number"
## $ age : num [1:5372] 69 54 72 73 56 61 56 65 26 76 ...
## ..- attr(*, "label")= chr "Age in years at screening"
## $ sex : num [1:5372] 1 1 1 2 1 2 2 1 2 1 ...
## ..- attr(*, "label")= chr "Gender"
## $ ethnic : num [1:5372] 4 3 3 3 1 3 3 3 3 3 ...
## ..- attr(*, "label")= chr "Race/Hispanic origin w/ NH Asian"
## $ educ : num [1:5372] 3 3 4 5 4 5 3 2 5 5 ...
## ..- attr(*, "label")= chr "Education level - Adults 20+"
## $ masts : num [1:5372] 4 1 1 1 3 2 3 2 5 1 ...
## ..- attr(*, "label")= chr "Marital status"
## $ SBP : num [1:5372] 118 158 140 135 159 121 126 141 103 NA ...
## ..- attr(*, "label")= chr "Systolic: Blood pres (1st rdg) mm Hg"
## $ DBP : num [1:5372] 74 71 83 87 83 80 73 78 61 NA ...
## ..- attr(*, "label")= chr "Diastolic: Blood pres (1st rdg) mm Hg"
## $ BMI : num [1:5372] 26.7 28.6 28.9 19.7 41.7 35.7 26.5 22 20.3 34.4 ...
## ..- attr(*, "label")= chr "Body Mass Index (kg/m**2)"
## $ Fa_inc : Factor w/ 4 levels "Very-Low","Low-Medium",..: 1 2 2 4 2 2 1 1 4 3 ...
## $ PIR : num [1:5372] 0.84 1.78 4.51 5 4.79 5 0.48 1.2 5 5 ...
## ..- attr(*, "label")= chr "Ratio of family income to poverty"
## $ BUN : num [1:5372] 10 16 14 31 18 17 9 15 12 17 ...
## ..- attr(*, "label")= chr "Blood urea nitrogen (mg/dL)"
## $ Crea : num [1:5372] 1.21 0.79 1.22 0.73 0.89 0.92 0.55 0.97 0.74 1.19 ...
## ..- attr(*, "label")= chr "Creatinine (mg/dL)"
## $ AST : num [1:5372] 16 18 22 36 24 20 23 29 20 28 ...
## ..- attr(*, "label")= chr "Aspartate aminotransferase AST (U/L)"
## $ ALT : num [1:5372] 16 29 16 28 16 21 24 20 23 27 ...
## ..- attr(*, "label")= chr "Alanine aminotransferase ALT (U/L)"
## $ apoB : num [1:5372] NA NA 57 92 NA 77 NA NA 59 NA ...
## ..- attr(*, "label")= chr "Apolipoprotein (B) (mg/dL)"
## $ HDLc : num [1:5372] 65 50 60 85 38 58 59 79 96 50 ...
## ..- attr(*, "label")= chr "Direct HDL-Cholesterol (mg/dL)"
## $ LDLc : num [1:5372] NA NA 56 101 NA 97 NA NA 67 NA ...
## ..- attr(*, "label")= chr "LDL-cholesterol (mg/dL)"
## $ TG : num [1:5372] NA NA 51 75 NA 64 NA NA 24 NA ...
## ..- attr(*, "label")= chr "Triglyceride (mg/dL)"
## $ TC : num [1:5372] 167 170 126 201 226 168 278 173 168 167 ...
## ..- attr(*, "label")= chr "Total Cholesterol( mg/dL)"
## $ Glu : num [1:5372] NA NA 193 107 NA 110 NA NA 89 NA ...
## ..- attr(*, "label")= chr "Fasting Glucose (mg/dL)"
## $ tolGlu : num [1:5372] NA NA NA NA NA 150 NA NA 80 NA ...
## ..- attr(*, "label")= chr "Two Hour Glucose(OGTT) (mg/dL)"
## $ drinking_status: Factor w/ 3 levels "Current","Former",..: 1 1 1 1 1 2 1 1 1 2 ...
## $ smoking_status : Factor w/ 3 levels "Current","Former",..: 2 1 2 3 2 3 1 1 3 3 ...
## $ PA : Factor w/ 3 levels "Mild","Moderate",..: 1 2 2 2 3 1 2 3 2 2 ...
## $ sCOT : num [1:5372] 6.51 4.07 0.052 0.011 0.011 0.015 362 210 0.011 0.011 ...
## ..- attr(*, "label")= chr "Cotinine, Serum (ng/mL)"
## $ sHCOT : num [1:5372] 1.33 4.48 0.055 0.011 0.011 0.011 154 234 0.011 0.011 ...
## ..- attr(*, "label")= chr "Hematocrit (%)"
## $ uCOT : num [1:5372] NA NA NA NA NA 0.615 NA 3420 NA NA ...
## ..- attr(*, "label")= chr "Total Cotinine, urine (ng/mL)"
## $ uHCOT : num [1:5372] NA NA NA NA NA 8.79e-01 NA 1.52e+04 NA NA ...
## ..- attr(*, "label")= chr "Total Hydroxycotinine, urine (ng/mL)"
## $ db : num [1:5372] 1 1 1 0 0 0 0 0 0 1 ...
## $ dyslip : num [1:5372] 1 1 1 1 1 0 1 0 0 1 ...
## $ hyper : num [1:5372] 1 1 1 1 1 1 0 1 0 1 ...
## $ dp10 : num [1:5372] 0 0 0 0 1 0 0 0 0 0 ...
## $ eGFR : num [1:5372] 63.2 108.6 62.1 83.1 94 ...
glimpse(df_rename)
## Rows: 5,372
## Columns: 34
## $ SEQN <dbl> 73557, 73558, 73559, 73561, 73562, 73564, 73566, 73567…
## $ age <dbl> 69, 54, 72, 73, 56, 61, 56, 65, 26, 76, 33, 32, 18, 38…
## $ sex <dbl> 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, …
## $ ethnic <dbl> 4, 3, 3, 3, 1, 3, 3, 3, 3, 3, 6, 1, 1, 4, 6, 6, 3, 1, …
## $ educ <dbl> 3, 3, 4, 5, 4, 5, 3, 2, 5, 5, 5, 1, NA, 3, 5, 4, 3, 4,…
## $ masts <dbl> 4, 1, 1, 1, 3, 2, 3, 2, 5, 1, 1, 6, NA, 1, 1, 1, 3, 5,…
## $ SBP <dbl> 118, 158, 140, 135, 159, 121, 126, 141, 103, NA, 123, …
## $ DBP <dbl> 74, 71, 83, 87, 83, 80, 73, 78, 61, NA, 61, 74, 61, 75…
## $ BMI <dbl> 26.7, 28.6, 28.9, 19.7, 41.7, 35.7, 26.5, 22.0, 20.3, …
## $ Fa_inc <fct> Very-Low, Low-Medium, Low-Medium, High, Low-Medium, Lo…
## $ PIR <dbl> 0.84, 1.78, 4.51, 5.00, 4.79, 5.00, 0.48, 1.20, 5.00, …
## $ BUN <dbl> 10, 16, 14, 31, 18, 17, 9, 15, 12, 17, 11, 17, NA, 10,…
## $ Crea <dbl> 1.21, 0.79, 1.22, 0.73, 0.89, 0.92, 0.55, 0.97, 0.74, …
## $ AST <dbl> 16, 18, 22, 36, 24, 20, 23, 29, 20, 28, 20, 30, NA, 17…
## $ ALT <dbl> 16, 29, 16, 28, 16, 21, 24, 20, 23, 27, 20, 44, NA, 13…
## $ apoB <dbl> NA, NA, 57, 92, NA, 77, NA, NA, 59, NA, 61, 111, NA, 1…
## $ HDLc <dbl> 65, 50, 60, 85, 38, 58, 59, 79, 96, 50, 53, 33, NA, 55…
## $ LDLc <dbl> NA, NA, 56, 101, NA, 97, NA, NA, 67, NA, 75, 119, NA, …
## $ TG <dbl> NA, NA, 51, 75, NA, 64, NA, NA, 24, NA, 14, 148, NA, 5…
## $ TC <dbl> 167, 170, 126, 201, 226, 168, 278, 173, 168, 167, 131,…
## $ Glu <dbl> NA, NA, 193, 107, NA, 110, NA, NA, 89, NA, 84, 104, NA…
## $ tolGlu <dbl> NA, NA, NA, NA, NA, 150, NA, NA, 80, NA, NA, 84, NA, 8…
## $ drinking_status <fct> Current, Current, Current, Current, Current, Former, C…
## $ smoking_status <fct> Former, Current, Former, Never, Former, Never, Current…
## $ PA <fct> Mild, Moderate, Moderate, Moderate, Vigorous, Mild, Mo…
## $ sCOT <dbl> 6.510, 4.070, 0.052, 0.011, 0.011, 0.015, 362.000, 210…
## $ sHCOT <dbl> 1.330, 4.480, 0.055, 0.011, 0.011, 0.011, 154.000, 234…
## $ uCOT <dbl> NA, NA, NA, NA, NA, 0.615, NA, 3420.000, NA, NA, NA, N…
## $ uHCOT <dbl> NA, NA, NA, NA, NA, 8.79e-01, NA, 1.52e+04, NA, NA, NA…
## $ db <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dyslip <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, …
## $ hyper <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dp10 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ eGFR <dbl> 63.19639, 108.63640, 62.06049, 83.05920, 93.97984, 65.…
table(df_rename$Fa_inc)
##
## Very-Low Low-Medium Medium-High High
## 1210 2586 442 919
table(df_rename$smoking_status)
##
## Current Former Never
## 1056 1199 3117
table(df_rename$drinking_status)
##
## Current Former Never
## 3771 687 914
table(df_rename$PA)
##
## Mild Moderate Vigorous
## 1320 3713 339
df_rename$Fa_inc_numeric <- as.numeric(factor(df_rename$Fa_inc, levels = c("Very-Low", "Low-Medium", "Medium-High", "High")))
df_rename$smoking_status_numeric <- as.numeric(factor(df_rename$smoking_status, levels = c("Never", "Former", "Current")))
df_rename$drinking_status_numeric <- as.numeric(factor(df_rename$drinking_status, levels = c("Never", "Former", "Current")))
df_rename$PA_numeric <- as.numeric(factor(df_rename$PA, levels = c("Mild", "Moderate", "Vigorous")))
# Check the result
table(df_rename$Fa_inc_numeric)
##
## 1 2 3 4
## 1210 2586 442 919
table(df_rename$smoking_status_numeric)
##
## 1 2 3
## 3117 1199 1056
table(df_rename$drinking_status_numeric)
##
## 1 2 3
## 914 687 3771
table(df_rename$PA_numeric)
##
## 1 2 3
## 1320 3713 339
df_rename <- df_rename %>% rename(target = dp10)
df_rename_new <- subset(df_rename, select = -c(Fa_inc, smoking_status, drinking_status, PA))
# Export the dataset as a CSV file
# write.csv(df_rename_new, "df_rename_new.csv", row.names = FALSE)
# Assuming "target" is the name of your target variable and "df_rename" is your dataframe
target <- df_rename_new$target # Extract the target column
df_rename_new <- df_rename_new[, !names(df_rename_new) %in% "target"] # Remove the target column
df_rename_new <- cbind(df_rename_new, target) # Add the target column at the end
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
df_rename_new <- df_rename_new %>% rename(PA = PA_numeric,
drksts = drinking_status_numeric,
smsts = smoking_status_numeric,
fa_income = Fa_inc_numeric)
ggcorr(df_rename_new, label = TRUE, label_size = 2.5, hjust = 1, layout.exp = 2)
library(rms)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
na.patterns = naclus(df_rename)
plot(na.patterns)
library(naniar)
gg_miss_var(df_rename)
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## ℹ The deprecated feature was likely used in the naniar package.
## Please report the issue at <https://github.com/njtierney/naniar/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
res<-summary(aggr(df_rename, sortVar=TRUE))$combinations
##
## Variables sorted by number of missings:
## Variable Count
## uCOT 0.67907669
## uHCOT 0.67907669
## tolGlu 0.64259121
## LDLc 0.54709605
## apoB 0.54039464
## TG 0.54020849
## Glu 0.53760238
## SBP 0.09084140
## DBP 0.09084140
## PIR 0.07259866
## masts 0.05956813
## educ 0.05938198
## AST 0.04225614
## ALT 0.04225614
## BUN 0.04188384
## Crea 0.04188384
## eGFR 0.04188384
## Fa_inc 0.04002234
## Fa_inc_numeric 0.04002234
## HDLc 0.03890544
## TC 0.03890544
## sCOT 0.03871929
## sHCOT 0.03871929
## BMI 0.01042442
## SEQN 0.00000000
## age 0.00000000
## sex 0.00000000
## ethnic 0.00000000
## drinking_status 0.00000000
## smoking_status 0.00000000
## PA 0.00000000
## db 0.00000000
## dyslip 0.00000000
## hyper 0.00000000
## target 0.00000000
## smoking_status_numeric 0.00000000
## drinking_status_numeric 0.00000000
## PA_numeric 0.00000000
# summary of whether the data is missing (in black) or not.
vis_miss(df_rename, sort_miss = TRUE) # It also provides the percentage of missing values in each column.
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the visdat package.
## Please report the issue at <https://github.com/ropensci/visdat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
## method from
## plot.transform scales
## print.transform scales
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:base':
##
## transform
plot_na_pareto(df_rename) # It will show pareto chart with missing values
plot_na_hclust(df_rename) # Distribution of missing value by combination of variables
plot_na_intersect(df_rename) # Missing with intersection of variables
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following objects are masked from 'package:rms':
##
## Predict, vif
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Calculate VIF values
vif_values <- vif(lm(target ~ ., data = df_rename_new))
# Check VIF values
print(vif_values)
## SEQN age sex ethnic educ masts
## 1.098877 3.804959 4.512323 1.314169 1.634432 1.449934
## SBP DBP BMI PIR BUN Crea
## 1.964960 1.282325 1.373336 4.430750 1.767992 11.930585
## AST ALT apoB HDLc LDLc TG
## 3.010278 3.370629 10.593547 2869.306331 17188.895079 2238.961221
## TC Glu tolGlu sCOT sHCOT uCOT
## 22032.332445 2.339704 3.157822 6.914695 7.513166 7.444274
## uHCOT db dyslip hyper eGFR fa_income
## 8.383642 1.833021 1.680671 2.041241 9.764762 3.911760
## smsts drksts PA
## 1.926273 1.242355 1.189574
# Count variables with VIF > 5
num_high_vif <- sum(vif_values > 5)
# Print the number of variables with VIF > 5
print(num_high_vif)
## [1] 11
# Get the names of variables with VIF > 5
high_vif_vars <- names(vif_values[vif_values > 5])
# Print the names of variables with VIF > 5
print(high_vif_vars)
## [1] "Crea" "apoB" "HDLc" "LDLc" "TG" "TC" "sCOT" "sHCOT" "uCOT"
## [10] "uHCOT" "eGFR"
df_new <- subset(df_rename_new, select = -c(SEQN, uCOT, uHCOT, tolGlu, apoB, LDLc, Crea))
# Check missing again
plot_na_pareto(df_new)
## Count missing values in each column
colSums(is.na(df_new))
## age sex ethnic educ masts SBP DBP BMI
## 0 0 0 319 320 488 488 56
## PIR BUN AST ALT HDLc TG TC Glu
## 390 225 227 227 209 2902 209 2888
## sCOT sHCOT db dyslip hyper eGFR fa_income smsts
## 208 208 0 0 0 225 215 0
## drksts PA target
## 0 0 0
ggcorr(df_new, label = TRUE, label_size = 2.5, hjust = 1, layout.exp = 2)
df_new <- df_new %>%
mutate(sex = as.factor(sex),
ethnic = as.factor(ethnic),
educ = as.factor(educ),
masts = as.factor(masts),
db = as.factor(db),
dyslip = as.factor(dyslip),
hyper = as.factor(hyper),
fa_income = as.factor(fa_income),
smsts = as.factor(smsts),
drksts = as.factor(drksts),
PA = as.factor(PA),
target = as.factor(target))
table(df_new$sex)
##
## 1 2
## 2585 2787
table(df_new$ethnic)
##
## 1 2 3 4 6 7
## 753 481 2315 1087 562 174
table(df_new$educ)
##
## 1 2 3 4 5
## 357 683 1130 1591 1292
table(df_new$masts)
##
## 1 2 3 4 5 6
## 2637 369 582 151 955 358
table(df_new$db)
##
## 0 1
## 4542 830
table(df_new$dyslip)
##
## 0 1
## 1681 3691
table(df_new$hyper)
##
## 0 1
## 3458 1914
table(df_new$fa_income)
##
## 1 2 3 4
## 1210 2586 442 919
table(df_new$smsts)
##
## 1 2 3
## 3117 1199 1056
table(df_new$drksts)
##
## 1 2 3
## 914 687 3771
table(df_new$PA)
##
## 1 2 3
## 1320 3713 339
# Relevel 'sex' with the lowest category as reference
df_new$sex <- relevel(df_new$sex, ref = "1")
# Relevel 'ethnic' with the lowest category as reference
df_new$ethnic <- relevel(df_new$ethnic, ref = "1")
# Relevel 'educ' with the lowest category as reference
df_new$educ <- relevel(df_new$educ, ref = "1")
# Relevel 'masts' with the lowest category as reference
df_new$masts <- relevel(df_new$masts, ref = "1")
# Relevel 'db' with the lowest category as reference
df_new$db <- relevel(df_new$db, ref = "0")
# Relevel 'dyslip' with the lowest category as reference
df_new$dyslip <- relevel(df_new$dyslip, ref = "0")
# Relevel 'hyper' with the lowest category as reference
df_new$hyper <- relevel(df_new$hyper, ref = "0")
# Relevel 'fa_income' with the lowest category as reference
df_new$fa_income <- relevel(df_new$fa_income, ref = "1")
# Relevel 'smsts' with the lowest category as reference
df_new$smsts <- relevel(df_new$smsts, ref = "1")
# Relevel 'drksts' with the lowest category as reference
df_new$drksts <- relevel(df_new$drksts, ref = "1")
# Relevel 'PA' with the lowest category as reference
df_new$PA <- relevel(df_new$PA, ref = "1")
library(missRanger)
# Drop SEQN and dp10
imputation_data <- subset(df_new, select = -c(target))
# Perform imputation
impu <- missRanger(imputation_data,
formula = .~.,
num.trees = 1000,
seed = 3)
##
## Missing value imputation by random forests
##
## Variables to impute: educ, masts, SBP, DBP, BMI, PIR, BUN, AST, ALT, HDLc, TG, TC, Glu, sCOT, sHCOT, eGFR, fa_income
## Variables used to impute: age, sex, ethnic, educ, masts, SBP, DBP, BMI, PIR, BUN, AST, ALT, HDLc, TG, TC, Glu, sCOT, sHCOT, db, dyslip, hyper, eGFR, fa_income, smsts, drksts, PA
##
## iter 1
## | | | 0% | |==== | 6% | |======== | 12% | |============ | 18% | |================ | 24% | |===================== | 29% | |========================= | 35% | |============================= | 41% | |================================= | 47% | |===================================== | 53% | |========================================= | 59% | |============================================= | 65% | |================================================= | 71% | |====================================================== | 76% | |========================================================== | 82% | |============================================================== | 88% | |================================================================== | 94% | |======================================================================| 100%
## iter 2
## | | | 0% | |==== | 6% | |======== | 12% | |============ | 18% | |================ | 24% | |===================== | 29% | |========================= | 35% | |============================= | 41% | |================================= | 47% | |===================================== | 53% | |========================================= | 59% | |============================================= | 65% | |================================================= | 71% | |====================================================== | 76% | |========================================================== | 82% | |============================================================== | 88% | |================================================================== | 94% | |======================================================================| 100%
## iter 3
## | | | 0% | |==== | 6% | |======== | 12% | |============ | 18% | |================ | 24% | |===================== | 29% | |========================= | 35% | |============================= | 41% | |================================= | 47% | |===================================== | 53% | |========================================= | 59% | |============================================= | 65% | |================================================= | 71% | |====================================================== | 76% | |========================================================== | 82% | |============================================================== | 88% | |================================================================== | 94% | |======================================================================| 100%
## iter 4
## | | | 0% | |==== | 6% | |======== | 12% | |============ | 18% | |================ | 24% | |===================== | 29% | |========================= | 35% | |============================= | 41% | |================================= | 47% | |===================================== | 53% | |========================================= | 59% | |============================================= | 65% | |================================================= | 71% | |====================================================== | 76% | |========================================================== | 82% | |============================================================== | 88% | |================================================================== | 94% | |======================================================================| 100%
# Merge imputed data with original dataset (Nen lam vao luc nao ????????)
imputed_df <- cbind(df_new[c("target")], impu)
ggcorr(imputed_df, label = TRUE, label_size = 2.5, hjust = 1, layout.exp = 2)
## Warning in ggcorr(imputed_df, label = TRUE, label_size = 2.5, hjust = 1, : data
## in column(s) 'target', 'sex', 'ethnic', 'educ', 'masts', 'db', 'dyslip',
## 'hyper', 'fa_income', 'smsts', 'drksts', 'PA' are not numeric and were ignored
imputed_df2 = imputed_df
# Features selection
features <- imputed_df2[, c("age", "sex", "ethnic", "educ", "masts", "SBP", "DBP",
"BMI", "PIR", "BUN", "AST", "ALT", "HDLc",
"TG", "TC", "Glu", "sCOT", "sHCOT", "db", "dyslip",
"hyper", "eGFR", "fa_income", "smsts", "drksts", "PA")]
target <- imputed_df2$target
# Data normalization
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
preprocessParams <- preProcess(features, method = c("center", "scale"))
features_normalized <- predict(preprocessParams, features)
split <- createDataPartition(target, p = 0.8, list = FALSE)
X_train <- features_normalized[split, ]
X_test <- features_normalized[-split, ]
Y_train <- target[split]
Y_test <- target[-split]
## Print the shape of the training and test sets
print(paste("X_train shape:", paste(dim(X_train), collapse = "x")))
## [1] "X_train shape: 4298x26"
print(paste("X_test shape:", paste(dim(X_test), collapse = "x")))
## [1] "X_test shape: 1074x26"
## Combine features and target into a single data frame
train_data <- as.data.frame(cbind(target = Y_train, X_train))
# Training the logistic regression model
model <- glm(target ~ ., data = train_data, family = "binomial")
# Making predictions on the test set
predictions <- predict(model, newdata = as.data.frame(X_test), type = "response")
## Converting probabilities to binary predictions based on threshold 0.5
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)
## Combining actual values and predicted values into a data frame
result <- data.frame(actual = Y_test, predicted = binary_predictions)
## Evaluating the model
confusionMatrix(data = as.factor(binary_predictions), reference = as.factor(Y_test),
positive = "1") # positive = "1": very important
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 967 101
## 1 5 1
##
## Accuracy : 0.9013
## 95% CI : (0.8819, 0.9185)
## No Information Rate : 0.905
## P-Value [Acc > NIR] : 0.6841
##
## Kappa : 0.0081
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0098039
## Specificity : 0.9948560
## Pos Pred Value : 0.1666667
## Neg Pred Value : 0.9054307
## Prevalence : 0.0949721
## Detection Rate : 0.0009311
## Detection Prevalence : 0.0055866
## Balanced Accuracy : 0.5023299
##
## 'Positive' Class : 1
##
####Overall, the model appears to have a high specificity but very low sensitivity, leading to poor performance in correctly identifying positive cases. This indicates that the model may need further refinement or adjustment to improve its ability to correctly classify positive cases. ####To improve the model, several strategies can be considered: Address Class Imbalance, Model Selection, Hyperparameter Tuning, Cross-Validation
conf_matrix <- table(factor(binary_predictions, levels = c("0", "1")),
factor(Y_test, levels = c("0", "1")))
# Set the dimension names of the confusion matrix
dimnames(conf_matrix) <- list(Actual = c("0", "1"), Predicted = c("0", "1"))
# Plot the fourfold plot with color and main title
fourfoldplot(conf_matrix, color = c("lightblue", "yellow"), main = "Confusion Matrix")