1. Import library and dataset

library(haven)
#df <- read_sas("C:/Users/Momo/Desktop/R - Learning/Depression project/nhanes.sas7bdat")
df <- read_sas("/Users/thien/Desktop/R-dir/Depressive Disorder project/nhanes.sas7bdat")

2. Create variables

2.1 Create dependent variable

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.3.1     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Filter the dataframe
df1 <- df %>%
  filter(DPQ010 != 7 & DPQ010 != 9 & !is.na(DPQ010))

df2 <- df1 %>%
  filter(DPQ020 != 7 & DPQ020 != 9 & !is.na(DPQ020))

df3 <- df2 %>%
  filter(DPQ030 != 7 & DPQ030 != 9 & !is.na(DPQ030))

df4 <- df3 %>%
  filter(DPQ040 != 7 & DPQ040 != 9 & !is.na(DPQ040))

df5 <- df4 %>%
  filter(DPQ050 != 7 & DPQ050 != 9 & !is.na(DPQ050))

df6 <- df5 %>%
  filter(DPQ060 != 7 & DPQ060 != 9 & !is.na(DPQ060))

df7 <- df6 %>%
  filter(DPQ070 != 7 & DPQ070 != 9 & !is.na(DPQ070))

df8 <- df7 %>%
  filter(DPQ080 != 7 & DPQ080 != 9 & !is.na(DPQ080))

df9 <- df8 %>%
  filter(DPQ090 != 7 & DPQ090 != 9 & !is.na(DPQ090))

# Create depression variable: 
depress <- df9
depress$dp = depress$DPQ010 + depress$DPQ020 + depress$DPQ030 + depress$DPQ040 + depress$DPQ050 + 
  depress$DPQ060 + depress$DPQ070 + depress$DPQ080 + depress$DPQ090

# Create the new variable 'dp10'
depress <- depress %>%
  mutate(dp10 = ifelse(dp <= 9, 0, 1))


# Display frequency table for dp10
table(depress$dp10)

## 
##    0    1 
## 4861  511

2.2 Create independent variables

2.2.1 Smoking status

# Smoked at least 100 cigarettes in life
table(depress$SMQ020)

## 
##    1    2    9 
## 2253 3117    2

# Do you now smoke cigarettes
table(depress$SMQ040)

## 
##    1    2    3 
##  850  206 1197

# How long since quit smoking cigarettes
table(depress$SMQ050Q)

## 
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
##    46    61    61    51    54    47    37    30    19    52     6    30    19 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
##    28    59     9    13    22     9    76    13    18    19    14    51    13 
##    27    28    29    30    31    32    33    34    35    36    37    38    39 
##     6    16     3    79     6    10    15     7    25     4     8     4    11 
##    40    41    42    43    44    45    46    47    48    49 66666 99999 
##    38     6     8     4    13    17     4     3     3     1    40     9

# Create a new variable 'smoking_status'
depress$smoking_status <- NA

# Define smoking status categories based on SMQ020, SMQ040, and SMQ050Q
depress$smoking_status[depress$SMQ020 == 2] <- "Never"  # Never smokers
depress$smoking_status[depress$SMQ040 %in% c(1, 2)] <- "Current"  # Current smokers
#depress$smoking_status[depress$SMQ040 == 1 & depress$SMQ040 == 2] <- "Current"  # Current smokers
#depress$smoking_status[depress$SMQ020 == 1 & depress$SMQ040 == 3] <- "Former"  # Former smokers
depress$smoking_status[is.na(depress$smoking_status)] <- "Former"

# Replace missing values (NA) with "Former" for those who have quit but have missing values in SMQ040
#depress$smoking_status[is.na(depress$smoking_status) & !is.na(depress$SMQ050Q)] <- "Former"

# Replace 66666 and 99999 codes in SMQ050Q with "Former" category
#depress$smoking_status[depress$SMQ050Q %in% c(66666, 77777, 99999)] <- "Former"

# Convert 'smoking_status' to factor with specified levels
depress$smoking_status <- factor(depress$smoking_status, levels = c("Current", "Former", "Never"))

# Display the table of smoking status
table(depress$smoking_status)

## 
## Current  Former   Never 
##    1056    1199    3117

2.2.2 Drinking status

# Had at least 12 alcohol drinks/lifetime?
table(depress$ALQ110)

## 
##   1   2   9 
## 682 914   3

# Had at least 12 alcohol drinks/1 yr?
table(depress$ALQ101)

## 
##    1    2    9 
## 3771 1591    8

# How often drink alcohol over past 12 mos
table(depress$ALQ120Q)

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  867 1042  884  521  302  263  141  193   22    4   70    2   24    1    6   22 
##   16   17   18   20   24   25   28   30   35   40   45   52   61   80   90   96 
##    2    1    1   24    4    4    2   20    1    2    1    1    1    2    1    1 
##  100  112  120  160  180  182  200  300  305  350  365  999 
##    2    1    1    1    1    1    1    1    1    1    6    4

# Create a new variable 'drinking_status'
depress$drinking_status <- NA

# Define categories based on ALQ101, ALQ110, and ALQ120Q
depress$drinking_status[depress$ALQ101 == 2 & depress$ALQ110 == 2] <- "Never"  # Never drinkers
depress$drinking_status[depress$ALQ101 == 1] <- "Current"  # Current drinkers
# depress$drinking_status[depress$ALQ110 == 1] <- "Former"  # Former drinkers
depress$drinking_status[is.na(depress$drinking_status)] <- "Former"

# Replace missing values (NA) with "Former" for those who have quit but have missing values in ALQ110
# depress$drinking_status[is.na(depress$drinking_status) & !is.na(depress$ALQ120Q)] <- "Former"

# Replace 999 codes in ALQ120Q with "Former" category
# depress$drinking_status[depress$ALQ120Q == 999] <- "Former"

# Convert 'drinking_status' to factor with specified levels
depress$drinking_status <- factor(depress$drinking_status, levels = c("Current", "Former", "Never"))

# Display the table of drinking status
table(depress$drinking_status)

## 
## Current  Former   Never 
##    3771     687     914

2.2.3 Create physical activity variable

depress$PA <- NA

# Define categories based on PAQ605, PAQ620, PAQ635, PAQ650, and PAQ665
depress$PA[depress$PAQ605 == 1 | depress$PAQ650 == 1] <- "Vigorous"  # Vigorous activity
depress$PA[depress$PAQ620 == 1 | depress$PAQ635 == 1 | depress$PAQ665 == 1] <- "Moderate"  # Moderate activity

# For remaining participants, assign "Mild" category
depress$PA[is.na(depress$PA)] <- "Mild"

# Convert 'PA' to factor with specified levels
depress$PA <- factor(depress$PA, levels = c("Mild", "Moderate", "Vigorous"))

# Display frequency table for the PA variable
table(depress$PA)

## 
##     Mild Moderate Vigorous 
##     1320     3713      339

2.2.4 Family income variable

depress$Fa_inc <- NA
# Define categories 
depress$Fa_inc[depress$INDFMIN2 == 1 | depress$INDFMIN2 == 2 | depress$INDFMIN2 == 3 | depress$INDFMIN2 == 4 | depress$INDFMIN2 == 13] <- "Very-Low"  
depress$Fa_inc[depress$INDFMIN2 == 5 | depress$INDFMIN2 == 6 | depress$INDFMIN2 == 7 | depress$INDFMIN2 == 8 | depress$INDFMIN2 == 9 | depress$INDFMIN2 == 10 | depress$INDFMIN2 == 12] <- "Low-Medium"  
depress$Fa_inc[depress$INDFMIN2 == 14 ] <- "Medium-High"  
depress$Fa_inc[depress$INDFMIN2 == 15 ] <- "High"  
depress$Fa_inc[depress$INDFMIN2 == 77 | depress$INDFMIN2 == 99] <- NA 

depress$Fa_inc <- factor(depress$Fa_inc, levels = c("Very-Low", "Low-Medium", "Medium-High", "High"))

table(depress$Fa_inc)

## 
##    Very-Low  Low-Medium Medium-High        High 
##        1210        2586         442         919

2.2.5 Create SBP and DBP

depress <- depress %>%
  mutate(SBP = (BPXSY1 + BPXSY2) / 2,
         DBP = (BPXDI1 + BPXDI2) / 2)

2.2.6 Create the variable ‘hyper’

hypertension <- depress %>%
  mutate(hyper = case_when(
    SBP >= 140 | DBP >= 90 | BPQ040A == 1 | BPQ050A == 1 ~ 1,
    TRUE ~ 0))

2.2.7 Create the variable ‘dyslip’ using case_when

lipid <- hypertension %>%
  mutate(dyslip = case_when(
    LBXTC >= 200 | LBXTR >= 150 | LBDLDL >= 100 | LBDHDD <= 40 | BPQ090D == 1 ~ 1,
    TRUE ~ 0))

2.2.8 Create the ‘db’ variable using case_when

diab <- lipid %>%
  mutate(db = case_when(
    LBXGLU >= 126 | LBXGLT >= 200 | LBXGH >= 6.5 | DIQ070 == 1 ~ 1,
    TRUE ~ 0
  ))

2.2.9 Create eGFR

subset_df <- subset(diab, select = c(SEQN, RIDAGEYR, RIAGENDR, RIDRETH3, DMDEDUC2, 
                                     DMDMARTL, SBP, DBP, BMXBMI, Fa_inc, 
                                     INDFMPIR, LBXSBU, LBXSCR, LBXSASSI, 
                                     LBXSATSI, LBXAPB, LBDHDD, LBDLDL, LBXTR, 
                                     LBXTC, LBXGLU, LBXGLT, drinking_status, 
                                     smoking_status, PA, LBXCOT, LBXHCT, 
                                     URXCOTT, URXHCTT, db, dyslip, hyper, dp10))

# Rename all variables in one codes
df_rename <- subset_df %>% rename(age = RIDAGEYR, sex = RIAGENDR, ethnic = RIDRETH3, 
                                  educ = DMDEDUC2, masts = DMDMARTL, BMI = BMXBMI, 
                                  PIR = INDFMPIR, 
                                  BUN = LBXSBU, Crea = LBXSCR, AST = LBXSASSI, ALT = LBXSATSI,
                                  apoB = LBXAPB, HDLc = LBDHDD, LDLc = LBDLDL, 
                                  TG = LBXTR, TC = LBXTC, Glu = LBXGLU, tolGlu = LBXGLT, 
                                  sCOT = LBXCOT, sHCOT = LBXHCT, uCOT = URXCOTT, uHCOT = URXHCTT)


# Filter out values ‘7, 77’ and ‘9, 99’ from categorical variables column
# filtered_data <- df_rename %>%
#  filter(educ %in% c('1', '2', '3', '4', '5'),
#         masts %in% c('1', '2', '3', '4', '5', '6')) # I think we should dont filter out, just change them into NA

# Replace values '7' and '77' in educ column with NA
df_rename$educ[df_rename$educ %in% c('7', '9')] <- NA

# Replace values '9' and '99' in masts column with NA
df_rename$masts[df_rename$masts %in% c('77', '99')] <- NA

###########################################################################
# Define and Apply the MDRD Formula (modification of diet in renal disease)
calculate_GFR <- function(creatinine, age, gender) {
  gender_multiplier <- ifelse(gender == "female", 0.742, 1)
  eGFR <- 186 * (creatinine^(-1.154)) * (age^(-0.203)) * gender_multiplier
  return(eGFR)
}

# 'sex' column contains 1 for male and 2 for female
df_rename$eGFR <- mapply(calculate_GFR,
                         creatinine = df_rename$Crea,
                         age = df_rename$age,
                         gender = ifelse(df_rename$sex == 2, "female", "male"))

# Viewing the first few entries of GFR values
head(df_rename$eGFR)

## [1]  63.19639 108.63640  62.06049  83.05920  93.97984  65.96047

# Getting a summary of GFR values
summary(df_rename$eGFR)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.181  75.131  90.452  91.827 106.662 254.554     225

3. Try to understand the dataset: df_rename

str(df_rename)

## tibble [5,372 × 34] (S3: tbl_df/tbl/data.frame)
##  $ SEQN           : num [1:5372] 73557 73558 73559 73561 73562 ...
##   ..- attr(*, "label")= chr "Respondent sequence number"
##  $ age            : num [1:5372] 69 54 72 73 56 61 56 65 26 76 ...
##   ..- attr(*, "label")= chr "Age in years at screening"
##  $ sex            : num [1:5372] 1 1 1 2 1 2 2 1 2 1 ...
##   ..- attr(*, "label")= chr "Gender"
##  $ ethnic         : num [1:5372] 4 3 3 3 1 3 3 3 3 3 ...
##   ..- attr(*, "label")= chr "Race/Hispanic origin w/ NH Asian"
##  $ educ           : num [1:5372] 3 3 4 5 4 5 3 2 5 5 ...
##   ..- attr(*, "label")= chr "Education level - Adults 20+"
##  $ masts          : num [1:5372] 4 1 1 1 3 2 3 2 5 1 ...
##   ..- attr(*, "label")= chr "Marital status"
##  $ SBP            : num [1:5372] 118 158 140 135 159 121 126 141 103 NA ...
##   ..- attr(*, "label")= chr "Systolic: Blood pres (1st rdg) mm Hg"
##  $ DBP            : num [1:5372] 74 71 83 87 83 80 73 78 61 NA ...
##   ..- attr(*, "label")= chr "Diastolic: Blood pres (1st rdg) mm Hg"
##  $ BMI            : num [1:5372] 26.7 28.6 28.9 19.7 41.7 35.7 26.5 22 20.3 34.4 ...
##   ..- attr(*, "label")= chr "Body Mass Index (kg/m**2)"
##  $ Fa_inc         : Factor w/ 4 levels "Very-Low","Low-Medium",..: 1 2 2 4 2 2 1 1 4 3 ...
##  $ PIR            : num [1:5372] 0.84 1.78 4.51 5 4.79 5 0.48 1.2 5 5 ...
##   ..- attr(*, "label")= chr "Ratio of family income to poverty"
##  $ BUN            : num [1:5372] 10 16 14 31 18 17 9 15 12 17 ...
##   ..- attr(*, "label")= chr "Blood urea nitrogen (mg/dL)"
##  $ Crea           : num [1:5372] 1.21 0.79 1.22 0.73 0.89 0.92 0.55 0.97 0.74 1.19 ...
##   ..- attr(*, "label")= chr "Creatinine (mg/dL)"
##  $ AST            : num [1:5372] 16 18 22 36 24 20 23 29 20 28 ...
##   ..- attr(*, "label")= chr "Aspartate aminotransferase AST (U/L)"
##  $ ALT            : num [1:5372] 16 29 16 28 16 21 24 20 23 27 ...
##   ..- attr(*, "label")= chr "Alanine aminotransferase ALT (U/L)"
##  $ apoB           : num [1:5372] NA NA 57 92 NA 77 NA NA 59 NA ...
##   ..- attr(*, "label")= chr "Apolipoprotein (B) (mg/dL)"
##  $ HDLc           : num [1:5372] 65 50 60 85 38 58 59 79 96 50 ...
##   ..- attr(*, "label")= chr "Direct HDL-Cholesterol (mg/dL)"
##  $ LDLc           : num [1:5372] NA NA 56 101 NA 97 NA NA 67 NA ...
##   ..- attr(*, "label")= chr "LDL-cholesterol (mg/dL)"
##  $ TG             : num [1:5372] NA NA 51 75 NA 64 NA NA 24 NA ...
##   ..- attr(*, "label")= chr "Triglyceride (mg/dL)"
##  $ TC             : num [1:5372] 167 170 126 201 226 168 278 173 168 167 ...
##   ..- attr(*, "label")= chr "Total Cholesterol( mg/dL)"
##  $ Glu            : num [1:5372] NA NA 193 107 NA 110 NA NA 89 NA ...
##   ..- attr(*, "label")= chr "Fasting Glucose (mg/dL)"
##  $ tolGlu         : num [1:5372] NA NA NA NA NA 150 NA NA 80 NA ...
##   ..- attr(*, "label")= chr "Two Hour Glucose(OGTT) (mg/dL)"
##  $ drinking_status: Factor w/ 3 levels "Current","Former",..: 1 1 1 1 1 2 1 1 1 2 ...
##  $ smoking_status : Factor w/ 3 levels "Current","Former",..: 2 1 2 3 2 3 1 1 3 3 ...
##  $ PA             : Factor w/ 3 levels "Mild","Moderate",..: 1 2 2 2 3 1 2 3 2 2 ...
##  $ sCOT           : num [1:5372] 6.51 4.07 0.052 0.011 0.011 0.015 362 210 0.011 0.011 ...
##   ..- attr(*, "label")= chr "Cotinine, Serum (ng/mL)"
##  $ sHCOT          : num [1:5372] 1.33 4.48 0.055 0.011 0.011 0.011 154 234 0.011 0.011 ...
##   ..- attr(*, "label")= chr "Hematocrit (%)"
##  $ uCOT           : num [1:5372] NA NA NA NA NA 0.615 NA 3420 NA NA ...
##   ..- attr(*, "label")= chr "Total Cotinine, urine (ng/mL)"
##  $ uHCOT          : num [1:5372] NA NA NA NA NA 8.79e-01 NA 1.52e+04 NA NA ...
##   ..- attr(*, "label")= chr "Total Hydroxycotinine, urine (ng/mL)"
##  $ db             : num [1:5372] 1 1 1 0 0 0 0 0 0 1 ...
##  $ dyslip         : num [1:5372] 1 1 1 1 1 0 1 0 0 1 ...
##  $ hyper          : num [1:5372] 1 1 1 1 1 1 0 1 0 1 ...
##  $ dp10           : num [1:5372] 0 0 0 0 1 0 0 0 0 0 ...
##  $ eGFR           : num [1:5372] 63.2 108.6 62.1 83.1 94 ...

glimpse(df_rename)

## Rows: 5,372
## Columns: 34
## $ SEQN            <dbl> 73557, 73558, 73559, 73561, 73562, 73564, 73566, 73567…
## $ age             <dbl> 69, 54, 72, 73, 56, 61, 56, 65, 26, 76, 33, 32, 18, 38…
## $ sex             <dbl> 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, …
## $ ethnic          <dbl> 4, 3, 3, 3, 1, 3, 3, 3, 3, 3, 6, 1, 1, 4, 6, 6, 3, 1, …
## $ educ            <dbl> 3, 3, 4, 5, 4, 5, 3, 2, 5, 5, 5, 1, NA, 3, 5, 4, 3, 4,…
## $ masts           <dbl> 4, 1, 1, 1, 3, 2, 3, 2, 5, 1, 1, 6, NA, 1, 1, 1, 3, 5,…
## $ SBP             <dbl> 118, 158, 140, 135, 159, 121, 126, 141, 103, NA, 123, …
## $ DBP             <dbl> 74, 71, 83, 87, 83, 80, 73, 78, 61, NA, 61, 74, 61, 75…
## $ BMI             <dbl> 26.7, 28.6, 28.9, 19.7, 41.7, 35.7, 26.5, 22.0, 20.3, …
## $ Fa_inc          <fct> Very-Low, Low-Medium, Low-Medium, High, Low-Medium, Lo…
## $ PIR             <dbl> 0.84, 1.78, 4.51, 5.00, 4.79, 5.00, 0.48, 1.20, 5.00, …
## $ BUN             <dbl> 10, 16, 14, 31, 18, 17, 9, 15, 12, 17, 11, 17, NA, 10,…
## $ Crea            <dbl> 1.21, 0.79, 1.22, 0.73, 0.89, 0.92, 0.55, 0.97, 0.74, …
## $ AST             <dbl> 16, 18, 22, 36, 24, 20, 23, 29, 20, 28, 20, 30, NA, 17…
## $ ALT             <dbl> 16, 29, 16, 28, 16, 21, 24, 20, 23, 27, 20, 44, NA, 13…
## $ apoB            <dbl> NA, NA, 57, 92, NA, 77, NA, NA, 59, NA, 61, 111, NA, 1…
## $ HDLc            <dbl> 65, 50, 60, 85, 38, 58, 59, 79, 96, 50, 53, 33, NA, 55…
## $ LDLc            <dbl> NA, NA, 56, 101, NA, 97, NA, NA, 67, NA, 75, 119, NA, …
## $ TG              <dbl> NA, NA, 51, 75, NA, 64, NA, NA, 24, NA, 14, 148, NA, 5…
## $ TC              <dbl> 167, 170, 126, 201, 226, 168, 278, 173, 168, 167, 131,…
## $ Glu             <dbl> NA, NA, 193, 107, NA, 110, NA, NA, 89, NA, 84, 104, NA…
## $ tolGlu          <dbl> NA, NA, NA, NA, NA, 150, NA, NA, 80, NA, NA, 84, NA, 8…
## $ drinking_status <fct> Current, Current, Current, Current, Current, Former, C…
## $ smoking_status  <fct> Former, Current, Former, Never, Former, Never, Current…
## $ PA              <fct> Mild, Moderate, Moderate, Moderate, Vigorous, Mild, Mo…
## $ sCOT            <dbl> 6.510, 4.070, 0.052, 0.011, 0.011, 0.015, 362.000, 210…
## $ sHCOT           <dbl> 1.330, 4.480, 0.055, 0.011, 0.011, 0.011, 154.000, 234…
## $ uCOT            <dbl> NA, NA, NA, NA, NA, 0.615, NA, 3420.000, NA, NA, NA, N…
## $ uHCOT           <dbl> NA, NA, NA, NA, NA, 8.79e-01, NA, 1.52e+04, NA, NA, NA…
## $ db              <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dyslip          <dbl> 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, …
## $ hyper           <dbl> 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ dp10            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ eGFR            <dbl> 63.19639, 108.63640, 62.06049, 83.05920, 93.97984, 65.…

4. Convert factor levels into numeric values

table(df_rename$Fa_inc)

## 
##    Very-Low  Low-Medium Medium-High        High 
##        1210        2586         442         919

table(df_rename$smoking_status)

## 
## Current  Former   Never 
##    1056    1199    3117

table(df_rename$drinking_status)

## 
## Current  Former   Never 
##    3771     687     914

table(df_rename$PA)

## 
##     Mild Moderate Vigorous 
##     1320     3713      339

df_rename$Fa_inc_numeric <- as.numeric(factor(df_rename$Fa_inc, levels = c("Very-Low", "Low-Medium", "Medium-High", "High")))
df_rename$smoking_status_numeric <- as.numeric(factor(df_rename$smoking_status, levels = c("Never", "Former", "Current")))
df_rename$drinking_status_numeric <- as.numeric(factor(df_rename$drinking_status, levels = c("Never", "Former", "Current")))
df_rename$PA_numeric <- as.numeric(factor(df_rename$PA, levels = c("Mild", "Moderate", "Vigorous")))
# Check the result
table(df_rename$Fa_inc_numeric)

## 
##    1    2    3    4 
## 1210 2586  442  919

table(df_rename$smoking_status_numeric)

## 
##    1    2    3 
## 3117 1199 1056

table(df_rename$drinking_status_numeric)

## 
##    1    2    3 
##  914  687 3771

table(df_rename$PA_numeric)

## 
##    1    2    3 
## 1320 3713  339

5. Rename dp10 to target

df_rename <- df_rename %>% rename(target = dp10)
df_rename_new <- subset(df_rename, select = -c(Fa_inc, smoking_status, drinking_status, PA))

# Export the dataset as a CSV file
# write.csv(df_rename_new, "df_rename_new.csv", row.names = FALSE)

# Assuming "target" is the name of your target variable and "df_rename" is your dataframe
target <- df_rename_new$target  # Extract the target column
df_rename_new <- df_rename_new[, !names(df_rename_new) %in% "target"]  # Remove the target column
df_rename_new <- cbind(df_rename_new, target)  # Add the target column at the end

5. Correlation matrix

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

df_rename_new <- df_rename_new %>% rename(PA = PA_numeric, 
                                          drksts = drinking_status_numeric, 
                                          smsts = smoking_status_numeric, 
                                          fa_income = Fa_inc_numeric)

ggcorr(df_rename_new, label = TRUE, label_size = 2.5, hjust = 1, layout.exp = 2)

6. Check missing

library(rms)

## Loading required package: Hmisc

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

na.patterns = naclus(df_rename)
plot(na.patterns)

library(naniar)
gg_miss_var(df_rename)

## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## ℹ The deprecated feature was likely used in the naniar package.
##   Please report the issue at <https://github.com/njtierney/naniar/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

res<-summary(aggr(df_rename, sortVar=TRUE))$combinations

## 
##  Variables sorted by number of missings: 
##                 Variable      Count
##                     uCOT 0.67907669
##                    uHCOT 0.67907669
##                   tolGlu 0.64259121
##                     LDLc 0.54709605
##                     apoB 0.54039464
##                       TG 0.54020849
##                      Glu 0.53760238
##                      SBP 0.09084140
##                      DBP 0.09084140
##                      PIR 0.07259866
##                    masts 0.05956813
##                     educ 0.05938198
##                      AST 0.04225614
##                      ALT 0.04225614
##                      BUN 0.04188384
##                     Crea 0.04188384
##                     eGFR 0.04188384
##                   Fa_inc 0.04002234
##           Fa_inc_numeric 0.04002234
##                     HDLc 0.03890544
##                       TC 0.03890544
##                     sCOT 0.03871929
##                    sHCOT 0.03871929
##                      BMI 0.01042442
##                     SEQN 0.00000000
##                      age 0.00000000
##                      sex 0.00000000
##                   ethnic 0.00000000
##          drinking_status 0.00000000
##           smoking_status 0.00000000
##                       PA 0.00000000
##                       db 0.00000000
##                   dyslip 0.00000000
##                    hyper 0.00000000
##                   target 0.00000000
##   smoking_status_numeric 0.00000000
##  drinking_status_numeric 0.00000000
##               PA_numeric 0.00000000

# summary of whether the data is missing (in black) or not. 
vis_miss(df_rename, sort_miss = TRUE) # It also provides the percentage of missing values in each column.

## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the visdat package.
##   Please report the issue at <https://github.com/ropensci/visdat/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(dlookr)

## Registered S3 methods overwritten by 'dlookr':
##   method          from  
##   plot.transform  scales
##   print.transform scales

## 
## Attaching package: 'dlookr'

## The following object is masked from 'package:Hmisc':
## 
##     describe

## The following object is masked from 'package:tidyr':
## 
##     extract

## The following object is masked from 'package:base':
## 
##     transform

plot_na_pareto(df_rename) # It will show pareto chart with missing values

plot_na_hclust(df_rename) # Distribution of missing value by combination of variables

plot_na_intersect(df_rename) # Missing with intersection of variables

7. Install and load the ‘car’ package for the vif() function

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following objects are masked from 'package:rms':
## 
##     Predict, vif

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

# Calculate VIF values
vif_values <- vif(lm(target ~ ., data = df_rename_new))

# Check VIF values
print(vif_values)

##         SEQN          age          sex       ethnic         educ        masts 
##     1.098877     3.804959     4.512323     1.314169     1.634432     1.449934 
##          SBP          DBP          BMI          PIR          BUN         Crea 
##     1.964960     1.282325     1.373336     4.430750     1.767992    11.930585 
##          AST          ALT         apoB         HDLc         LDLc           TG 
##     3.010278     3.370629    10.593547  2869.306331 17188.895079  2238.961221 
##           TC          Glu       tolGlu         sCOT        sHCOT         uCOT 
## 22032.332445     2.339704     3.157822     6.914695     7.513166     7.444274 
##        uHCOT           db       dyslip        hyper         eGFR    fa_income 
##     8.383642     1.833021     1.680671     2.041241     9.764762     3.911760 
##        smsts       drksts           PA 
##     1.926273     1.242355     1.189574

# Count variables with VIF > 5
num_high_vif <- sum(vif_values > 5)

# Print the number of variables with VIF > 5
print(num_high_vif)

## [1] 11

# Get the names of variables with VIF > 5
high_vif_vars <- names(vif_values[vif_values > 5])

# Print the names of variables with VIF > 5
print(high_vif_vars)

##  [1] "Crea"  "apoB"  "HDLc"  "LDLc"  "TG"    "TC"    "sCOT"  "sHCOT" "uCOT" 
## [10] "uHCOT" "eGFR"

8. Combine the results between missing checking and VIF calculation, remove high VIF variables

df_new <- subset(df_rename_new, select = -c(SEQN, uCOT, uHCOT, tolGlu, apoB, LDLc, Crea))

# Check missing again
plot_na_pareto(df_new)

## Count missing values in each column
colSums(is.na(df_new))

##       age       sex    ethnic      educ     masts       SBP       DBP       BMI 
##         0         0         0       319       320       488       488        56 
##       PIR       BUN       AST       ALT      HDLc        TG        TC       Glu 
##       390       225       227       227       209      2902       209      2888 
##      sCOT     sHCOT        db    dyslip     hyper      eGFR fa_income     smsts 
##       208       208         0         0         0       225       215         0 
##    drksts        PA    target 
##         0         0         0

ggcorr(df_new, label = TRUE, label_size = 2.5, hjust = 1, layout.exp = 2)

9. Data encoding;

df_new <- df_new %>%
  mutate(sex = as.factor(sex),
         ethnic = as.factor(ethnic),
         educ = as.factor(educ),
         masts = as.factor(masts),
         db = as.factor(db),
         dyslip = as.factor(dyslip),
         hyper = as.factor(hyper),
         fa_income = as.factor(fa_income),
         smsts = as.factor(smsts),
         drksts = as.factor(drksts), 
         PA = as.factor(PA),
         target = as.factor(target))

table(df_new$sex)

## 
##    1    2 
## 2585 2787

table(df_new$ethnic)

## 
##    1    2    3    4    6    7 
##  753  481 2315 1087  562  174

table(df_new$educ)

## 
##    1    2    3    4    5 
##  357  683 1130 1591 1292

table(df_new$masts)

## 
##    1    2    3    4    5    6 
## 2637  369  582  151  955  358

table(df_new$db)

## 
##    0    1 
## 4542  830

table(df_new$dyslip)

## 
##    0    1 
## 1681 3691

table(df_new$hyper)

## 
##    0    1 
## 3458 1914

table(df_new$fa_income)

## 
##    1    2    3    4 
## 1210 2586  442  919

table(df_new$smsts)

## 
##    1    2    3 
## 3117 1199 1056

table(df_new$drksts)

## 
##    1    2    3 
##  914  687 3771

table(df_new$PA)

## 
##    1    2    3 
## 1320 3713  339

# Relevel 'sex' with the lowest category as reference
df_new$sex <- relevel(df_new$sex, ref = "1")

# Relevel 'ethnic' with the lowest category as reference
df_new$ethnic <- relevel(df_new$ethnic, ref = "1")

# Relevel 'educ' with the lowest category as reference
df_new$educ <- relevel(df_new$educ, ref = "1")

# Relevel 'masts' with the lowest category as reference
df_new$masts <- relevel(df_new$masts, ref = "1")

# Relevel 'db' with the lowest category as reference
df_new$db <- relevel(df_new$db, ref = "0")

# Relevel 'dyslip' with the lowest category as reference
df_new$dyslip <- relevel(df_new$dyslip, ref = "0")

# Relevel 'hyper' with the lowest category as reference
df_new$hyper <- relevel(df_new$hyper, ref = "0")

# Relevel 'fa_income' with the lowest category as reference
df_new$fa_income <- relevel(df_new$fa_income, ref = "1")

# Relevel 'smsts' with the lowest category as reference
df_new$smsts <- relevel(df_new$smsts, ref = "1")

# Relevel 'drksts' with the lowest category as reference
df_new$drksts <- relevel(df_new$drksts, ref = "1")

# Relevel 'PA' with the lowest category as reference
df_new$PA <- relevel(df_new$PA, ref = "1")

10. Deal with missing value by Imputation

library(missRanger)
# Drop SEQN and dp10

imputation_data <- subset(df_new, select = -c(target))

# Perform imputation
impu <- missRanger(imputation_data, 
                   formula = .~.,
                   num.trees = 1000, 
                   seed = 3)

## 
## Missing value imputation by random forests
## 
##   Variables to impute:       educ, masts, SBP, DBP, BMI, PIR, BUN, AST, ALT, HDLc, TG, TC, Glu, sCOT, sHCOT, eGFR, fa_income
##   Variables used to impute:  age, sex, ethnic, educ, masts, SBP, DBP, BMI, PIR, BUN, AST, ALT, HDLc, TG, TC, Glu, sCOT, sHCOT, db, dyslip, hyper, eGFR, fa_income, smsts, drksts, PA
## 
## iter 1
##   |                                                                              |                                                                      |   0%  |                                                                              |====                                                                  |   6%  |                                                                              |========                                                              |  12%  |                                                                              |============                                                          |  18%  |                                                                              |================                                                      |  24%  |                                                                              |=====================                                                 |  29%  |                                                                              |=========================                                             |  35%  |                                                                              |=============================                                         |  41%  |                                                                              |=================================                                     |  47%  |                                                                              |=====================================                                 |  53%  |                                                                              |=========================================                             |  59%  |                                                                              |=============================================                         |  65%  |                                                                              |=================================================                     |  71%  |                                                                              |======================================================                |  76%  |                                                                              |==========================================================            |  82%  |                                                                              |==============================================================        |  88%  |                                                                              |==================================================================    |  94%  |                                                                              |======================================================================| 100%
## iter 2
##   |                                                                              |                                                                      |   0%  |                                                                              |====                                                                  |   6%  |                                                                              |========                                                              |  12%  |                                                                              |============                                                          |  18%  |                                                                              |================                                                      |  24%  |                                                                              |=====================                                                 |  29%  |                                                                              |=========================                                             |  35%  |                                                                              |=============================                                         |  41%  |                                                                              |=================================                                     |  47%  |                                                                              |=====================================                                 |  53%  |                                                                              |=========================================                             |  59%  |                                                                              |=============================================                         |  65%  |                                                                              |=================================================                     |  71%  |                                                                              |======================================================                |  76%  |                                                                              |==========================================================            |  82%  |                                                                              |==============================================================        |  88%  |                                                                              |==================================================================    |  94%  |                                                                              |======================================================================| 100%
## iter 3
##   |                                                                              |                                                                      |   0%  |                                                                              |====                                                                  |   6%  |                                                                              |========                                                              |  12%  |                                                                              |============                                                          |  18%  |                                                                              |================                                                      |  24%  |                                                                              |=====================                                                 |  29%  |                                                                              |=========================                                             |  35%  |                                                                              |=============================                                         |  41%  |                                                                              |=================================                                     |  47%  |                                                                              |=====================================                                 |  53%  |                                                                              |=========================================                             |  59%  |                                                                              |=============================================                         |  65%  |                                                                              |=================================================                     |  71%  |                                                                              |======================================================                |  76%  |                                                                              |==========================================================            |  82%  |                                                                              |==============================================================        |  88%  |                                                                              |==================================================================    |  94%  |                                                                              |======================================================================| 100%
## iter 4
##   |                                                                              |                                                                      |   0%  |                                                                              |====                                                                  |   6%  |                                                                              |========                                                              |  12%  |                                                                              |============                                                          |  18%  |                                                                              |================                                                      |  24%  |                                                                              |=====================                                                 |  29%  |                                                                              |=========================                                             |  35%  |                                                                              |=============================                                         |  41%  |                                                                              |=================================                                     |  47%  |                                                                              |=====================================                                 |  53%  |                                                                              |=========================================                             |  59%  |                                                                              |=============================================                         |  65%  |                                                                              |=================================================                     |  71%  |                                                                              |======================================================                |  76%  |                                                                              |==========================================================            |  82%  |                                                                              |==============================================================        |  88%  |                                                                              |==================================================================    |  94%  |                                                                              |======================================================================| 100%

# Merge imputed data with original dataset (Nen lam vao luc nao ????????)
imputed_df <- cbind(df_new[c("target")], impu)
ggcorr(imputed_df, label = TRUE, label_size = 2.5, hjust = 1, layout.exp = 2)

## Warning in ggcorr(imputed_df, label = TRUE, label_size = 2.5, hjust = 1, : data
## in column(s) 'target', 'sex', 'ethnic', 'educ', 'masts', 'db', 'dyslip',
## 'hyper', 'fa_income', 'smsts', 'drksts', 'PA' are not numeric and were ignored

11. Normalization and Data Splitting

imputed_df2 = imputed_df

# Features selection
features <- imputed_df2[, c("age", "sex", "ethnic", "educ", "masts", "SBP", "DBP",
                            "BMI", "PIR", "BUN", "AST", "ALT", "HDLc", 
                            "TG", "TC", "Glu", "sCOT", "sHCOT", "db", "dyslip",
                            "hyper", "eGFR", "fa_income", "smsts", "drksts", "PA")]

target <- imputed_df2$target

# Data normalization
library(caret)

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

## The following object is masked from 'package:purrr':
## 
##     lift

preprocessParams <- preProcess(features, method = c("center", "scale"))
features_normalized <- predict(preprocessParams, features)

12. Splitting the data

split <- createDataPartition(target, p = 0.8, list = FALSE)
X_train <- features_normalized[split, ]
X_test <- features_normalized[-split, ]
Y_train <- target[split]
Y_test <- target[-split]

## Print the shape of the training and test sets
print(paste("X_train shape:", paste(dim(X_train), collapse = "x")))

## [1] "X_train shape: 4298x26"

print(paste("X_test shape:", paste(dim(X_test), collapse = "x")))

## [1] "X_test shape: 1074x26"

13. Model training

## Combine features and target into a single data frame
train_data <- as.data.frame(cbind(target = Y_train, X_train))

# Training the logistic regression model
model <- glm(target ~ ., data = train_data, family = "binomial")

14. Model testing

# Making predictions on the test set
predictions <- predict(model, newdata = as.data.frame(X_test), type = "response")
## Converting probabilities to binary predictions based on threshold 0.5
binary_predictions <- ifelse(predictions >= 0.5, 1, 0)
## Combining actual values and predicted values into a data frame
result <- data.frame(actual = Y_test, predicted = binary_predictions)
## Evaluating the model
confusionMatrix(data = as.factor(binary_predictions), reference = as.factor(Y_test), 
                positive = "1") # positive = "1": very important

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 967 101
##          1   5   1
##                                           
##                Accuracy : 0.9013          
##                  95% CI : (0.8819, 0.9185)
##     No Information Rate : 0.905           
##     P-Value [Acc > NIR] : 0.6841          
##                                           
##                   Kappa : 0.0081          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0098039       
##             Specificity : 0.9948560       
##          Pos Pred Value : 0.1666667       
##          Neg Pred Value : 0.9054307       
##              Prevalence : 0.0949721       
##          Detection Rate : 0.0009311       
##    Detection Prevalence : 0.0055866       
##       Balanced Accuracy : 0.5023299       
##                                           
##        'Positive' Class : 1               
##

####Overall, the model appears to have a high specificity but very low sensitivity, leading to poor performance in correctly identifying positive cases. This indicates that the model may need further refinement or adjustment to improve its ability to correctly classify positive cases. ####To improve the model, several strategies can be considered: Address Class Imbalance, Model Selection, Hyperparameter Tuning, Cross-Validation

15. Create a confusion matrix

conf_matrix <- table(factor(binary_predictions, levels = c("0", "1")), 
                     factor(Y_test, levels = c("0", "1")))
# Set the dimension names of the confusion matrix
dimnames(conf_matrix) <- list(Actual = c("0", "1"), Predicted = c("0", "1"))
# Plot the fourfold plot with color and main title
fourfoldplot(conf_matrix, color = c("lightblue", "yellow"), main = "Confusion Matrix")

Logistic Regression - ML for DD project

Vu Thien

2024-05-16