📘 Load Libraries

library(tidyverse)
library(dplyr)
library(DBI) #contains functions to interect with the databasa
library(RSQLite) #SQLite database driver
library(jsonlite)
library(haven)
library(janitor)
library(readxl)
library(writexl)
library(openxlsx)
library(xlsx)
library(microbenchmark)
library(data.table)
library(readr)
library(remotes)
library(tidyr)
library(dbplyr)
library(knitr)       
library(kableExtra)    
library(psych)
library(readxl)
library(DescTools)
library(ggplot2)
library(e1071)
library(moments)
library(skimr)
library(tableone)
library(table1)
library(gtsummary)
library(gmodels)
library(naniar)
library(ggthemes)
library(ggThemeAssist)
library(forcats)
library(ggpubr)
library(patchwork)
library(reshape2)
library(car)
library(lares)
library(ggstatsplot)
library(GGally)
library(PerformanceAnalytics)
library(heatmap3)
library(sjPlot)
library(sjmisc)
library(pairsD3)
library(gridExtra) 
library(gridExtra) 
library(lmtest) 
library(titanic) 
library(tidyverse) 
library(caret) 
library(pROC)
library(gt)
library(corrplot)
library(Hmisc)
nhanes_analysis <- read.csv("nhanes_analysis_without_outliar_hscrpcattegorisation.csv")
nhanes_final_clean <- read.csv("nhanes_final_clean.csv")

Data overview & preparation

# Basic overview
str(nhanes_analysis)
## 'data.frame':    35724 obs. of  20 variables:
##  $ SEQN          : int  83732 83733 83734 83735 83736 83737 83738 83739 83740 83741 ...
##  $ hsCRP         : num  0.6 1.4 0.6 9 0.5 2.5 0.3 0.08 1.32 1.3 ...
##  $ hypertension  : int  1 1 1 0 0 0 0 0 0 0 ...
##  $ age           : int  62 53 78 56 42 72 11 4 1 22 ...
##  $ gender        : chr  "Male" "Male" "Male" "Female" ...
##  $ race          : int  3 3 3 3 4 1 1 3 2 4 ...
##  $ PIR           : num  4.39 1.32 1.51 5 1.23 2.82 1.18 4.22 2.06 2.08 ...
##  $ BMI           : num  27.8 30.8 28.8 42.4 20.3 28.6 18.1 15.7 25.8 28 ...
##  $ waist         : num  101.1 107.9 116.5 110.1 80.4 ...
##  $ diabetes      : int  1 0 1 0 0 0 0 0 0 0 ...
##  $ kidney        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ smoking       : int  1 1 1 0 0 0 0 0 0 1 ...
##  $ alcohol       : int  1 1 1 0 0 0 1 1 1 1 ...
##  $ physical      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ TC            : int  173 265 229 174 204 190 142 175 175 164 ...
##  $ HDL           : int  46 63 30 61 53 78 43 52 52 48 ...
##  $ HbA1c         : num  7 5.5 5.8 5.6 5.6 5.9 5.5 5.5 5.5 5.6 ...
##  $ hsCRP_log     : num  -0.357 0.405 -0.357 2.208 -0.511 ...
##  $ hsCRP_cap     : num  0.6 1.4 0.6 9 0.5 2.5 0.3 0.08 1.32 1.3 ...
##  $ hsCRP_quartile: int  1 3 1 4 1 3 1 1 2 2 ...
summary(nhanes_analysis)
##       SEQN            hsCRP          hypertension         age       
##  Min.   : 83732   Min.   :  0.080   Min.   :0.0000   Min.   : 1.00  
##  1st Qu.: 93149   1st Qu.:  0.700   1st Qu.:0.0000   1st Qu.:12.00  
##  Median :118128   Median :  1.320   Median :0.0000   Median :33.00  
##  Mean   :115763   Mean   :  2.854   Mean   :0.1809   Mean   :35.52  
##  3rd Qu.:133038   3rd Qu.:  2.500   3rd Qu.:0.0000   3rd Qu.:58.00  
##  Max.   :142310   Max.   :246.860   Max.   :1.0000   Max.   :80.00  
##     gender               race            PIR             BMI       
##  Length:35724       Min.   :1.000   Min.   :0.000   Min.   :11.10  
##  Class :character   1st Qu.:3.000   1st Qu.:1.200   1st Qu.:21.70  
##  Mode  :character   Median :3.000   Median :2.060   Median :25.80  
##                     Mean   :3.367   Mean   :2.412   Mean   :26.43  
##                     3rd Qu.:4.000   3rd Qu.:3.550   3rd Qu.:29.90  
##                     Max.   :7.000   Max.   :5.000   Max.   :92.30  
##      waist           diabetes           kidney            smoking      
##  Min.   : 39.80   Min.   :0.00000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.: 78.50   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median : 91.00   Median :0.00000   Median :0.000000   Median :0.0000  
##  Mean   : 89.91   Mean   :0.09467   Mean   :0.001232   Mean   :0.2605  
##  3rd Qu.:101.30   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:1.0000  
##  Max.   :187.00   Max.   :1.00000   Max.   :1.000000   Max.   :1.0000  
##     alcohol          physical            TC           HDL        
##  Min.   :0.0000   Min.   :0.0000   Min.   : 62   Min.   :  5.00  
##  1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:162   1st Qu.: 47.00  
##  Median :1.0000   Median :1.0000   Median :175   Median : 52.00  
##  Mean   :0.9132   Mean   :0.7773   Mean   :178   Mean   : 53.35  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:189   3rd Qu.: 57.00  
##  Max.   :1.0000   Max.   :1.0000   Max.   :545   Max.   :226.00  
##      HbA1c          hsCRP_log         hsCRP_cap      hsCRP_quartile
##  Min.   : 2.800   Min.   :-1.7148   Min.   : 0.080   Min.   :1.00  
##  1st Qu.: 5.400   1st Qu.:-0.2231   1st Qu.: 0.700   1st Qu.:1.75  
##  Median : 5.500   Median : 0.3507   Median : 1.320   Median :2.50  
##  Mean   : 5.648   Mean   : 0.4052   Mean   : 2.633   Mean   :2.50  
##  3rd Qu.: 5.600   3rd Qu.: 0.9555   3rd Qu.: 2.500   3rd Qu.:3.25  
##  Max.   :17.100   Max.   : 5.5092   Max.   :29.238   Max.   :4.00
# Missing value check
sapply(nhanes_analysis, function(x) sum(is.na(x)))
##           SEQN          hsCRP   hypertension            age         gender 
##              0              0              0              0              0 
##           race            PIR            BMI          waist       diabetes 
##              0              0              0              0              0 
##         kidney        smoking        alcohol       physical             TC 
##              0              0              0              0              0 
##            HDL          HbA1c      hsCRP_log      hsCRP_cap hsCRP_quartile 
##              0              0              0              0              0
# Outlier visualization for hsCRP
boxplot(nhanes_analysis$hsCRP, main = "Boxplot of hsCRP")

# Ensure variable types are correct
nhanes_analysis <- nhanes_analysis %>%
  mutate(
    hypertension = factor(hypertension, levels = c(0,1), labels = c("No","Yes")),
    gender = factor(gender, levels = c(1,2), labels = c("Male","Female")),
    race = as.factor(race)
  )

Graphical summaries

# Histogram for hsCRP (before log)
ggplot(nhanes_analysis, aes(x = hsCRP)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  labs(title = "Distribution of hsCRP", x = "hsCRP (mg/L)", y = "Frequency")

# Histogram for hsCRP_log (after log transform)
ggplot(nhanes_analysis, aes(x = hsCRP_log)) +
  geom_histogram(bins = 30, fill = "seagreen", color = "black") +
  labs(title = "Distribution of hsCRP (Log Transformed)", x = "log(hsCRP)", y = "Frequency")

# Boxplot of hsCRP_log by hypertension
ggplot(nhanes_analysis, aes(x = hypertension, y = hsCRP_log, fill = hypertension)) +
  geom_boxplot() +
  labs(title = "hsCRP (log) by Hypertension Status", x = "Hypertension", y = "log(hsCRP)") +
  theme_minimal()

Normality check

# Shapiro-Wilk test for hsCRP_log
#shapiro.test(nhanes_analysis$hsCRP_log)

# Q–Q plot
qqnorm(nhanes_analysis$hsCRP_log)
qqline(nhanes_analysis$hsCRP_log, col = "red")

# Q–Q plot
qqnorm(nhanes_analysis$hsCRP_log)
qqline(nhanes_analysis$hsCRP_log, col = "red")

# Histogram (visual check)
hist(nhanes_analysis$hsCRP_log, breaks = 40, col = "skyblue", main = "Distribution of hsCRP_log")

# Skewness and kurtosis test (from 'moments' package)
library(moments)
skewness(nhanes_analysis$hsCRP_log, na.rm = TRUE)
## [1] 0.4079302
kurtosis(nhanes_analysis$hsCRP_log, na.rm = TRUE)
## [1] 0.6616889
ks.test(nhanes_analysis$hsCRP_log, "pnorm", mean=mean(nhanes_analysis$hsCRP_log, na.rm=TRUE),
        sd=sd(nhanes_analysis$hsCRP_log, na.rm=TRUE))
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  nhanes_analysis$hsCRP_log
## D = 0.15486, p-value < 2.2e-16
## alternative hypothesis: two-sided

Correlation analysis (optional but useful)

cor_vars <- nhanes_analysis %>%
  select(age, BMI, waist, TC, HDL, HbA1c, hsCRP_log) %>%
  na.omit()

cor_matrix <- cor(cor_vars)
round(cor_matrix, 2)
##            age   BMI waist   TC   HDL HbA1c hsCRP_log
## age       1.00  0.42  0.54 0.23  0.06  0.28      0.30
## BMI       0.42  1.00  0.89 0.14 -0.20  0.23      0.52
## waist     0.54  0.89  1.00 0.16 -0.20  0.24      0.49
## TC        0.23  0.14  0.16 1.00  0.22  0.08      0.16
## HDL       0.06 -0.20 -0.20 0.22  1.00 -0.14     -0.19
## HbA1c     0.28  0.23  0.24 0.08 -0.14  1.00      0.22
## hsCRP_log 0.30  0.52  0.49 0.16 -0.19  0.22      1.00

Group comparisons (optional)

# Continuous variable comparison (t-test)
t.test(hsCRP_log ~ hypertension, data = nhanes_analysis)
## 
##  Welch Two Sample t-test
## 
## data:  hsCRP_log by hypertension
## t = -40.437, df = 9353.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -0.6143723 -0.5575617
## sample estimates:
##  mean in group No mean in group Yes 
##         0.2992454         0.8852124
# Categorical variable comparison (chi-square)
#chisq.test(table(nhanes_analysis$gender, nhanes_analysis$hypertension))
#chisq.test(table(nhanes_analysis$kidney_function, nhanes_analysis$hypertension))
table(nhanes_analysis$gender, nhanes_analysis$hypertension)
##         
##          No Yes
##   Male    0   0
##   Female  0   0
table(nhanes_analysis$hypertension, useNA = "ifany")
## 
##    No   Yes 
## 29263  6461
table(nhanes_analysis$gender, useNA = "ifany")
## 
##   Male Female   <NA> 
##      0      0  35724
str(nhanes_analysis$gender)
##  Factor w/ 2 levels "Male","Female": NA NA NA NA NA NA NA NA NA NA ...
nhanes_analysis$gender <- nhanes_final_clean$gender

table(nhanes_analysis$gender, useNA = "ifany")
## 
## Female   Male 
##  18395  17329
nhanes_analysis <- nhanes_analysis %>%
  mutate(
    gender = case_when(
      gender == "Male" ~ 1,
      gender == "Female" ~ 2,
      TRUE ~ NA_real_
    )
  )
table(nhanes_analysis$gender, useNA = "ifany")
## 
##     1     2 
## 17329 18395
table(nhanes_analysis$gender, nhanes_analysis$hypertension)
##    
##        No   Yes
##   1 14057  3272
##   2 15206  3189
chisq.test(table(nhanes_analysis$gender, nhanes_analysis$hypertension))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(nhanes_analysis$gender, nhanes_analysis$hypertension)
## X-squared = 14.281, df = 1, p-value = 0.0001575
# Gender vs Hypertension table with counts and percentages
library(dplyr)
library(tidyr)

nhanes_analysis %>%
  group_by(gender, hypertension) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(gender) %>%
  mutate(percent = round(100 * n / sum(n), 2)) %>%
  arrange(gender, hypertension)
## # A tibble: 4 × 4
## # Groups:   gender [2]
##   gender hypertension     n percent
##    <dbl> <fct>        <int>   <dbl>
## 1      1 No           14057    81.1
## 2      1 Yes           3272    18.9
## 3      2 No           15206    82.7
## 4      2 Yes           3189    17.3
length(nhanes_analysis$kidney_function)
## [1] 0
length(nhanes_analysis$hypertension)
## [1] 35724
nhanes_analysis$kidney <- nhanes_final_clean$kidney
# Kidney function vs hypertension
table(nhanes_analysis$kidney, nhanes_analysis$hypertension, useNA = "ifany")
##    
##        No   Yes
##   0 29258  6422
##   1     5    39
chisq.test(table(nhanes_analysis$kidney, nhanes_analysis$hypertension))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(nhanes_analysis$kidney, nhanes_analysis$hypertension)
## X-squared = 143.28, df = 1, p-value < 2.2e-16
# ✅ Categorical variables এর তালিকা
cat_vars <- c("gender", "race", "diabetes", "kidney", "smoking", "alcohol", "physical","hsCRP_quartile")

# ✅ Chi-square test results সংরক্ষণের জন্য list বানানো
chi_results <- list()

# ✅ Loop করে প্রতিটা categorical variable এর জন্য chi-square test করা
for (var in cat_vars) {
  tbl <- table(nhanes_analysis[[var]], nhanes_analysis$hypertension, useNA = "ifany")
  chi <- chisq.test(tbl)
  chi_results[[var]] <- list(
    variable = var,
    table = tbl,
    chi_sq = chi$statistic,
    df = chi$parameter,
    p_value = chi$p.value
  )
}

# ✅ Chi-square test এর summary দেখতে
chi_results_summary <- lapply(chi_results, function(x) {
  data.frame(
    Variable = x$variable,
    Chi_sq = x$chi_sq,
    df = x$df,
    p_value = x$p_value
  )
})

# ✅ Summary print করা
chi_results_summary <- do.call(rbind, chi_results_summary)
print(chi_results_summary)
##                      Variable      Chi_sq df      p_value
## gender                 gender    14.28064  1 1.574765e-04
## race                     race   140.14515  5 1.666542e-28
## diabetes             diabetes 16913.33057  1 0.000000e+00
## kidney                 kidney   143.27985  1 5.105558e-33
## smoking               smoking  1868.45276  1 0.000000e+00
## alcohol               alcohol   400.48528  1 4.318117e-89
## physical             physical   121.74766  1 2.621468e-28
## hsCRP_quartile hsCRP_quartile  1514.09552  3 0.000000e+00
cont_vars <- c("age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c", "hsCRP_cap", "hsCRP_log")