📘 Load Libraries

library(tidyverse)
library(dplyr)
library(DBI) #contains functions to interect with the databasa
library(RSQLite) #SQLite database driver
library(jsonlite)
library(haven)
library(janitor)
library(readxl)
library(writexl)
library(openxlsx)
library(xlsx)
library(microbenchmark)
library(data.table)
library(readr)
library(remotes)
library(tidyr)
library(dbplyr)
library(knitr)       
library(kableExtra)    
library(psych)
library(readxl)
library(DescTools)
library(ggplot2)
library(e1071)
library(moments)
library(skimr)
library(tableone)
library(table1)
library(gtsummary)
library(gmodels)
library(naniar)
library(ggthemes)
library(ggThemeAssist)
library(forcats)
library(ggpubr)
library(patchwork)
library(reshape2)
library(car)
library(lares)
library(ggstatsplot)
library(GGally)
library(PerformanceAnalytics)
library(heatmap3)
library(sjPlot)
library(sjmisc)
library(pairsD3)
library(gridExtra) 
library(gridExtra) 
library(lmtest) 
library(titanic) 
library(tidyverse) 
library(caret) 
library(pROC)
library(gt)
library(corrplot)
library(Hmisc)
nhanes_analysis <- read.csv("nhanes_analysis_without_outliar_hscrpcattegorisation.csv")
library(dplyr)
library(gtsummary)

#cont_vars <- c("age","BMI","waist","TC","HDL","HbA1c","hsCRP")

cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")


desc_cont <- nhanes_analysis %>%
  select(all_of(cont_vars), hypertension) %>%
  tbl_summary(
    by = hypertension,
    type = all_continuous() ~ "continuous",
    statistic = all_continuous() ~ "{mean} ({sd})"
  ) %>%
  add_p(test = all_continuous() ~ "t.test") %>%
  modify_header(label = "**Variable**") %>%
  bold_labels()

desc_cont
Variable 0
N = 29,263
1
1
N = 6,461
1
p-value2
hsCRP 2.46 (5.55) 4.65 (9.66) <0.001
hsCRP_cap 2.31 (3.77) 4.11 (5.52) <0.001
hsCRP_log 0.30 (1.03) 0.89 (1.06) <0.001
age 30 (23) 61 (15) <0.001
PIR 2.42 (1.54) 2.39 (1.47) 0.2
BMI 25 (7) 31 (8) <0.001
waist 87 (20) 104 (17) <0.001
TC 177 (32) 184 (42) <0.001
HDL 54 (12) 52 (15) <0.001
HbA1c 5.46 (0.39) 6.49 (1.59) <0.001
1 Mean (SD)
2 Welch Two Sample t-test
cat_vars <- c("gender","race","diabetes","kidney","smoking","alcohol","physical","hsCRP_quartile")

desc_cat <- nhanes_analysis %>%
  select(all_of(cat_vars), hypertension) %>%
  tbl_summary(
    by = hypertension,
    statistic = all_categorical() ~ "{n} ({p}%)"
  ) %>%
  add_p(test = all_categorical() ~ "chisq.test") %>%
  modify_header(label = "**Variable**") %>%
  bold_labels()

desc_cat
Variable 0
N = 29,263
1
1
N = 6,461
1
p-value2
gender

<0.001
    Female 15,206 (52%) 3,189 (49%)
    Male 14,057 (48%) 3,272 (51%)
race

<0.001
    1 4,000 (14%) 752 (12%)
    2 3,323 (11%) 677 (10%)
    3 11,411 (39%) 2,485 (38%)
    4 5,814 (20%) 1,674 (26%)
    6 2,698 (9.2%) 536 (8.3%)
    7 2,017 (6.9%) 337 (5.2%)
diabetes 0 (0%) 3,382 (52%) <0.001
kidney 5 (<0.1%) 39 (0.6%) <0.001
smoking 6,243 (21%) 3,064 (47%) <0.001
alcohol 27,134 (93%) 5,490 (85%) <0.001
physical 23,082 (79%) 4,688 (73%) <0.001
hsCRP_quartile

<0.001
    1 8,054 (28%) 877 (14%)
    2 7,581 (26%) 1,350 (21%)
    3 7,479 (26%) 1,452 (22%)
    4 6,149 (21%) 2,782 (43%)
1 n (%)
2 Pearson’s Chi-squared test
library(e1071)
skewness(nhanes_analysis$hsCRP_log, na.rm = TRUE)
## [1] 0.4079302
kurtosis(nhanes_analysis$hsCRP_log, na.rm = TRUE)
## [1] 0.6616889
hsCRP_skew <- skewness(nhanes_analysis$hsCRP_log, na.rm = TRUE)
hsCRP_kurt <- kurtosis(nhanes_analysis$hsCRP_log, na.rm = TRUE)
cat("Skewness (log hsCRP):", hsCRP_skew, "\n")
## Skewness (log hsCRP): 0.4079302
cat("Kurtosis (log hsCRP):", hsCRP_kurt, "\n")
## Kurtosis (log hsCRP): 0.6616889
# -----------------------------
# Step 3: Descriptive statistics for continuous variables
# -----------------------------
#cont_vars <- c("hsCRP", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")

cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")


cont_summary <- nhanes_analysis %>%
  select(all_of(cont_vars)) %>%
  summarise(across(everything(),
                   list(mean = ~mean(.x, na.rm=TRUE),
                        sd   = ~sd(.x, na.rm=TRUE),
                        median = ~median(.x, na.rm=TRUE),
                        Q1 = ~quantile(.x, 0.25, na.rm=TRUE),
                        Q3 = ~quantile(.x, 0.75, na.rm=TRUE)
                        )
                   ))

print(cont_summary)
##   hsCRP_mean hsCRP_sd hsCRP_median hsCRP_Q1 hsCRP_Q3 hsCRP_cap_mean
## 1   2.853738  6.54405         1.32      0.7      2.5       2.633293
##   hsCRP_cap_sd hsCRP_cap_median hsCRP_cap_Q1 hsCRP_cap_Q3 hsCRP_log_mean
## 1     4.197594             1.32          0.7          2.5      0.4052227
##   hsCRP_log_sd hsCRP_log_median hsCRP_log_Q1 hsCRP_log_Q3 age_mean   age_sd
## 1     1.060311        0.3506569   -0.2231436    0.9555114 35.52248 24.98483
##   age_median age_Q1 age_Q3 PIR_mean  PIR_sd PIR_median PIR_Q1 PIR_Q3 BMI_mean
## 1         33     12     58 2.412445 1.52466       2.06    1.2   3.55 26.43016
##     BMI_sd BMI_median BMI_Q1 BMI_Q3 waist_mean waist_sd waist_median waist_Q1
## 1 7.502948       25.8   21.7   29.9   89.90923 20.39434           91     78.5
##   waist_Q3  TC_mean    TC_sd TC_median TC_Q1 TC_Q3 HDL_mean   HDL_sd HDL_median
## 1    101.3 177.9767 34.23275       175   162   189 53.34957 12.70463         52
##   HDL_Q1 HDL_Q3 HbA1c_mean  HbA1c_sd HbA1c_median HbA1c_Q1 HbA1c_Q3
## 1     47     57   5.648368 0.8584626          5.5      5.4      5.6
# -----------------------------
# Step 4: Normality test
# -----------------------------
# Shapiro-Wilk (sample ≤ 5000) else Kolmogorov-Smirnov
if(nrow(nhanes_analysis) <= 5000){
  shapiro_result <- shapiro.test(nhanes_analysis$hsCRP_log)
  print(shapiro_result)
} else {
  ks_result <- ks.test(nhanes_analysis$hsCRP_log, "pnorm",
                       mean=mean(nhanes_analysis$hsCRP_log, na.rm=TRUE),
                       sd=sd(nhanes_analysis$hsCRP_log, na.rm=TRUE))
  print(ks_result)
}
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  nhanes_analysis$hsCRP_log
## D = 0.15486, p-value < 2.2e-16
## alternative hypothesis: two-sided
# -----------------------------
# Step 5: Optional histogram
# -----------------------------
ggplot(nhanes_analysis, aes(x=hsCRP_log)) +
  geom_histogram(bins=50, fill="skyblue", color="black") +
  labs(title="Histogram of log-transformed hsCRP",
       x="log(hsCRP)", y="Frequency")

# Continuous variable list
#cont_vars <- c("hsCRP", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")

cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")



for (var in cont_vars){
  cat("\nVariable:", var, "\n")
  t_res <- t.test(nhanes_analysis[[var]] ~ nhanes_analysis$hypertension)
  print(t_res)
}
## 
## Variable: hsCRP 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -17.587, df = 7429.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -2.432165 -1.944338
## sample estimates:
## mean in group 0 mean in group 1 
##        2.457974        4.646225 
## 
## 
## Variable: hsCRP_cap 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -25.04, df = 7835.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.948409 -1.665489
## sample estimates:
## mean in group 0 mean in group 1 
##         2.30649         4.11344 
## 
## 
## Variable: hsCRP_log 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -40.437, df = 9353.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.6143723 -0.5575617
## sample estimates:
## mean in group 0 mean in group 1 
##       0.2992454       0.8852124 
## 
## 
## Variable: age 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -136.56, df = 14602, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -31.52081 -30.62871
## sample estimates:
## mean in group 0 mean in group 1 
##        29.90233        60.97709 
## 
## 
## Variable: PIR 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = 1.4178, df = 9816.9, p-value = 0.1563
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.01106070  0.06888035
## sample estimates:
## mean in group 0 mean in group 1 
##        2.417673        2.388763 
## 
## 
## Variable: BMI 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -54.218, df = 9172.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -5.762018 -5.359910
## sample estimates:
## mean in group 0 mean in group 1 
##        25.42441        30.98537 
## 
## 
## Variable: waist 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -75.074, df = 10913, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -18.20249 -17.27614
## sample estimates:
## mean in group 0 mean in group 1 
##        86.70092       104.44024 
## 
## 
## Variable: TC 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -12.297, df = 8172.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -7.963790 -5.773949
## sample estimates:
## mean in group 0 mean in group 1 
##        176.7344        183.6033 
## 
## 
## Variable: HDL 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = 6.4386, df = 8264.3, p-value = 1.273e-10
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.9162147 1.7183069
## sample estimates:
## mean in group 0 mean in group 1 
##        53.58781        52.27055 
## 
## 
## Variable: HbA1c 
## 
##  Welch Two Sample t-test
## 
## data:  nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -51.479, df = 6634.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.0629361 -0.9849519
## sample estimates:
## mean in group 0 mean in group 1 
##        5.463179        6.487123
library(dplyr)
library(psych)

#cont_vars <- c("hsCRP","hsCRP_log","age","PIR","BMI","waist","TC","HDL","HbA1c")

cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")


# Group summary for continuous variables
cont_summary_list <- lapply(cont_vars, function(var){
  x0 <- nhanes_analysis[[var]][nhanes_analysis$hypertension==0]
  x1 <- nhanes_analysis[[var]][nhanes_analysis$hypertension==1]
  
  data.frame(
    Variable = var,
    Mean_0 = mean(x0, na.rm=TRUE),
    SD_0 = sd(x0, na.rm=TRUE),
    Mean_1 = mean(x1, na.rm=TRUE),
    SD_1 = sd(x1, na.rm=TRUE),
    Median_0 = median(x0, na.rm=TRUE),
    Q1_0 = quantile(x0, 0.25, na.rm=TRUE),
    Q3_0 = quantile(x0, 0.75, na.rm=TRUE),
    Median_1 = median(x1, na.rm=TRUE),
    Q1_1 = quantile(x1, 0.25, na.rm=TRUE),
    Q3_1 = quantile(x1, 0.75, na.rm=TRUE),
    t_test_p = t.test(x0, x1)$p.value
  )
})

cont_summary <- do.call(rbind, cont_summary_list)
cont_summary
##       Variable      Mean_0      SD_0      Mean_1      SD_1    Median_0
## 25%      hsCRP   2.4579736  5.552821   4.6462250  9.655209   1.3200000
## 25%1 hsCRP_cap   2.3064904  3.766051   4.1134396  5.524008   1.3200000
## 25%2 hsCRP_log   0.2992454  1.030861   0.8852124  1.059288   0.3506569
## 25%3       age  29.9023340 23.235217  60.9770933 14.675713  24.0000000
## 25%4       PIR   2.4176732  1.536115   2.3887633  1.471545   2.0600000
## 25%5       BMI  25.4244097  7.112767  30.9853738  7.536552  25.8000000
## 25%6     waist  86.7009227 19.754291 104.4402414 16.570545  91.0000000
## 25%7        TC 176.7344428 32.049247 183.6033122 42.296342 175.0000000
## 25%8       HDL  53.5878071 12.002130  52.2705464 15.447690  52.0000000
## 25%9     HbA1c   5.4631788  0.391398   6.4871227  1.588206   5.5000000
##             Q1_0        Q3_0    Median_1        Q1_1       Q3_1      t_test_p
## 25%    0.6100000   1.9700000   1.9300000   1.2000000   4.800000  7.311118e-68
## 25%1   0.6100000   1.9700000   1.9300000   1.2000000   4.800000 3.503138e-133
## 25%2  -0.3424903   0.7275486   0.7080358   0.2623643   1.589235  0.000000e+00
## 25%3  10.0000000  48.0000000  63.0000000  52.0000000  72.000000  0.000000e+00
## 25%4   1.1950000   3.6000000   2.0600000   1.2300000   3.370000  1.562883e-01
## 25%5  20.5000000  28.4000000  29.4000000  25.8000000  34.900000  0.000000e+00
## 25%6  74.3000000  96.5000000 102.1000000  91.0000000 114.800000  0.000000e+00
## 25%7 163.0000000 185.0000000 175.0000000 157.0000000 208.000000  1.882865e-34
## 25%8  48.0000000  56.0000000  52.0000000  42.0000000  58.000000  1.273363e-10
## 25%9   5.3000000   5.5000000   5.9000000   5.5000000   6.900000  0.000000e+00
cat_vars <- c("gender","race","diabetes","kidney","smoking","alcohol","physical","hsCRP_quartile")

cat_summary_list <- lapply(cat_vars, function(var){
  tab <- table(nhanes_analysis[[var]], nhanes_analysis$hypertension, useNA="ifany")
  p_val <- chisq.test(tab)$p.value
  data.frame(
    Variable = var,
    Table = paste(capture.output(print(tab)), collapse = "; "),
    Chi_sq_p = p_val
  )
})

cat_summary <- do.call(rbind, cat_summary_list)
cat_summary
##         Variable
## 1         gender
## 2           race
## 3       diabetes
## 4         kidney
## 5        smoking
## 6        alcohol
## 7       physical
## 8 hsCRP_quartile
##                                                                                                                        Table
## 1                                                         ;              0     1;   Female 15206  3189;   Male   14057  3272
## 2    ;         0     1;   1  4000   752;   2  3323   677;   3 11411  2485;   4  5814  1674;   6  2698   536;   7  2017   337
## 3                                                                        ;         0     1;   0 29263  3079;   1     0  3382
## 4                                                                        ;         0     1;   0 29258  6422;   1     5    39
## 5                                                                        ;         0     1;   0 23020  3397;   1  6243  3064
## 6                                                                        ;         0     1;   0  2129   971;   1 27134  5490
## 7                                                                        ;         0     1;   0  6181  1773;   1 23082  4688
## 8                                                ;        0    1;   1 8054  877;   2 7581 1350;   3 7479 1452;   4 6149 2782
##       Chi_sq_p
## 1 1.574765e-04
## 2 1.666542e-28
## 3 0.000000e+00
## 4 5.105558e-33
## 5 0.000000e+00
## 6 4.318117e-89
## 7 2.621468e-28
## 8 0.000000e+00
library(dplyr)

# Continuous + categorical summary একসাথে
full_summary <- bind_rows(
  cont_summary %>% mutate(Type = "Continuous"),
  cat_summary %>% mutate(Type = "Categorical")
)

# CSV আকারে save করতে
#write.csv(full_summary, "nhanes_descriptive_summary.csv", row.names = FALSE)

# preview
head(full_summary, 20)
##        Variable      Mean_0      SD_0      Mean_1      SD_1    Median_0
## 25%       hsCRP   2.4579736  5.552821   4.6462250  9.655209   1.3200000
## 25%1  hsCRP_log   0.2992454  1.030861   0.8852124  1.059288   0.3506569
## 25%2        age  29.9023340 23.235217  60.9770933 14.675713  24.0000000
## 25%3        PIR   2.4176732  1.536115   2.3887633  1.471545   2.0600000
## 25%4        BMI  25.4244097  7.112767  30.9853738  7.536552  25.8000000
## 25%5      waist  86.7009227 19.754291 104.4402414 16.570545  91.0000000
## 25%6         TC 176.7344428 32.049247 183.6033122 42.296342 175.0000000
## 25%7        HDL  53.5878071 12.002130  52.2705464 15.447690  52.0000000
## 25%8      HbA1c   5.4631788  0.391398   6.4871227  1.588206   5.5000000
## ...10    gender          NA        NA          NA        NA          NA
## ...11      race          NA        NA          NA        NA          NA
## ...12  diabetes          NA        NA          NA        NA          NA
## ...13    kidney          NA        NA          NA        NA          NA
## ...14   smoking          NA        NA          NA        NA          NA
## ...15   alcohol          NA        NA          NA        NA          NA
## ...16  physical          NA        NA          NA        NA          NA
##              Q1_0        Q3_0    Median_1        Q1_1       Q3_1     t_test_p
## 25%     0.6100000   1.9700000   1.9300000   1.2000000   4.800000 7.311118e-68
## 25%1   -0.3424903   0.7275486   0.7080358   0.2623643   1.589235 0.000000e+00
## 25%2   10.0000000  48.0000000  63.0000000  52.0000000  72.000000 0.000000e+00
## 25%3    1.1950000   3.6000000   2.0600000   1.2300000   3.370000 1.562883e-01
## 25%4   20.5000000  28.4000000  29.4000000  25.8000000  34.900000 0.000000e+00
## 25%5   74.3000000  96.5000000 102.1000000  91.0000000 114.800000 0.000000e+00
## 25%6  163.0000000 185.0000000 175.0000000 157.0000000 208.000000 1.882865e-34
## 25%7   48.0000000  56.0000000  52.0000000  42.0000000  58.000000 1.273363e-10
## 25%8    5.3000000   5.5000000   5.9000000   5.5000000   6.900000 0.000000e+00
## ...10          NA          NA          NA          NA         NA           NA
## ...11          NA          NA          NA          NA         NA           NA
## ...12          NA          NA          NA          NA         NA           NA
## ...13          NA          NA          NA          NA         NA           NA
## ...14          NA          NA          NA          NA         NA           NA
## ...15          NA          NA          NA          NA         NA           NA
## ...16          NA          NA          NA          NA         NA           NA
##              Type
## 25%    Continuous
## 25%1   Continuous
## 25%2   Continuous
## 25%3   Continuous
## 25%4   Continuous
## 25%5   Continuous
## 25%6   Continuous
## 25%7   Continuous
## 25%8   Continuous
## ...10 Categorical
## ...11 Categorical
## ...12 Categorical
## ...13 Categorical
## ...14 Categorical
## ...15 Categorical
## ...16 Categorical
##                                                                                                                            Table
## 25%                                                                                                                         <NA>
## 25%1                                                                                                                        <NA>
## 25%2                                                                                                                        <NA>
## 25%3                                                                                                                        <NA>
## 25%4                                                                                                                        <NA>
## 25%5                                                                                                                        <NA>
## 25%6                                                                                                                        <NA>
## 25%7                                                                                                                        <NA>
## 25%8                                                                                                                        <NA>
## ...10                                                         ;              0     1;   Female 15206  3189;   Male   14057  3272
## ...11    ;         0     1;   1  4000   752;   2  3323   677;   3 11411  2485;   4  5814  1674;   6  2698   536;   7  2017   337
## ...12                                                                        ;         0     1;   0 29263  3079;   1     0  3382
## ...13                                                                        ;         0     1;   0 29258  6422;   1     5    39
## ...14                                                                        ;         0     1;   0 23020  3397;   1  6243  3064
## ...15                                                                        ;         0     1;   0  2129   971;   1 27134  5490
## ...16                                                                        ;         0     1;   0  6181  1773;   1 23082  4688
##           Chi_sq_p
## 25%             NA
## 25%1            NA
## 25%2            NA
## 25%3            NA
## 25%4            NA
## 25%5            NA
## 25%6            NA
## 25%7            NA
## 25%8            NA
## ...10 1.574765e-04
## ...11 1.666542e-28
## ...12 0.000000e+00
## ...13 5.105558e-33
## ...14 0.000000e+00
## ...15 4.318117e-89
## ...16 2.621468e-28