library(tidyverse)
library(dplyr)
library(DBI) #contains functions to interect with the databasa
library(RSQLite) #SQLite database driver
library(jsonlite)
library(haven)
library(janitor)
library(readxl)
library(writexl)
library(openxlsx)
library(xlsx)
library(microbenchmark)
library(data.table)
library(readr)
library(remotes)
library(tidyr)
library(dbplyr)
library(knitr)
library(kableExtra)
library(psych)
library(readxl)
library(DescTools)
library(ggplot2)
library(e1071)
library(moments)
library(skimr)
library(tableone)
library(table1)
library(gtsummary)
library(gmodels)
library(naniar)
library(ggthemes)
library(ggThemeAssist)
library(forcats)
library(ggpubr)
library(patchwork)
library(reshape2)
library(car)
library(lares)
library(ggstatsplot)
library(GGally)
library(PerformanceAnalytics)
library(heatmap3)
library(sjPlot)
library(sjmisc)
library(pairsD3)
library(gridExtra)
library(gridExtra)
library(lmtest)
library(titanic)
library(tidyverse)
library(caret)
library(pROC)
library(gt)
library(corrplot)
library(Hmisc)library(dplyr)
library(gtsummary)
#cont_vars <- c("age","BMI","waist","TC","HDL","HbA1c","hsCRP")
cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")
desc_cont <- nhanes_analysis %>%
select(all_of(cont_vars), hypertension) %>%
tbl_summary(
by = hypertension,
type = all_continuous() ~ "continuous",
statistic = all_continuous() ~ "{mean} ({sd})"
) %>%
add_p(test = all_continuous() ~ "t.test") %>%
modify_header(label = "**Variable**") %>%
bold_labels()
desc_cont| Variable | 0 N = 29,2631 |
1 N = 6,4611 |
p-value2 |
|---|---|---|---|
| hsCRP | 2.46 (5.55) | 4.65 (9.66) | <0.001 |
| hsCRP_cap | 2.31 (3.77) | 4.11 (5.52) | <0.001 |
| hsCRP_log | 0.30 (1.03) | 0.89 (1.06) | <0.001 |
| age | 30 (23) | 61 (15) | <0.001 |
| PIR | 2.42 (1.54) | 2.39 (1.47) | 0.2 |
| BMI | 25 (7) | 31 (8) | <0.001 |
| waist | 87 (20) | 104 (17) | <0.001 |
| TC | 177 (32) | 184 (42) | <0.001 |
| HDL | 54 (12) | 52 (15) | <0.001 |
| HbA1c | 5.46 (0.39) | 6.49 (1.59) | <0.001 |
| 1 Mean (SD) | |||
| 2 Welch Two Sample t-test | |||
cat_vars <- c("gender","race","diabetes","kidney","smoking","alcohol","physical","hsCRP_quartile")
desc_cat <- nhanes_analysis %>%
select(all_of(cat_vars), hypertension) %>%
tbl_summary(
by = hypertension,
statistic = all_categorical() ~ "{n} ({p}%)"
) %>%
add_p(test = all_categorical() ~ "chisq.test") %>%
modify_header(label = "**Variable**") %>%
bold_labels()
desc_cat| Variable | 0 N = 29,2631 |
1 N = 6,4611 |
p-value2 |
|---|---|---|---|
| gender | <0.001 | ||
| Female | 15,206 (52%) | 3,189 (49%) | |
| Male | 14,057 (48%) | 3,272 (51%) | |
| race | <0.001 | ||
| 1 | 4,000 (14%) | 752 (12%) | |
| 2 | 3,323 (11%) | 677 (10%) | |
| 3 | 11,411 (39%) | 2,485 (38%) | |
| 4 | 5,814 (20%) | 1,674 (26%) | |
| 6 | 2,698 (9.2%) | 536 (8.3%) | |
| 7 | 2,017 (6.9%) | 337 (5.2%) | |
| diabetes | 0 (0%) | 3,382 (52%) | <0.001 |
| kidney | 5 (<0.1%) | 39 (0.6%) | <0.001 |
| smoking | 6,243 (21%) | 3,064 (47%) | <0.001 |
| alcohol | 27,134 (93%) | 5,490 (85%) | <0.001 |
| physical | 23,082 (79%) | 4,688 (73%) | <0.001 |
| hsCRP_quartile | <0.001 | ||
| 1 | 8,054 (28%) | 877 (14%) | |
| 2 | 7,581 (26%) | 1,350 (21%) | |
| 3 | 7,479 (26%) | 1,452 (22%) | |
| 4 | 6,149 (21%) | 2,782 (43%) | |
| 1 n (%) | |||
| 2 Pearson’s Chi-squared test | |||
## [1] 0.4079302
## [1] 0.6616889
hsCRP_skew <- skewness(nhanes_analysis$hsCRP_log, na.rm = TRUE)
hsCRP_kurt <- kurtosis(nhanes_analysis$hsCRP_log, na.rm = TRUE)
cat("Skewness (log hsCRP):", hsCRP_skew, "\n")## Skewness (log hsCRP): 0.4079302
## Kurtosis (log hsCRP): 0.6616889
# -----------------------------
# Step 3: Descriptive statistics for continuous variables
# -----------------------------
#cont_vars <- c("hsCRP", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")
cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")
cont_summary <- nhanes_analysis %>%
select(all_of(cont_vars)) %>%
summarise(across(everything(),
list(mean = ~mean(.x, na.rm=TRUE),
sd = ~sd(.x, na.rm=TRUE),
median = ~median(.x, na.rm=TRUE),
Q1 = ~quantile(.x, 0.25, na.rm=TRUE),
Q3 = ~quantile(.x, 0.75, na.rm=TRUE)
)
))
print(cont_summary)## hsCRP_mean hsCRP_sd hsCRP_median hsCRP_Q1 hsCRP_Q3 hsCRP_cap_mean
## 1 2.853738 6.54405 1.32 0.7 2.5 2.633293
## hsCRP_cap_sd hsCRP_cap_median hsCRP_cap_Q1 hsCRP_cap_Q3 hsCRP_log_mean
## 1 4.197594 1.32 0.7 2.5 0.4052227
## hsCRP_log_sd hsCRP_log_median hsCRP_log_Q1 hsCRP_log_Q3 age_mean age_sd
## 1 1.060311 0.3506569 -0.2231436 0.9555114 35.52248 24.98483
## age_median age_Q1 age_Q3 PIR_mean PIR_sd PIR_median PIR_Q1 PIR_Q3 BMI_mean
## 1 33 12 58 2.412445 1.52466 2.06 1.2 3.55 26.43016
## BMI_sd BMI_median BMI_Q1 BMI_Q3 waist_mean waist_sd waist_median waist_Q1
## 1 7.502948 25.8 21.7 29.9 89.90923 20.39434 91 78.5
## waist_Q3 TC_mean TC_sd TC_median TC_Q1 TC_Q3 HDL_mean HDL_sd HDL_median
## 1 101.3 177.9767 34.23275 175 162 189 53.34957 12.70463 52
## HDL_Q1 HDL_Q3 HbA1c_mean HbA1c_sd HbA1c_median HbA1c_Q1 HbA1c_Q3
## 1 47 57 5.648368 0.8584626 5.5 5.4 5.6
# -----------------------------
# Step 4: Normality test
# -----------------------------
# Shapiro-Wilk (sample ≤ 5000) else Kolmogorov-Smirnov
if(nrow(nhanes_analysis) <= 5000){
shapiro_result <- shapiro.test(nhanes_analysis$hsCRP_log)
print(shapiro_result)
} else {
ks_result <- ks.test(nhanes_analysis$hsCRP_log, "pnorm",
mean=mean(nhanes_analysis$hsCRP_log, na.rm=TRUE),
sd=sd(nhanes_analysis$hsCRP_log, na.rm=TRUE))
print(ks_result)
}##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: nhanes_analysis$hsCRP_log
## D = 0.15486, p-value < 2.2e-16
## alternative hypothesis: two-sided
# -----------------------------
# Step 5: Optional histogram
# -----------------------------
ggplot(nhanes_analysis, aes(x=hsCRP_log)) +
geom_histogram(bins=50, fill="skyblue", color="black") +
labs(title="Histogram of log-transformed hsCRP",
x="log(hsCRP)", y="Frequency")# Continuous variable list
#cont_vars <- c("hsCRP", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")
cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")
for (var in cont_vars){
cat("\nVariable:", var, "\n")
t_res <- t.test(nhanes_analysis[[var]] ~ nhanes_analysis$hypertension)
print(t_res)
}##
## Variable: hsCRP
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -17.587, df = 7429.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -2.432165 -1.944338
## sample estimates:
## mean in group 0 mean in group 1
## 2.457974 4.646225
##
##
## Variable: hsCRP_cap
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -25.04, df = 7835.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.948409 -1.665489
## sample estimates:
## mean in group 0 mean in group 1
## 2.30649 4.11344
##
##
## Variable: hsCRP_log
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -40.437, df = 9353.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.6143723 -0.5575617
## sample estimates:
## mean in group 0 mean in group 1
## 0.2992454 0.8852124
##
##
## Variable: age
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -136.56, df = 14602, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -31.52081 -30.62871
## sample estimates:
## mean in group 0 mean in group 1
## 29.90233 60.97709
##
##
## Variable: PIR
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = 1.4178, df = 9816.9, p-value = 0.1563
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.01106070 0.06888035
## sample estimates:
## mean in group 0 mean in group 1
## 2.417673 2.388763
##
##
## Variable: BMI
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -54.218, df = 9172.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -5.762018 -5.359910
## sample estimates:
## mean in group 0 mean in group 1
## 25.42441 30.98537
##
##
## Variable: waist
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -75.074, df = 10913, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -18.20249 -17.27614
## sample estimates:
## mean in group 0 mean in group 1
## 86.70092 104.44024
##
##
## Variable: TC
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -12.297, df = 8172.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -7.963790 -5.773949
## sample estimates:
## mean in group 0 mean in group 1
## 176.7344 183.6033
##
##
## Variable: HDL
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = 6.4386, df = 8264.3, p-value = 1.273e-10
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.9162147 1.7183069
## sample estimates:
## mean in group 0 mean in group 1
## 53.58781 52.27055
##
##
## Variable: HbA1c
##
## Welch Two Sample t-test
##
## data: nhanes_analysis[[var]] by nhanes_analysis$hypertension
## t = -51.479, df = 6634.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.0629361 -0.9849519
## sample estimates:
## mean in group 0 mean in group 1
## 5.463179 6.487123
library(dplyr)
library(psych)
#cont_vars <- c("hsCRP","hsCRP_log","age","PIR","BMI","waist","TC","HDL","HbA1c")
cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")
# Group summary for continuous variables
cont_summary_list <- lapply(cont_vars, function(var){
x0 <- nhanes_analysis[[var]][nhanes_analysis$hypertension==0]
x1 <- nhanes_analysis[[var]][nhanes_analysis$hypertension==1]
data.frame(
Variable = var,
Mean_0 = mean(x0, na.rm=TRUE),
SD_0 = sd(x0, na.rm=TRUE),
Mean_1 = mean(x1, na.rm=TRUE),
SD_1 = sd(x1, na.rm=TRUE),
Median_0 = median(x0, na.rm=TRUE),
Q1_0 = quantile(x0, 0.25, na.rm=TRUE),
Q3_0 = quantile(x0, 0.75, na.rm=TRUE),
Median_1 = median(x1, na.rm=TRUE),
Q1_1 = quantile(x1, 0.25, na.rm=TRUE),
Q3_1 = quantile(x1, 0.75, na.rm=TRUE),
t_test_p = t.test(x0, x1)$p.value
)
})
cont_summary <- do.call(rbind, cont_summary_list)
cont_summary## Variable Mean_0 SD_0 Mean_1 SD_1 Median_0
## 25% hsCRP 2.4579736 5.552821 4.6462250 9.655209 1.3200000
## 25%1 hsCRP_cap 2.3064904 3.766051 4.1134396 5.524008 1.3200000
## 25%2 hsCRP_log 0.2992454 1.030861 0.8852124 1.059288 0.3506569
## 25%3 age 29.9023340 23.235217 60.9770933 14.675713 24.0000000
## 25%4 PIR 2.4176732 1.536115 2.3887633 1.471545 2.0600000
## 25%5 BMI 25.4244097 7.112767 30.9853738 7.536552 25.8000000
## 25%6 waist 86.7009227 19.754291 104.4402414 16.570545 91.0000000
## 25%7 TC 176.7344428 32.049247 183.6033122 42.296342 175.0000000
## 25%8 HDL 53.5878071 12.002130 52.2705464 15.447690 52.0000000
## 25%9 HbA1c 5.4631788 0.391398 6.4871227 1.588206 5.5000000
## Q1_0 Q3_0 Median_1 Q1_1 Q3_1 t_test_p
## 25% 0.6100000 1.9700000 1.9300000 1.2000000 4.800000 7.311118e-68
## 25%1 0.6100000 1.9700000 1.9300000 1.2000000 4.800000 3.503138e-133
## 25%2 -0.3424903 0.7275486 0.7080358 0.2623643 1.589235 0.000000e+00
## 25%3 10.0000000 48.0000000 63.0000000 52.0000000 72.000000 0.000000e+00
## 25%4 1.1950000 3.6000000 2.0600000 1.2300000 3.370000 1.562883e-01
## 25%5 20.5000000 28.4000000 29.4000000 25.8000000 34.900000 0.000000e+00
## 25%6 74.3000000 96.5000000 102.1000000 91.0000000 114.800000 0.000000e+00
## 25%7 163.0000000 185.0000000 175.0000000 157.0000000 208.000000 1.882865e-34
## 25%8 48.0000000 56.0000000 52.0000000 42.0000000 58.000000 1.273363e-10
## 25%9 5.3000000 5.5000000 5.9000000 5.5000000 6.900000 0.000000e+00
cat_vars <- c("gender","race","diabetes","kidney","smoking","alcohol","physical","hsCRP_quartile")
cat_summary_list <- lapply(cat_vars, function(var){
tab <- table(nhanes_analysis[[var]], nhanes_analysis$hypertension, useNA="ifany")
p_val <- chisq.test(tab)$p.value
data.frame(
Variable = var,
Table = paste(capture.output(print(tab)), collapse = "; "),
Chi_sq_p = p_val
)
})
cat_summary <- do.call(rbind, cat_summary_list)
cat_summary## Variable
## 1 gender
## 2 race
## 3 diabetes
## 4 kidney
## 5 smoking
## 6 alcohol
## 7 physical
## 8 hsCRP_quartile
## Table
## 1 ; 0 1; Female 15206 3189; Male 14057 3272
## 2 ; 0 1; 1 4000 752; 2 3323 677; 3 11411 2485; 4 5814 1674; 6 2698 536; 7 2017 337
## 3 ; 0 1; 0 29263 3079; 1 0 3382
## 4 ; 0 1; 0 29258 6422; 1 5 39
## 5 ; 0 1; 0 23020 3397; 1 6243 3064
## 6 ; 0 1; 0 2129 971; 1 27134 5490
## 7 ; 0 1; 0 6181 1773; 1 23082 4688
## 8 ; 0 1; 1 8054 877; 2 7581 1350; 3 7479 1452; 4 6149 2782
## Chi_sq_p
## 1 1.574765e-04
## 2 1.666542e-28
## 3 0.000000e+00
## 4 5.105558e-33
## 5 0.000000e+00
## 6 4.318117e-89
## 7 2.621468e-28
## 8 0.000000e+00
library(dplyr)
# Continuous + categorical summary একসাথে
full_summary <- bind_rows(
cont_summary %>% mutate(Type = "Continuous"),
cat_summary %>% mutate(Type = "Categorical")
)
# CSV আকারে save করতে
#write.csv(full_summary, "nhanes_descriptive_summary.csv", row.names = FALSE)
# preview
head(full_summary, 20)## Variable Mean_0 SD_0 Mean_1 SD_1 Median_0
## 25% hsCRP 2.4579736 5.552821 4.6462250 9.655209 1.3200000
## 25%1 hsCRP_log 0.2992454 1.030861 0.8852124 1.059288 0.3506569
## 25%2 age 29.9023340 23.235217 60.9770933 14.675713 24.0000000
## 25%3 PIR 2.4176732 1.536115 2.3887633 1.471545 2.0600000
## 25%4 BMI 25.4244097 7.112767 30.9853738 7.536552 25.8000000
## 25%5 waist 86.7009227 19.754291 104.4402414 16.570545 91.0000000
## 25%6 TC 176.7344428 32.049247 183.6033122 42.296342 175.0000000
## 25%7 HDL 53.5878071 12.002130 52.2705464 15.447690 52.0000000
## 25%8 HbA1c 5.4631788 0.391398 6.4871227 1.588206 5.5000000
## ...10 gender NA NA NA NA NA
## ...11 race NA NA NA NA NA
## ...12 diabetes NA NA NA NA NA
## ...13 kidney NA NA NA NA NA
## ...14 smoking NA NA NA NA NA
## ...15 alcohol NA NA NA NA NA
## ...16 physical NA NA NA NA NA
## Q1_0 Q3_0 Median_1 Q1_1 Q3_1 t_test_p
## 25% 0.6100000 1.9700000 1.9300000 1.2000000 4.800000 7.311118e-68
## 25%1 -0.3424903 0.7275486 0.7080358 0.2623643 1.589235 0.000000e+00
## 25%2 10.0000000 48.0000000 63.0000000 52.0000000 72.000000 0.000000e+00
## 25%3 1.1950000 3.6000000 2.0600000 1.2300000 3.370000 1.562883e-01
## 25%4 20.5000000 28.4000000 29.4000000 25.8000000 34.900000 0.000000e+00
## 25%5 74.3000000 96.5000000 102.1000000 91.0000000 114.800000 0.000000e+00
## 25%6 163.0000000 185.0000000 175.0000000 157.0000000 208.000000 1.882865e-34
## 25%7 48.0000000 56.0000000 52.0000000 42.0000000 58.000000 1.273363e-10
## 25%8 5.3000000 5.5000000 5.9000000 5.5000000 6.900000 0.000000e+00
## ...10 NA NA NA NA NA NA
## ...11 NA NA NA NA NA NA
## ...12 NA NA NA NA NA NA
## ...13 NA NA NA NA NA NA
## ...14 NA NA NA NA NA NA
## ...15 NA NA NA NA NA NA
## ...16 NA NA NA NA NA NA
## Type
## 25% Continuous
## 25%1 Continuous
## 25%2 Continuous
## 25%3 Continuous
## 25%4 Continuous
## 25%5 Continuous
## 25%6 Continuous
## 25%7 Continuous
## 25%8 Continuous
## ...10 Categorical
## ...11 Categorical
## ...12 Categorical
## ...13 Categorical
## ...14 Categorical
## ...15 Categorical
## ...16 Categorical
## Table
## 25% <NA>
## 25%1 <NA>
## 25%2 <NA>
## 25%3 <NA>
## 25%4 <NA>
## 25%5 <NA>
## 25%6 <NA>
## 25%7 <NA>
## 25%8 <NA>
## ...10 ; 0 1; Female 15206 3189; Male 14057 3272
## ...11 ; 0 1; 1 4000 752; 2 3323 677; 3 11411 2485; 4 5814 1674; 6 2698 536; 7 2017 337
## ...12 ; 0 1; 0 29263 3079; 1 0 3382
## ...13 ; 0 1; 0 29258 6422; 1 5 39
## ...14 ; 0 1; 0 23020 3397; 1 6243 3064
## ...15 ; 0 1; 0 2129 971; 1 27134 5490
## ...16 ; 0 1; 0 6181 1773; 1 23082 4688
## Chi_sq_p
## 25% NA
## 25%1 NA
## 25%2 NA
## 25%3 NA
## 25%4 NA
## 25%5 NA
## 25%6 NA
## 25%7 NA
## 25%8 NA
## ...10 1.574765e-04
## ...11 1.666542e-28
## ...12 0.000000e+00
## ...13 5.105558e-33
## ...14 0.000000e+00
## ...15 4.318117e-89
## ...16 2.621468e-28