📘 Load Libraries

library(tidyverse)
library(dplyr)
library(DBI) #contains functions to interect with the databasa
library(RSQLite) #SQLite database driver
library(jsonlite)
library(haven)
library(janitor)
library(readxl)
library(writexl)
library(openxlsx)
library(xlsx)
library(microbenchmark)
library(data.table)
library(readr)
library(remotes)
library(tidyr)
library(dbplyr)
library(knitr)       
library(kableExtra)    
library(psych)
library(readxl)
library(DescTools)
library(ggplot2)
library(e1071)
library(moments)
library(skimr)
library(tableone)
library(table1)
library(gtsummary)
library(gmodels)
library(naniar)
library(ggthemes)
library(ggThemeAssist)
library(forcats)
library(ggpubr)
library(patchwork)
library(reshape2)
library(car)
library(lares)
library(ggstatsplot)
library(GGally)
library(PerformanceAnalytics)
library(heatmap3)
library(sjPlot)
library(sjmisc)
library(pairsD3)
library(gridExtra) 
library(gridExtra) 
library(lmtest) 
library(titanic) 
library(tidyverse) 
library(caret) 
library(pROC)
library(gt)
library(corrplot)
nhanes_analysis <- read.csv("nhanes_analysis_without_outliar_hscrpcattegorisation.csv")

Continuous variables summary

# Continuous variables summary
library(dplyr)

nhanes_analysis %>%
  summarise(
    hsCRP_mean = mean(hsCRP, na.rm = TRUE),
    hsCRP_sd   = sd(hsCRP, na.rm = TRUE),
    hsCRP_median = median(hsCRP, na.rm = TRUE),
    
    age_mean = mean(age, na.rm = TRUE),
    age_sd   = sd(age, na.rm = TRUE),
    age_median = median(age, na.rm = TRUE),
    
    BMI_mean = mean(BMI, na.rm = TRUE),
    BMI_sd   = sd(BMI, na.rm = TRUE),
    BMI_median = median(BMI, na.rm = TRUE),
    
    waist_mean = mean(waist, na.rm = TRUE),
    waist_sd   = sd(waist, na.rm = TRUE),
    waist_median = median(waist, na.rm = TRUE),
    
    PIR_mean = mean(PIR, na.rm = TRUE),
    PIR_sd   = sd(PIR, na.rm = TRUE),
    PIR_median = median(PIR, na.rm = TRUE),
    
    TC_mean = mean(TC, na.rm = TRUE),
    TC_sd   = sd(TC, na.rm = TRUE),
    TC_median = median(TC, na.rm = TRUE),
    
    HDL_mean = mean(HDL, na.rm = TRUE),
    HDL_sd   = sd(HDL, na.rm = TRUE),
    HDL_median = median(HDL, na.rm = TRUE),
    
    HbA1c_mean = mean(HbA1c, na.rm = TRUE),
    HbA1c_sd   = sd(HbA1c, na.rm = TRUE),
    HbA1c_median = median(HbA1c, na.rm = TRUE)
  )
##   hsCRP_mean hsCRP_sd hsCRP_median age_mean   age_sd age_median BMI_mean
## 1   2.853738  6.54405         1.32 35.52248 24.98483         33 26.43016
##     BMI_sd BMI_median waist_mean waist_sd waist_median PIR_mean  PIR_sd
## 1 7.502948       25.8   89.90923 20.39434           91 2.412445 1.52466
##   PIR_median  TC_mean    TC_sd TC_median HDL_mean   HDL_sd HDL_median
## 1       2.06 177.9767 34.23275       175 53.34957 12.70463         52
##   HbA1c_mean  HbA1c_sd HbA1c_median
## 1   5.648368 0.8584626          5.5

Categorical variables summary

# Categorical variables summary
library(dplyr)

categorical_vars <- c("hypertension", "gender", "race", "diabetes", "kidney", 
                      "smoking", "alcohol", "physical")

for (var in categorical_vars) {
  print(var)
  print(table(nhanes_analysis[[var]], useNA = "ifany"))
  print(round(prop.table(table(nhanes_analysis[[var]]))*100, 2))  # percentage
  cat("\n")
}
## [1] "hypertension"
## 
##     0     1 
## 29263  6461 
## 
##     0     1 
## 81.91 18.09 
## 
## [1] "gender"
## 
## Female   Male 
##  18395  17329 
## 
## Female   Male 
##  51.49  48.51 
## 
## [1] "race"
## 
##     1     2     3     4     6     7 
##  4752  4000 13896  7488  3234  2354 
## 
##     1     2     3     4     6     7 
## 13.30 11.20 38.90 20.96  9.05  6.59 
## 
## [1] "diabetes"
## 
##     0     1 
## 32342  3382 
## 
##     0     1 
## 90.53  9.47 
## 
## [1] "kidney"
## 
##     0     1 
## 35680    44 
## 
##     0     1 
## 99.88  0.12 
## 
## [1] "smoking"
## 
##     0     1 
## 26417  9307 
## 
##     0     1 
## 73.95 26.05 
## 
## [1] "alcohol"
## 
##     0     1 
##  3100 32624 
## 
##     0     1 
##  8.68 91.32 
## 
## [1] "physical"
## 
##     0     1 
##  7954 27770 
## 
##     0     1 
## 22.27 77.73
con <- dbConnect(drv = SQLite(),":memory:")
library(ggplot2)

# Histogram for continuous
ggplot(nhanes_analysis, aes(x = hsCRP)) +
  geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of hsCRP")

# Bar plot for categorical
ggplot(nhanes_analysis, aes(x = factor(hypertension))) +
  geom_bar(fill = "orange") +
  labs(title = "Hypertension Frequency", x = "Hypertension", y = "Count")

1️⃣ Missing Data Overview

library(dplyr)
library(naniar)  # Missing data visualization

# Missing data summary
missing_summary <- nhanes_analysis %>%
  summarise(across(everything(), ~sum(is.na(.))))

missing_summary
##   SEQN hsCRP hypertension age gender race PIR BMI waist diabetes kidney smoking
## 1    0     0            0   0      0    0   0   0     0        0      0       0
##   alcohol physical TC HDL HbA1c hsCRP_log hsCRP_cap hsCRP_quartile
## 1       0        0  0   0     0         0         0              0
# Missing data visualization
vis_miss(nhanes_analysis)

2️⃣ Continuous Variables Summary

Step 2a: Basic statistics

continuous_vars <- c("hsCRP", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")

nhanes_analysis %>%
  select(all_of(continuous_vars)) %>%
  summarise(across(everything(), list(
    mean = ~mean(.x, na.rm = TRUE),
    sd = ~sd(.x, na.rm = TRUE),
    median = ~median(.x, na.rm = TRUE),
    min = ~min(.x, na.rm = TRUE),
    max = ~max(.x, na.rm = TRUE),
    Q1 = ~quantile(.x, 0.25, na.rm = TRUE),
    Q3 = ~quantile(.x, 0.75, na.rm = TRUE)
  )))
##   hsCRP_mean hsCRP_sd hsCRP_median hsCRP_min hsCRP_max hsCRP_Q1 hsCRP_Q3
## 1   2.853738  6.54405         1.32      0.08    246.86      0.7      2.5
##   age_mean   age_sd age_median age_min age_max age_Q1 age_Q3 PIR_mean  PIR_sd
## 1 35.52248 24.98483         33       1      80     12     58 2.412445 1.52466
##   PIR_median PIR_min PIR_max PIR_Q1 PIR_Q3 BMI_mean   BMI_sd BMI_median BMI_min
## 1       2.06       0       5    1.2   3.55 26.43016 7.502948       25.8    11.1
##   BMI_max BMI_Q1 BMI_Q3 waist_mean waist_sd waist_median waist_min waist_max
## 1    92.3   21.7   29.9   89.90923 20.39434           91      39.8       187
##   waist_Q1 waist_Q3  TC_mean    TC_sd TC_median TC_min TC_max TC_Q1 TC_Q3
## 1     78.5    101.3 177.9767 34.23275       175     62    545   162   189
##   HDL_mean   HDL_sd HDL_median HDL_min HDL_max HDL_Q1 HDL_Q3 HbA1c_mean
## 1 53.34957 12.70463         52       5     226     47     57   5.648368
##    HbA1c_sd HbA1c_median HbA1c_min HbA1c_max HbA1c_Q1 HbA1c_Q3
## 1 0.8584626          5.5       2.8      17.1      5.4      5.6

Step 2b: Distribution plots

library(ggplot2)

# Histogram + Density overlay
for (var in continuous_vars) {
  ggplot(nhanes_analysis, aes_string(x = var)) +
    geom_histogram(aes(y = ..density..), binwidth = 0.5, fill = "skyblue", color = "black") +
    geom_density(color = "red") +
    labs(title = paste("Distribution of", var)) +
    theme_minimal() -> p
  print(p)
}

Step 2c: Boxplots (Outlier detection)

for (var in continuous_vars) {
  ggplot(nhanes_analysis, aes_string(y = var)) +
    geom_boxplot(fill = "orange") +
    labs(title = paste("Boxplot of", var)) +
    theme_minimal() -> p
  print(p)
}

3️⃣ Categorical Variables Summary

Step 3a: Frequency + %

categorical_vars <- c("hypertension", "gender", "race", "diabetes", "kidney", 
                      "smoking", "alcohol", "physical")

for (var in categorical_vars) {
  print(var)
  print(table(nhanes_analysis[[var]], useNA = "ifany"))
  print(round(prop.table(table(nhanes_analysis[[var]]))*100, 2))
  cat("\n")
}
## [1] "hypertension"
## 
##     0     1 
## 29263  6461 
## 
##     0     1 
## 81.91 18.09 
## 
## [1] "gender"
## 
## Female   Male 
##  18395  17329 
## 
## Female   Male 
##  51.49  48.51 
## 
## [1] "race"
## 
##     1     2     3     4     6     7 
##  4752  4000 13896  7488  3234  2354 
## 
##     1     2     3     4     6     7 
## 13.30 11.20 38.90 20.96  9.05  6.59 
## 
## [1] "diabetes"
## 
##     0     1 
## 32342  3382 
## 
##     0     1 
## 90.53  9.47 
## 
## [1] "kidney"
## 
##     0     1 
## 35680    44 
## 
##     0     1 
## 99.88  0.12 
## 
## [1] "smoking"
## 
##     0     1 
## 26417  9307 
## 
##     0     1 
## 73.95 26.05 
## 
## [1] "alcohol"
## 
##     0     1 
##  3100 32624 
## 
##     0     1 
##  8.68 91.32 
## 
## [1] "physical"
## 
##     0     1 
##  7954 27770 
## 
##     0     1 
## 22.27 77.73

Step 3b: Barplots

for (var in categorical_vars) {
  ggplot(nhanes_analysis, aes_string(x = var)) +
    geom_bar(fill = "steelblue") +
    labs(title = paste("Barplot of", var)) +
    theme_minimal() -> p
  print(p)
}

4️⃣ Stratified Descriptive Analysis

# Example: hsCRP by hypertension
nhanes_analysis %>%
  group_by(hypertension) %>%
  summarise(
    mean_hsCRP = mean(hsCRP, na.rm = TRUE),
    sd_hsCRP = sd(hsCRP, na.rm = TRUE),
    median_hsCRP = median(hsCRP, na.rm = TRUE),
    n = n()
  )
## # A tibble: 2 × 5
##   hypertension mean_hsCRP sd_hsCRP median_hsCRP     n
##          <int>      <dbl>    <dbl>        <dbl> <int>
## 1            0       2.46     5.55         1.32 29263
## 2            1       4.65     9.66         1.93  6461
# Boxplot by hypertension
ggplot(nhanes_analysis, aes(x = factor(hypertension), y = hsCRP)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "hsCRP by Hypertension", x = "Hypertension", y = "hsCRP") +
  theme_minimal()

Stratify by multiple variables (like gender + hypertension):

nhanes_analysis %>%
  group_by(gender, hypertension) %>%
  summarise(
    mean_hsCRP = mean(hsCRP, na.rm = TRUE),
    sd_hsCRP = sd(hsCRP, na.rm = TRUE),
    n = n()
  )
## # A tibble: 4 × 5
## # Groups:   gender [2]
##   gender hypertension mean_hsCRP sd_hsCRP     n
##   <chr>         <int>      <dbl>    <dbl> <int>
## 1 Female            0       2.78     6.00 15206
## 2 Female            1       5.04     8.57  3189
## 3 Male              0       2.11     5.00 14057
## 4 Male              1       4.26    10.6   3272

5️⃣ Correlation Matrix for Continuous Variables

library(corrplot)

continuous_data <- nhanes_analysis %>% select(all_of(continuous_vars))
corr_matrix <- cor(continuous_data, use = "complete.obs")

corrplot(corr_matrix, method = "color", addCoef.col = "black", number.cex = 0.7)

6️⃣ Advanced Summary Table (Report-ready)

library(gtsummary)

# Continuous + Categorical variables একসাথে
tbl <- nhanes_analysis %>%
  select(hsCRP,hsCRP_cap,hsCRP_log,hsCRP_quartile, hypertension, age, gender, race, PIR, BMI, waist, diabetes, kidney, smoking, alcohol, physical, TC, HDL, HbA1c) %>%
  tbl_summary(
    by = hypertension,  # stratify by outcome
    missing = "no",
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  add_overall() %>%
  add_p()  # p-value by group

tbl
Characteristic Overall
N = 35,724
1
0
N = 29,263
1
1
N = 6,461
1
p-value2
hsCRP 2.85 (6.54) 2.46 (5.55) 4.65 (9.66) <0.001
hsCRP_cap 2.63 (4.20) 2.31 (3.77) 4.11 (5.52) <0.001
hsCRP_log 0.41 (1.06) 0.30 (1.03) 0.89 (1.06) <0.001
hsCRP_quartile


<0.001
    1 8,931 (25%) 8,054 (28%) 877 (14%)
    2 8,931 (25%) 7,581 (26%) 1,350 (21%)
    3 8,931 (25%) 7,479 (26%) 1,452 (22%)
    4 8,931 (25%) 6,149 (21%) 2,782 (43%)
age 36 (25) 30 (23) 61 (15) <0.001
gender


<0.001
    Female 18,395 (51%) 15,206 (52%) 3,189 (49%)
    Male 17,329 (49%) 14,057 (48%) 3,272 (51%)
race


<0.001
    1 4,752 (13%) 4,000 (14%) 752 (12%)
    2 4,000 (11%) 3,323 (11%) 677 (10%)
    3 13,896 (39%) 11,411 (39%) 2,485 (38%)
    4 7,488 (21%) 5,814 (20%) 1,674 (26%)
    6 3,234 (9.1%) 2,698 (9.2%) 536 (8.3%)
    7 2,354 (6.6%) 2,017 (6.9%) 337 (5.2%)
PIR 2.41 (1.52) 2.42 (1.54) 2.39 (1.47) 0.6
BMI 26 (8) 25 (7) 31 (8) <0.001
waist 90 (20) 87 (20) 104 (17) <0.001
diabetes 3,382 (9.5%) 0 (0%) 3,382 (52%) <0.001
kidney 44 (0.1%) 5 (<0.1%) 39 (0.6%) <0.001
smoking 9,307 (26%) 6,243 (21%) 3,064 (47%) <0.001
alcohol 32,624 (91%) 27,134 (93%) 5,490 (85%) <0.001
physical 27,770 (78%) 23,082 (79%) 4,688 (73%) <0.001
TC 178 (34) 177 (32) 184 (42) <0.001
HDL 53 (13) 54 (12) 52 (15) <0.001
HbA1c 5.65 (0.86) 5.46 (0.39) 6.49 (1.59) <0.001
1 Mean (SD); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test

After outliar handling

#.rs.restartR()


library(dplyr)
library(gtsummary)
library(writexl)

cat_vars <- c("gender", "hypertension", "kidney", "diabetes", 
              "smoking", "alcohol", "physical", "hsCRP_quartile")
cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")

nhanes_analysis <- nhanes_analysis %>%
  mutate(across(all_of(cat_vars), as.factor))

descriptive_table <- tbl_summary(
  data = nhanes_analysis,
  include = c(cont_vars, cat_vars),
  statistic = list(
    all_continuous() ~ "{mean} ± {sd} ({median}, {p25}-{p75})",
    all_categorical() ~ "{n} ({p}%)"
  ),
  missing = "no"
)

descriptive_df <- descriptive_table$table_body %>% as.data.frame()
#write_xlsx(descriptive_df, "NHANES_Descriptive_Summary.xlsx")

1️⃣ Distribution of Key Variables

Continuous variables (age, BMI, waist, hsCRP, TC, HDL, HbA1c):

# Continuous variables summary
summary(nhanes_analysis[, c("age","BMI","waist","hsCRP","TC","HDL","HbA1c")])
##       age             BMI            waist            hsCRP        
##  Min.   : 1.00   Min.   :11.10   Min.   : 39.80   Min.   :  0.080  
##  1st Qu.:12.00   1st Qu.:21.70   1st Qu.: 78.50   1st Qu.:  0.700  
##  Median :33.00   Median :25.80   Median : 91.00   Median :  1.320  
##  Mean   :35.52   Mean   :26.43   Mean   : 89.91   Mean   :  2.854  
##  3rd Qu.:58.00   3rd Qu.:29.90   3rd Qu.:101.30   3rd Qu.:  2.500  
##  Max.   :80.00   Max.   :92.30   Max.   :187.00   Max.   :246.860  
##        TC           HDL             HbA1c       
##  Min.   : 62   Min.   :  5.00   Min.   : 2.800  
##  1st Qu.:162   1st Qu.: 47.00   1st Qu.: 5.400  
##  Median :175   Median : 52.00   Median : 5.500  
##  Mean   :178   Mean   : 53.35   Mean   : 5.648  
##  3rd Qu.:189   3rd Qu.: 57.00   3rd Qu.: 5.600  
##  Max.   :545   Max.   :226.00   Max.   :17.100
# Histograms
library(ggplot2)
ggplot(nhanes_analysis, aes(x=hsCRP)) + 
  geom_histogram(binwidth=0.5, fill="skyblue", color="black") +
  theme_minimal() +
  labs(title="Histogram of hsCRP", x="hsCRP", y="Frequency")

summary(nhanes_analysis$hsCRP)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.080   0.700   1.320   2.854   2.500 246.860

Categorical variables (gender, hypertension, kidney, diabetes, smoking, alcohol, physical):

# Frequency tables
table(nhanes_analysis$gender)
## 
## Female   Male 
##  18395  17329
table(nhanes_analysis$hypertension)
## 
##     0     1 
## 29263  6461
table(nhanes_analysis$diabetes)
## 
##     0     1 
## 32342  3382
# Bar plot example
ggplot(nhanes_analysis, aes(x=factor(hypertension))) +
  geom_bar(fill="lightgreen") +
  theme_minimal() +
  labs(title="Hypertension Distribution", x="Hypertension", y="Count")

2️⃣ Stratified Descriptive Tables

library(gtsummary)

# By Gender
tbl_gender <- nhanes_analysis %>%
  tbl_summary(
    by = "gender",
    statistic = list(all_continuous() ~ "{mean} ({sd})", all_categorical() ~ "{n} / {N} ({p}%)")
  )
tbl_gender
Characteristic Female
N = 18,395
1
Male
N = 17,329
1
SEQN 116,122 (18,670) 115,381 (18,467)
hsCRP 3.17 (6.58) 2.52 (6.49)
hypertension

    0 15,206 / 18,395 (83%) 14,057 / 17,329 (81%)
    1 3,189 / 18,395 (17%) 3,272 / 17,329 (19%)
age 36 (25) 35 (25)
race

    1 2,488 / 18,395 (14%) 2,264 / 17,329 (13%)
    2 2,098 / 18,395 (11%) 1,902 / 17,329 (11%)
    3 7,144 / 18,395 (39%) 6,752 / 17,329 (39%)
    4 3,858 / 18,395 (21%) 3,630 / 17,329 (21%)
    6 1,670 / 18,395 (9.1%) 1,564 / 17,329 (9.0%)
    7 1,137 / 18,395 (6.2%) 1,217 / 17,329 (7.0%)
PIR 2.37 (1.52) 2.45 (1.53)
BMI 27 (8) 26 (7)
waist 89 (20) 90 (21)
diabetes

    0 16,768 / 18,395 (91%) 15,574 / 17,329 (90%)
    1 1,627 / 18,395 (8.8%) 1,755 / 17,329 (10%)
kidney

    0 18,371 / 18,395 (100%) 17,309 / 17,329 (100%)
    1 24 / 18,395 (0.1%) 20 / 17,329 (0.1%)
smoking

    0 14,460 / 18,395 (79%) 11,957 / 17,329 (69%)
    1 3,935 / 18,395 (21%) 5,372 / 17,329 (31%)
alcohol

    0 2,065 / 18,395 (11%) 1,035 / 17,329 (6.0%)
    1 16,330 / 18,395 (89%) 16,294 / 17,329 (94%)
physical

    0 3,670 / 18,395 (20%) 4,284 / 17,329 (25%)
    1 14,725 / 18,395 (80%) 13,045 / 17,329 (75%)
TC 180 (34) 175 (34)
HDL 56 (13) 51 (11)
HbA1c 5.63 (0.84) 5.66 (0.88)
hsCRP_log 0.50 (1.09) 0.31 (1.02)
hsCRP_cap 2.96 (4.54) 2.29 (3.77)
hsCRP_quartile

    1 4,189 / 18,395 (23%) 4,742 / 17,329 (27%)
    2 4,360 / 18,395 (24%) 4,571 / 17,329 (26%)
    3 4,590 / 18,395 (25%) 4,341 / 17,329 (25%)
    4 5,256 / 18,395 (29%) 3,675 / 17,329 (21%)
1 Mean (SD); n / N (%)
# By Hypertension
tbl_htn <- nhanes_analysis %>%
  tbl_summary(
    by = "hypertension",
    statistic = list(all_continuous() ~ "{mean} ({sd})", all_categorical() ~ "{n} / {N} ({p}%)")
  )
tbl_htn
Characteristic 0
N = 29,263
1
1
N = 6,461
1
SEQN 115,819 (18,628) 115,507 (18,332)
hsCRP 2.46 (5.55) 4.65 (9.66)
age 30 (23) 61 (15)
gender

    Female 15,206 / 29,263 (52%) 3,189 / 6,461 (49%)
    Male 14,057 / 29,263 (48%) 3,272 / 6,461 (51%)
race

    1 4,000 / 29,263 (14%) 752 / 6,461 (12%)
    2 3,323 / 29,263 (11%) 677 / 6,461 (10%)
    3 11,411 / 29,263 (39%) 2,485 / 6,461 (38%)
    4 5,814 / 29,263 (20%) 1,674 / 6,461 (26%)
    6 2,698 / 29,263 (9.2%) 536 / 6,461 (8.3%)
    7 2,017 / 29,263 (6.9%) 337 / 6,461 (5.2%)
PIR 2.42 (1.54) 2.39 (1.47)
BMI 25 (7) 31 (8)
waist 87 (20) 104 (17)
diabetes

    0 29,263 / 29,263 (100%) 3,079 / 6,461 (48%)
    1 0 / 29,263 (0%) 3,382 / 6,461 (52%)
kidney

    0 29,258 / 29,263 (100%) 6,422 / 6,461 (99%)
    1 5 / 29,263 (<0.1%) 39 / 6,461 (0.6%)
smoking

    0 23,020 / 29,263 (79%) 3,397 / 6,461 (53%)
    1 6,243 / 29,263 (21%) 3,064 / 6,461 (47%)
alcohol

    0 2,129 / 29,263 (7.3%) 971 / 6,461 (15%)
    1 27,134 / 29,263 (93%) 5,490 / 6,461 (85%)
physical

    0 6,181 / 29,263 (21%) 1,773 / 6,461 (27%)
    1 23,082 / 29,263 (79%) 4,688 / 6,461 (73%)
TC 177 (32) 184 (42)
HDL 54 (12) 52 (15)
HbA1c 5.46 (0.39) 6.49 (1.59)
hsCRP_log 0.30 (1.03) 0.89 (1.06)
hsCRP_cap 2.31 (3.77) 4.11 (5.52)
hsCRP_quartile

    1 8,054 / 29,263 (28%) 877 / 6,461 (14%)
    2 7,581 / 29,263 (26%) 1,350 / 6,461 (21%)
    3 7,479 / 29,263 (26%) 1,452 / 6,461 (22%)
    4 6,149 / 29,263 (21%) 2,782 / 6,461 (43%)
1 Mean (SD); n / N (%)

3️⃣ hsCRP Categorization

# Quartiles
nhanes_analysis <- nhanes_analysis %>%
  mutate(hsCRP_quartile = ntile(hsCRP, 4))

table(nhanes_analysis$hsCRP_quartile)
## 
##    1    2    3    4 
## 8931 8931 8931 8931
# Boxplot of BMI by hsCRP quartile
ggplot(nhanes_analysis, aes(x=factor(hsCRP_quartile), y=BMI)) +
  geom_boxplot(fill="orange") +
  theme_minimal() +
  labs(title="BMI by hsCRP Quartile", x="hsCRP Quartile", y="BMI")

4️⃣ Correlation Analysis

# Continuous variables correlation
cor_vars <- nhanes_analysis[, c("hsCRP","BMI","waist","TC","HDL","HbA1c")]
cor(cor_vars, use="complete.obs")
##             hsCRP        BMI      waist        TC         HDL      HbA1c
## hsCRP  1.00000000  0.2756943  0.2438136 0.0304140 -0.08883523  0.1506126
## BMI    0.27569425  1.0000000  0.8895485 0.1351613 -0.20166659  0.2269670
## waist  0.24381363  0.8895485  1.0000000 0.1647872 -0.19949949  0.2439794
## TC     0.03041400  0.1351613  0.1647872 1.0000000  0.21603822  0.0807612
## HDL   -0.08883523 -0.2016666 -0.1994995 0.2160382  1.00000000 -0.1379476
## HbA1c  0.15061262  0.2269670  0.2439794 0.0807612 -0.13794760  1.0000000
# Scatter plot example
ggplot(nhanes_analysis, aes(x=BMI, y=hsCRP)) +
  geom_point(alpha=0.5) +
  geom_smooth(method="lm", color="red") +
  theme_minimal() +
  labs(title="hsCRP vs BMI", x="BMI", y="hsCRP")

5️⃣ Outlier Check

# Boxplots for continuous variables
ggplot(nhanes_analysis, aes(y=hsCRP)) +
  geom_boxplot(fill="pink") +
  theme_minimal() +
  labs(title="Boxplot of hsCRP", y="hsCRP")

# Optional: log-transformed hsCRP
ggplot(nhanes_analysis, aes(y=hsCRP_log)) +
  geom_boxplot(fill="lightblue") +
  theme_minimal() +
  labs(title="Boxplot of log-transformed hsCRP", y="log(hsCRP)")

6️⃣ Missing Data Summary

library(naniar)

# Missing data overview
gg_miss_var(nhanes_analysis)

# Count missing for each variable
miss_summary <- nhanes_analysis %>% summarise(across(everything(), ~sum(is.na(.))))
miss_summary
##   SEQN hsCRP hypertension age gender race PIR BMI waist diabetes kidney smoking
## 1    0     0            0   0      0    0   0   0     0        0      0       0
##   alcohol physical TC HDL HbA1c hsCRP_log hsCRP_cap hsCRP_quartile
## 1       0        0  0   0     0         0         0              0

7️⃣ Additional Visualizations

# hsCRP by Hypertension (Boxplot)
ggplot(nhanes_analysis, aes(x=factor(hypertension), y=hsCRP)) +
  geom_boxplot(fill="lightgreen") +
  theme_minimal() +
  labs(title="hsCRP by Hypertension", x="Hypertension", y="hsCRP")

# Stacked bar chart example
ggplot(nhanes_analysis, aes(x=factor(hypertension), fill=factor(smoking))) +
  geom_bar(position="fill") +
  theme_minimal() +
  labs(title="Smoking by Hypertension", x="Hypertension", y="Proportion")