library(tidyverse)
library(dplyr)
library(DBI) #contains functions to interect with the databasa
library(RSQLite) #SQLite database driver
library(jsonlite)
library(haven)
library(janitor)
library(readxl)
library(writexl)
library(openxlsx)
library(xlsx)
library(microbenchmark)
library(data.table)
library(readr)
library(remotes)
library(tidyr)
library(dbplyr)
library(knitr)
library(kableExtra)
library(psych)
library(readxl)
library(DescTools)
library(ggplot2)
library(e1071)
library(moments)
library(skimr)
library(tableone)
library(table1)
library(gtsummary)
library(gmodels)
library(naniar)
library(ggthemes)
library(ggThemeAssist)
library(forcats)
library(ggpubr)
library(patchwork)
library(reshape2)
library(car)
library(lares)
library(ggstatsplot)
library(GGally)
library(PerformanceAnalytics)
library(heatmap3)
library(sjPlot)
library(sjmisc)
library(pairsD3)
library(gridExtra)
library(gridExtra)
library(lmtest)
library(titanic)
library(tidyverse)
library(caret)
library(pROC)
library(gt)
library(corrplot)
nhanes_analysis <- read.csv("nhanes_analysis_without_outliar_hscrpcattegorisation.csv")
# Continuous variables summary
library(dplyr)
nhanes_analysis %>%
summarise(
hsCRP_mean = mean(hsCRP, na.rm = TRUE),
hsCRP_sd = sd(hsCRP, na.rm = TRUE),
hsCRP_median = median(hsCRP, na.rm = TRUE),
age_mean = mean(age, na.rm = TRUE),
age_sd = sd(age, na.rm = TRUE),
age_median = median(age, na.rm = TRUE),
BMI_mean = mean(BMI, na.rm = TRUE),
BMI_sd = sd(BMI, na.rm = TRUE),
BMI_median = median(BMI, na.rm = TRUE),
waist_mean = mean(waist, na.rm = TRUE),
waist_sd = sd(waist, na.rm = TRUE),
waist_median = median(waist, na.rm = TRUE),
PIR_mean = mean(PIR, na.rm = TRUE),
PIR_sd = sd(PIR, na.rm = TRUE),
PIR_median = median(PIR, na.rm = TRUE),
TC_mean = mean(TC, na.rm = TRUE),
TC_sd = sd(TC, na.rm = TRUE),
TC_median = median(TC, na.rm = TRUE),
HDL_mean = mean(HDL, na.rm = TRUE),
HDL_sd = sd(HDL, na.rm = TRUE),
HDL_median = median(HDL, na.rm = TRUE),
HbA1c_mean = mean(HbA1c, na.rm = TRUE),
HbA1c_sd = sd(HbA1c, na.rm = TRUE),
HbA1c_median = median(HbA1c, na.rm = TRUE)
)
## hsCRP_mean hsCRP_sd hsCRP_median age_mean age_sd age_median BMI_mean
## 1 2.853738 6.54405 1.32 35.52248 24.98483 33 26.43016
## BMI_sd BMI_median waist_mean waist_sd waist_median PIR_mean PIR_sd
## 1 7.502948 25.8 89.90923 20.39434 91 2.412445 1.52466
## PIR_median TC_mean TC_sd TC_median HDL_mean HDL_sd HDL_median
## 1 2.06 177.9767 34.23275 175 53.34957 12.70463 52
## HbA1c_mean HbA1c_sd HbA1c_median
## 1 5.648368 0.8584626 5.5
# Categorical variables summary
library(dplyr)
categorical_vars <- c("hypertension", "gender", "race", "diabetes", "kidney",
"smoking", "alcohol", "physical")
for (var in categorical_vars) {
print(var)
print(table(nhanes_analysis[[var]], useNA = "ifany"))
print(round(prop.table(table(nhanes_analysis[[var]]))*100, 2)) # percentage
cat("\n")
}
## [1] "hypertension"
##
## 0 1
## 29263 6461
##
## 0 1
## 81.91 18.09
##
## [1] "gender"
##
## Female Male
## 18395 17329
##
## Female Male
## 51.49 48.51
##
## [1] "race"
##
## 1 2 3 4 6 7
## 4752 4000 13896 7488 3234 2354
##
## 1 2 3 4 6 7
## 13.30 11.20 38.90 20.96 9.05 6.59
##
## [1] "diabetes"
##
## 0 1
## 32342 3382
##
## 0 1
## 90.53 9.47
##
## [1] "kidney"
##
## 0 1
## 35680 44
##
## 0 1
## 99.88 0.12
##
## [1] "smoking"
##
## 0 1
## 26417 9307
##
## 0 1
## 73.95 26.05
##
## [1] "alcohol"
##
## 0 1
## 3100 32624
##
## 0 1
## 8.68 91.32
##
## [1] "physical"
##
## 0 1
## 7954 27770
##
## 0 1
## 22.27 77.73
con <- dbConnect(drv = SQLite(),":memory:")
library(ggplot2)
# Histogram for continuous
ggplot(nhanes_analysis, aes(x = hsCRP)) +
geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black") +
labs(title = "Distribution of hsCRP")
# Bar plot for categorical
ggplot(nhanes_analysis, aes(x = factor(hypertension))) +
geom_bar(fill = "orange") +
labs(title = "Hypertension Frequency", x = "Hypertension", y = "Count")
library(dplyr)
library(naniar) # Missing data visualization
# Missing data summary
missing_summary <- nhanes_analysis %>%
summarise(across(everything(), ~sum(is.na(.))))
missing_summary
## SEQN hsCRP hypertension age gender race PIR BMI waist diabetes kidney smoking
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## alcohol physical TC HDL HbA1c hsCRP_log hsCRP_cap hsCRP_quartile
## 1 0 0 0 0 0 0 0 0
# Missing data visualization
vis_miss(nhanes_analysis)
Step 2a: Basic statistics
continuous_vars <- c("hsCRP", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")
nhanes_analysis %>%
select(all_of(continuous_vars)) %>%
summarise(across(everything(), list(
mean = ~mean(.x, na.rm = TRUE),
sd = ~sd(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE),
min = ~min(.x, na.rm = TRUE),
max = ~max(.x, na.rm = TRUE),
Q1 = ~quantile(.x, 0.25, na.rm = TRUE),
Q3 = ~quantile(.x, 0.75, na.rm = TRUE)
)))
## hsCRP_mean hsCRP_sd hsCRP_median hsCRP_min hsCRP_max hsCRP_Q1 hsCRP_Q3
## 1 2.853738 6.54405 1.32 0.08 246.86 0.7 2.5
## age_mean age_sd age_median age_min age_max age_Q1 age_Q3 PIR_mean PIR_sd
## 1 35.52248 24.98483 33 1 80 12 58 2.412445 1.52466
## PIR_median PIR_min PIR_max PIR_Q1 PIR_Q3 BMI_mean BMI_sd BMI_median BMI_min
## 1 2.06 0 5 1.2 3.55 26.43016 7.502948 25.8 11.1
## BMI_max BMI_Q1 BMI_Q3 waist_mean waist_sd waist_median waist_min waist_max
## 1 92.3 21.7 29.9 89.90923 20.39434 91 39.8 187
## waist_Q1 waist_Q3 TC_mean TC_sd TC_median TC_min TC_max TC_Q1 TC_Q3
## 1 78.5 101.3 177.9767 34.23275 175 62 545 162 189
## HDL_mean HDL_sd HDL_median HDL_min HDL_max HDL_Q1 HDL_Q3 HbA1c_mean
## 1 53.34957 12.70463 52 5 226 47 57 5.648368
## HbA1c_sd HbA1c_median HbA1c_min HbA1c_max HbA1c_Q1 HbA1c_Q3
## 1 0.8584626 5.5 2.8 17.1 5.4 5.6
Step 2b: Distribution plots
library(ggplot2)
# Histogram + Density overlay
for (var in continuous_vars) {
ggplot(nhanes_analysis, aes_string(x = var)) +
geom_histogram(aes(y = ..density..), binwidth = 0.5, fill = "skyblue", color = "black") +
geom_density(color = "red") +
labs(title = paste("Distribution of", var)) +
theme_minimal() -> p
print(p)
}
Step 2c: Boxplots (Outlier detection)
for (var in continuous_vars) {
ggplot(nhanes_analysis, aes_string(y = var)) +
geom_boxplot(fill = "orange") +
labs(title = paste("Boxplot of", var)) +
theme_minimal() -> p
print(p)
}
Step 3a: Frequency + %
categorical_vars <- c("hypertension", "gender", "race", "diabetes", "kidney",
"smoking", "alcohol", "physical")
for (var in categorical_vars) {
print(var)
print(table(nhanes_analysis[[var]], useNA = "ifany"))
print(round(prop.table(table(nhanes_analysis[[var]]))*100, 2))
cat("\n")
}
## [1] "hypertension"
##
## 0 1
## 29263 6461
##
## 0 1
## 81.91 18.09
##
## [1] "gender"
##
## Female Male
## 18395 17329
##
## Female Male
## 51.49 48.51
##
## [1] "race"
##
## 1 2 3 4 6 7
## 4752 4000 13896 7488 3234 2354
##
## 1 2 3 4 6 7
## 13.30 11.20 38.90 20.96 9.05 6.59
##
## [1] "diabetes"
##
## 0 1
## 32342 3382
##
## 0 1
## 90.53 9.47
##
## [1] "kidney"
##
## 0 1
## 35680 44
##
## 0 1
## 99.88 0.12
##
## [1] "smoking"
##
## 0 1
## 26417 9307
##
## 0 1
## 73.95 26.05
##
## [1] "alcohol"
##
## 0 1
## 3100 32624
##
## 0 1
## 8.68 91.32
##
## [1] "physical"
##
## 0 1
## 7954 27770
##
## 0 1
## 22.27 77.73
Step 3b: Barplots
for (var in categorical_vars) {
ggplot(nhanes_analysis, aes_string(x = var)) +
geom_bar(fill = "steelblue") +
labs(title = paste("Barplot of", var)) +
theme_minimal() -> p
print(p)
}
# Example: hsCRP by hypertension
nhanes_analysis %>%
group_by(hypertension) %>%
summarise(
mean_hsCRP = mean(hsCRP, na.rm = TRUE),
sd_hsCRP = sd(hsCRP, na.rm = TRUE),
median_hsCRP = median(hsCRP, na.rm = TRUE),
n = n()
)
## # A tibble: 2 × 5
## hypertension mean_hsCRP sd_hsCRP median_hsCRP n
## <int> <dbl> <dbl> <dbl> <int>
## 1 0 2.46 5.55 1.32 29263
## 2 1 4.65 9.66 1.93 6461
# Boxplot by hypertension
ggplot(nhanes_analysis, aes(x = factor(hypertension), y = hsCRP)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "hsCRP by Hypertension", x = "Hypertension", y = "hsCRP") +
theme_minimal()
Stratify by multiple variables (like gender + hypertension):
nhanes_analysis %>%
group_by(gender, hypertension) %>%
summarise(
mean_hsCRP = mean(hsCRP, na.rm = TRUE),
sd_hsCRP = sd(hsCRP, na.rm = TRUE),
n = n()
)
## # A tibble: 4 × 5
## # Groups: gender [2]
## gender hypertension mean_hsCRP sd_hsCRP n
## <chr> <int> <dbl> <dbl> <int>
## 1 Female 0 2.78 6.00 15206
## 2 Female 1 5.04 8.57 3189
## 3 Male 0 2.11 5.00 14057
## 4 Male 1 4.26 10.6 3272
library(corrplot)
continuous_data <- nhanes_analysis %>% select(all_of(continuous_vars))
corr_matrix <- cor(continuous_data, use = "complete.obs")
corrplot(corr_matrix, method = "color", addCoef.col = "black", number.cex = 0.7)
library(gtsummary)
# Continuous + Categorical variables একসাথে
tbl <- nhanes_analysis %>%
select(hsCRP,hsCRP_cap,hsCRP_log,hsCRP_quartile, hypertension, age, gender, race, PIR, BMI, waist, diabetes, kidney, smoking, alcohol, physical, TC, HDL, HbA1c) %>%
tbl_summary(
by = hypertension, # stratify by outcome
missing = "no",
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
)
) %>%
add_overall() %>%
add_p() # p-value by group
tbl
| Characteristic | Overall N = 35,7241 |
0 N = 29,2631 |
1 N = 6,4611 |
p-value2 |
|---|---|---|---|---|
| hsCRP | 2.85 (6.54) | 2.46 (5.55) | 4.65 (9.66) | <0.001 |
| hsCRP_cap | 2.63 (4.20) | 2.31 (3.77) | 4.11 (5.52) | <0.001 |
| hsCRP_log | 0.41 (1.06) | 0.30 (1.03) | 0.89 (1.06) | <0.001 |
| hsCRP_quartile | <0.001 | |||
| 1 | 8,931 (25%) | 8,054 (28%) | 877 (14%) | |
| 2 | 8,931 (25%) | 7,581 (26%) | 1,350 (21%) | |
| 3 | 8,931 (25%) | 7,479 (26%) | 1,452 (22%) | |
| 4 | 8,931 (25%) | 6,149 (21%) | 2,782 (43%) | |
| age | 36 (25) | 30 (23) | 61 (15) | <0.001 |
| gender | <0.001 | |||
| Female | 18,395 (51%) | 15,206 (52%) | 3,189 (49%) | |
| Male | 17,329 (49%) | 14,057 (48%) | 3,272 (51%) | |
| race | <0.001 | |||
| 1 | 4,752 (13%) | 4,000 (14%) | 752 (12%) | |
| 2 | 4,000 (11%) | 3,323 (11%) | 677 (10%) | |
| 3 | 13,896 (39%) | 11,411 (39%) | 2,485 (38%) | |
| 4 | 7,488 (21%) | 5,814 (20%) | 1,674 (26%) | |
| 6 | 3,234 (9.1%) | 2,698 (9.2%) | 536 (8.3%) | |
| 7 | 2,354 (6.6%) | 2,017 (6.9%) | 337 (5.2%) | |
| PIR | 2.41 (1.52) | 2.42 (1.54) | 2.39 (1.47) | 0.6 |
| BMI | 26 (8) | 25 (7) | 31 (8) | <0.001 |
| waist | 90 (20) | 87 (20) | 104 (17) | <0.001 |
| diabetes | 3,382 (9.5%) | 0 (0%) | 3,382 (52%) | <0.001 |
| kidney | 44 (0.1%) | 5 (<0.1%) | 39 (0.6%) | <0.001 |
| smoking | 9,307 (26%) | 6,243 (21%) | 3,064 (47%) | <0.001 |
| alcohol | 32,624 (91%) | 27,134 (93%) | 5,490 (85%) | <0.001 |
| physical | 27,770 (78%) | 23,082 (79%) | 4,688 (73%) | <0.001 |
| TC | 178 (34) | 177 (32) | 184 (42) | <0.001 |
| HDL | 53 (13) | 54 (12) | 52 (15) | <0.001 |
| HbA1c | 5.65 (0.86) | 5.46 (0.39) | 6.49 (1.59) | <0.001 |
| 1 Mean (SD); n (%) | ||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | ||||
#.rs.restartR()
library(dplyr)
library(gtsummary)
library(writexl)
cat_vars <- c("gender", "hypertension", "kidney", "diabetes",
"smoking", "alcohol", "physical", "hsCRP_quartile")
cont_vars <- c("hsCRP", "hsCRP_cap", "hsCRP_log", "age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c")
nhanes_analysis <- nhanes_analysis %>%
mutate(across(all_of(cat_vars), as.factor))
descriptive_table <- tbl_summary(
data = nhanes_analysis,
include = c(cont_vars, cat_vars),
statistic = list(
all_continuous() ~ "{mean} ± {sd} ({median}, {p25}-{p75})",
all_categorical() ~ "{n} ({p}%)"
),
missing = "no"
)
descriptive_df <- descriptive_table$table_body %>% as.data.frame()
#write_xlsx(descriptive_df, "NHANES_Descriptive_Summary.xlsx")
Continuous variables (age, BMI, waist, hsCRP, TC, HDL, HbA1c):
# Continuous variables summary
summary(nhanes_analysis[, c("age","BMI","waist","hsCRP","TC","HDL","HbA1c")])
## age BMI waist hsCRP
## Min. : 1.00 Min. :11.10 Min. : 39.80 Min. : 0.080
## 1st Qu.:12.00 1st Qu.:21.70 1st Qu.: 78.50 1st Qu.: 0.700
## Median :33.00 Median :25.80 Median : 91.00 Median : 1.320
## Mean :35.52 Mean :26.43 Mean : 89.91 Mean : 2.854
## 3rd Qu.:58.00 3rd Qu.:29.90 3rd Qu.:101.30 3rd Qu.: 2.500
## Max. :80.00 Max. :92.30 Max. :187.00 Max. :246.860
## TC HDL HbA1c
## Min. : 62 Min. : 5.00 Min. : 2.800
## 1st Qu.:162 1st Qu.: 47.00 1st Qu.: 5.400
## Median :175 Median : 52.00 Median : 5.500
## Mean :178 Mean : 53.35 Mean : 5.648
## 3rd Qu.:189 3rd Qu.: 57.00 3rd Qu.: 5.600
## Max. :545 Max. :226.00 Max. :17.100
# Histograms
library(ggplot2)
ggplot(nhanes_analysis, aes(x=hsCRP)) +
geom_histogram(binwidth=0.5, fill="skyblue", color="black") +
theme_minimal() +
labs(title="Histogram of hsCRP", x="hsCRP", y="Frequency")
summary(nhanes_analysis$hsCRP)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.080 0.700 1.320 2.854 2.500 246.860
Categorical variables (gender, hypertension, kidney, diabetes, smoking, alcohol, physical):
# Frequency tables
table(nhanes_analysis$gender)
##
## Female Male
## 18395 17329
table(nhanes_analysis$hypertension)
##
## 0 1
## 29263 6461
table(nhanes_analysis$diabetes)
##
## 0 1
## 32342 3382
# Bar plot example
ggplot(nhanes_analysis, aes(x=factor(hypertension))) +
geom_bar(fill="lightgreen") +
theme_minimal() +
labs(title="Hypertension Distribution", x="Hypertension", y="Count")
2️⃣ Stratified Descriptive Tables
library(gtsummary)
# By Gender
tbl_gender <- nhanes_analysis %>%
tbl_summary(
by = "gender",
statistic = list(all_continuous() ~ "{mean} ({sd})", all_categorical() ~ "{n} / {N} ({p}%)")
)
tbl_gender
| Characteristic | Female N = 18,3951 |
Male N = 17,3291 |
|---|---|---|
| SEQN | 116,122 (18,670) | 115,381 (18,467) |
| hsCRP | 3.17 (6.58) | 2.52 (6.49) |
| hypertension | ||
| 0 | 15,206 / 18,395 (83%) | 14,057 / 17,329 (81%) |
| 1 | 3,189 / 18,395 (17%) | 3,272 / 17,329 (19%) |
| age | 36 (25) | 35 (25) |
| race | ||
| 1 | 2,488 / 18,395 (14%) | 2,264 / 17,329 (13%) |
| 2 | 2,098 / 18,395 (11%) | 1,902 / 17,329 (11%) |
| 3 | 7,144 / 18,395 (39%) | 6,752 / 17,329 (39%) |
| 4 | 3,858 / 18,395 (21%) | 3,630 / 17,329 (21%) |
| 6 | 1,670 / 18,395 (9.1%) | 1,564 / 17,329 (9.0%) |
| 7 | 1,137 / 18,395 (6.2%) | 1,217 / 17,329 (7.0%) |
| PIR | 2.37 (1.52) | 2.45 (1.53) |
| BMI | 27 (8) | 26 (7) |
| waist | 89 (20) | 90 (21) |
| diabetes | ||
| 0 | 16,768 / 18,395 (91%) | 15,574 / 17,329 (90%) |
| 1 | 1,627 / 18,395 (8.8%) | 1,755 / 17,329 (10%) |
| kidney | ||
| 0 | 18,371 / 18,395 (100%) | 17,309 / 17,329 (100%) |
| 1 | 24 / 18,395 (0.1%) | 20 / 17,329 (0.1%) |
| smoking | ||
| 0 | 14,460 / 18,395 (79%) | 11,957 / 17,329 (69%) |
| 1 | 3,935 / 18,395 (21%) | 5,372 / 17,329 (31%) |
| alcohol | ||
| 0 | 2,065 / 18,395 (11%) | 1,035 / 17,329 (6.0%) |
| 1 | 16,330 / 18,395 (89%) | 16,294 / 17,329 (94%) |
| physical | ||
| 0 | 3,670 / 18,395 (20%) | 4,284 / 17,329 (25%) |
| 1 | 14,725 / 18,395 (80%) | 13,045 / 17,329 (75%) |
| TC | 180 (34) | 175 (34) |
| HDL | 56 (13) | 51 (11) |
| HbA1c | 5.63 (0.84) | 5.66 (0.88) |
| hsCRP_log | 0.50 (1.09) | 0.31 (1.02) |
| hsCRP_cap | 2.96 (4.54) | 2.29 (3.77) |
| hsCRP_quartile | ||
| 1 | 4,189 / 18,395 (23%) | 4,742 / 17,329 (27%) |
| 2 | 4,360 / 18,395 (24%) | 4,571 / 17,329 (26%) |
| 3 | 4,590 / 18,395 (25%) | 4,341 / 17,329 (25%) |
| 4 | 5,256 / 18,395 (29%) | 3,675 / 17,329 (21%) |
| 1 Mean (SD); n / N (%) | ||
# By Hypertension
tbl_htn <- nhanes_analysis %>%
tbl_summary(
by = "hypertension",
statistic = list(all_continuous() ~ "{mean} ({sd})", all_categorical() ~ "{n} / {N} ({p}%)")
)
tbl_htn
| Characteristic | 0 N = 29,2631 |
1 N = 6,4611 |
|---|---|---|
| SEQN | 115,819 (18,628) | 115,507 (18,332) |
| hsCRP | 2.46 (5.55) | 4.65 (9.66) |
| age | 30 (23) | 61 (15) |
| gender | ||
| Female | 15,206 / 29,263 (52%) | 3,189 / 6,461 (49%) |
| Male | 14,057 / 29,263 (48%) | 3,272 / 6,461 (51%) |
| race | ||
| 1 | 4,000 / 29,263 (14%) | 752 / 6,461 (12%) |
| 2 | 3,323 / 29,263 (11%) | 677 / 6,461 (10%) |
| 3 | 11,411 / 29,263 (39%) | 2,485 / 6,461 (38%) |
| 4 | 5,814 / 29,263 (20%) | 1,674 / 6,461 (26%) |
| 6 | 2,698 / 29,263 (9.2%) | 536 / 6,461 (8.3%) |
| 7 | 2,017 / 29,263 (6.9%) | 337 / 6,461 (5.2%) |
| PIR | 2.42 (1.54) | 2.39 (1.47) |
| BMI | 25 (7) | 31 (8) |
| waist | 87 (20) | 104 (17) |
| diabetes | ||
| 0 | 29,263 / 29,263 (100%) | 3,079 / 6,461 (48%) |
| 1 | 0 / 29,263 (0%) | 3,382 / 6,461 (52%) |
| kidney | ||
| 0 | 29,258 / 29,263 (100%) | 6,422 / 6,461 (99%) |
| 1 | 5 / 29,263 (<0.1%) | 39 / 6,461 (0.6%) |
| smoking | ||
| 0 | 23,020 / 29,263 (79%) | 3,397 / 6,461 (53%) |
| 1 | 6,243 / 29,263 (21%) | 3,064 / 6,461 (47%) |
| alcohol | ||
| 0 | 2,129 / 29,263 (7.3%) | 971 / 6,461 (15%) |
| 1 | 27,134 / 29,263 (93%) | 5,490 / 6,461 (85%) |
| physical | ||
| 0 | 6,181 / 29,263 (21%) | 1,773 / 6,461 (27%) |
| 1 | 23,082 / 29,263 (79%) | 4,688 / 6,461 (73%) |
| TC | 177 (32) | 184 (42) |
| HDL | 54 (12) | 52 (15) |
| HbA1c | 5.46 (0.39) | 6.49 (1.59) |
| hsCRP_log | 0.30 (1.03) | 0.89 (1.06) |
| hsCRP_cap | 2.31 (3.77) | 4.11 (5.52) |
| hsCRP_quartile | ||
| 1 | 8,054 / 29,263 (28%) | 877 / 6,461 (14%) |
| 2 | 7,581 / 29,263 (26%) | 1,350 / 6,461 (21%) |
| 3 | 7,479 / 29,263 (26%) | 1,452 / 6,461 (22%) |
| 4 | 6,149 / 29,263 (21%) | 2,782 / 6,461 (43%) |
| 1 Mean (SD); n / N (%) | ||
# Quartiles
nhanes_analysis <- nhanes_analysis %>%
mutate(hsCRP_quartile = ntile(hsCRP, 4))
table(nhanes_analysis$hsCRP_quartile)
##
## 1 2 3 4
## 8931 8931 8931 8931
# Boxplot of BMI by hsCRP quartile
ggplot(nhanes_analysis, aes(x=factor(hsCRP_quartile), y=BMI)) +
geom_boxplot(fill="orange") +
theme_minimal() +
labs(title="BMI by hsCRP Quartile", x="hsCRP Quartile", y="BMI")
# Continuous variables correlation
cor_vars <- nhanes_analysis[, c("hsCRP","BMI","waist","TC","HDL","HbA1c")]
cor(cor_vars, use="complete.obs")
## hsCRP BMI waist TC HDL HbA1c
## hsCRP 1.00000000 0.2756943 0.2438136 0.0304140 -0.08883523 0.1506126
## BMI 0.27569425 1.0000000 0.8895485 0.1351613 -0.20166659 0.2269670
## waist 0.24381363 0.8895485 1.0000000 0.1647872 -0.19949949 0.2439794
## TC 0.03041400 0.1351613 0.1647872 1.0000000 0.21603822 0.0807612
## HDL -0.08883523 -0.2016666 -0.1994995 0.2160382 1.00000000 -0.1379476
## HbA1c 0.15061262 0.2269670 0.2439794 0.0807612 -0.13794760 1.0000000
# Scatter plot example
ggplot(nhanes_analysis, aes(x=BMI, y=hsCRP)) +
geom_point(alpha=0.5) +
geom_smooth(method="lm", color="red") +
theme_minimal() +
labs(title="hsCRP vs BMI", x="BMI", y="hsCRP")
# Boxplots for continuous variables
ggplot(nhanes_analysis, aes(y=hsCRP)) +
geom_boxplot(fill="pink") +
theme_minimal() +
labs(title="Boxplot of hsCRP", y="hsCRP")
# Optional: log-transformed hsCRP
ggplot(nhanes_analysis, aes(y=hsCRP_log)) +
geom_boxplot(fill="lightblue") +
theme_minimal() +
labs(title="Boxplot of log-transformed hsCRP", y="log(hsCRP)")
library(naniar)
# Missing data overview
gg_miss_var(nhanes_analysis)
# Count missing for each variable
miss_summary <- nhanes_analysis %>% summarise(across(everything(), ~sum(is.na(.))))
miss_summary
## SEQN hsCRP hypertension age gender race PIR BMI waist diabetes kidney smoking
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## alcohol physical TC HDL HbA1c hsCRP_log hsCRP_cap hsCRP_quartile
## 1 0 0 0 0 0 0 0 0
# hsCRP by Hypertension (Boxplot)
ggplot(nhanes_analysis, aes(x=factor(hypertension), y=hsCRP)) +
geom_boxplot(fill="lightgreen") +
theme_minimal() +
labs(title="hsCRP by Hypertension", x="Hypertension", y="hsCRP")
# Stacked bar chart example
ggplot(nhanes_analysis, aes(x=factor(hypertension), fill=factor(smoking))) +
geom_bar(position="fill") +
theme_minimal() +
labs(title="Smoking by Hypertension", x="Hypertension", y="Proportion")