Data overview & preparation
# Basic overview
str(nhanes_analysis)
## 'data.frame': 35724 obs. of 20 variables:
## $ SEQN : int 83732 83733 83734 83735 83736 83737 83738 83739 83740 83741 ...
## $ hsCRP : num 0.6 1.4 0.6 9 0.5 2.5 0.3 0.08 1.32 1.3 ...
## $ hypertension : int 1 1 1 0 0 0 0 0 0 0 ...
## $ age : int 62 53 78 56 42 72 11 4 1 22 ...
## $ gender : chr "Male" "Male" "Male" "Female" ...
## $ race : int 3 3 3 3 4 1 1 3 2 4 ...
## $ PIR : num 4.39 1.32 1.51 5 1.23 2.82 1.18 4.22 2.06 2.08 ...
## $ BMI : num 27.8 30.8 28.8 42.4 20.3 28.6 18.1 15.7 25.8 28 ...
## $ waist : num 101.1 107.9 116.5 110.1 80.4 ...
## $ diabetes : int 1 0 1 0 0 0 0 0 0 0 ...
## $ kidney : int 0 0 0 0 0 0 0 0 0 0 ...
## $ smoking : int 1 1 1 0 0 0 0 0 0 1 ...
## $ alcohol : int 1 1 1 0 0 0 1 1 1 1 ...
## $ physical : int 1 1 1 1 1 1 1 1 1 1 ...
## $ TC : int 173 265 229 174 204 190 142 175 175 164 ...
## $ HDL : int 46 63 30 61 53 78 43 52 52 48 ...
## $ HbA1c : num 7 5.5 5.8 5.6 5.6 5.9 5.5 5.5 5.5 5.6 ...
## $ hsCRP_log : num -0.357 0.405 -0.357 2.208 -0.511 ...
## $ hsCRP_cap : num 0.6 1.4 0.6 9 0.5 2.5 0.3 0.08 1.32 1.3 ...
## $ hsCRP_quartile: int 1 3 1 4 1 3 1 1 2 2 ...
## SEQN hsCRP hypertension age
## Min. : 83732 Min. : 0.080 Min. :0.0000 Min. : 1.00
## 1st Qu.: 93149 1st Qu.: 0.700 1st Qu.:0.0000 1st Qu.:12.00
## Median :118128 Median : 1.320 Median :0.0000 Median :33.00
## Mean :115763 Mean : 2.854 Mean :0.1809 Mean :35.52
## 3rd Qu.:133038 3rd Qu.: 2.500 3rd Qu.:0.0000 3rd Qu.:58.00
## Max. :142310 Max. :246.860 Max. :1.0000 Max. :80.00
## gender race PIR BMI
## Length:35724 Min. :1.000 Min. :0.000 Min. :11.10
## Class :character 1st Qu.:3.000 1st Qu.:1.200 1st Qu.:21.70
## Mode :character Median :3.000 Median :2.060 Median :25.80
## Mean :3.367 Mean :2.412 Mean :26.43
## 3rd Qu.:4.000 3rd Qu.:3.550 3rd Qu.:29.90
## Max. :7.000 Max. :5.000 Max. :92.30
## waist diabetes kidney smoking
## Min. : 39.80 Min. :0.00000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 78.50 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 91.00 Median :0.00000 Median :0.000000 Median :0.0000
## Mean : 89.91 Mean :0.09467 Mean :0.001232 Mean :0.2605
## 3rd Qu.:101.30 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:1.0000
## Max. :187.00 Max. :1.00000 Max. :1.000000 Max. :1.0000
## alcohol physical TC HDL
## Min. :0.0000 Min. :0.0000 Min. : 62 Min. : 5.00
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:162 1st Qu.: 47.00
## Median :1.0000 Median :1.0000 Median :175 Median : 52.00
## Mean :0.9132 Mean :0.7773 Mean :178 Mean : 53.35
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:189 3rd Qu.: 57.00
## Max. :1.0000 Max. :1.0000 Max. :545 Max. :226.00
## HbA1c hsCRP_log hsCRP_cap hsCRP_quartile
## Min. : 2.800 Min. :-1.7148 Min. : 0.080 Min. :1.00
## 1st Qu.: 5.400 1st Qu.:-0.2231 1st Qu.: 0.700 1st Qu.:1.75
## Median : 5.500 Median : 0.3507 Median : 1.320 Median :2.50
## Mean : 5.648 Mean : 0.4052 Mean : 2.633 Mean :2.50
## 3rd Qu.: 5.600 3rd Qu.: 0.9555 3rd Qu.: 2.500 3rd Qu.:3.25
## Max. :17.100 Max. : 5.5092 Max. :29.238 Max. :4.00
# Missing value check
sapply(nhanes_analysis, function(x) sum(is.na(x)))
## SEQN hsCRP hypertension age gender
## 0 0 0 0 0
## race PIR BMI waist diabetes
## 0 0 0 0 0
## kidney smoking alcohol physical TC
## 0 0 0 0 0
## HDL HbA1c hsCRP_log hsCRP_cap hsCRP_quartile
## 0 0 0 0 0
# Outlier visualization for hsCRP
boxplot(nhanes_analysis$hsCRP, main = "Boxplot of hsCRP")

# Ensure variable types are correct
nhanes_analysis <- nhanes_analysis %>%
mutate(
hypertension = factor(hypertension, levels = c(0,1), labels = c("No","Yes")),
gender = factor(gender, levels = c(1,2), labels = c("Male","Female")),
race = as.factor(race)
)
Normality check
# Shapiro-Wilk test for hsCRP_log
#shapiro.test(nhanes_analysis$hsCRP_log)
# Q–Q plot
qqnorm(nhanes_analysis$hsCRP_log)
qqline(nhanes_analysis$hsCRP_log, col = "red")

# Q–Q plot
qqnorm(nhanes_analysis$hsCRP_log)
qqline(nhanes_analysis$hsCRP_log, col = "red")

# Histogram (visual check)
hist(nhanes_analysis$hsCRP_log, breaks = 40, col = "skyblue", main = "Distribution of hsCRP_log")

# Skewness and kurtosis test (from 'moments' package)
library(moments)
skewness(nhanes_analysis$hsCRP_log, na.rm = TRUE)
## [1] 0.4079302
kurtosis(nhanes_analysis$hsCRP_log, na.rm = TRUE)
## [1] 0.6616889
ks.test(nhanes_analysis$hsCRP_log, "pnorm", mean=mean(nhanes_analysis$hsCRP_log, na.rm=TRUE),
sd=sd(nhanes_analysis$hsCRP_log, na.rm=TRUE))
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: nhanes_analysis$hsCRP_log
## D = 0.15486, p-value < 2.2e-16
## alternative hypothesis: two-sided
Group comparisons (optional)
# Continuous variable comparison (t-test)
t.test(hsCRP_log ~ hypertension, data = nhanes_analysis)
##
## Welch Two Sample t-test
##
## data: hsCRP_log by hypertension
## t = -40.437, df = 9353.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -0.6143723 -0.5575617
## sample estimates:
## mean in group No mean in group Yes
## 0.2992454 0.8852124
# Categorical variable comparison (chi-square)
#chisq.test(table(nhanes_analysis$gender, nhanes_analysis$hypertension))
#chisq.test(table(nhanes_analysis$kidney_function, nhanes_analysis$hypertension))
table(nhanes_analysis$gender, nhanes_analysis$hypertension)
##
## No Yes
## Male 0 0
## Female 0 0
table(nhanes_analysis$hypertension, useNA = "ifany")
##
## No Yes
## 29263 6461
table(nhanes_analysis$gender, useNA = "ifany")
##
## Male Female <NA>
## 0 0 35724
str(nhanes_analysis$gender)
## Factor w/ 2 levels "Male","Female": NA NA NA NA NA NA NA NA NA NA ...
nhanes_analysis$gender <- nhanes_final_clean$gender
table(nhanes_analysis$gender, useNA = "ifany")
##
## Female Male
## 18395 17329
nhanes_analysis <- nhanes_analysis %>%
mutate(
gender = case_when(
gender == "Male" ~ 1,
gender == "Female" ~ 2,
TRUE ~ NA_real_
)
)
table(nhanes_analysis$gender, useNA = "ifany")
##
## 1 2
## 17329 18395
table(nhanes_analysis$gender, nhanes_analysis$hypertension)
##
## No Yes
## 1 14057 3272
## 2 15206 3189
chisq.test(table(nhanes_analysis$gender, nhanes_analysis$hypertension))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(nhanes_analysis$gender, nhanes_analysis$hypertension)
## X-squared = 14.281, df = 1, p-value = 0.0001575
# Gender vs Hypertension table with counts and percentages
library(dplyr)
library(tidyr)
nhanes_analysis %>%
group_by(gender, hypertension) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(gender) %>%
mutate(percent = round(100 * n / sum(n), 2)) %>%
arrange(gender, hypertension)
## # A tibble: 4 × 4
## # Groups: gender [2]
## gender hypertension n percent
## <dbl> <fct> <int> <dbl>
## 1 1 No 14057 81.1
## 2 1 Yes 3272 18.9
## 3 2 No 15206 82.7
## 4 2 Yes 3189 17.3
length(nhanes_analysis$kidney_function)
## [1] 0
length(nhanes_analysis$hypertension)
## [1] 35724
nhanes_analysis$kidney <- nhanes_final_clean$kidney
# Kidney function vs hypertension
table(nhanes_analysis$kidney, nhanes_analysis$hypertension, useNA = "ifany")
##
## No Yes
## 0 29258 6422
## 1 5 39
chisq.test(table(nhanes_analysis$kidney, nhanes_analysis$hypertension))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(nhanes_analysis$kidney, nhanes_analysis$hypertension)
## X-squared = 143.28, df = 1, p-value < 2.2e-16
# ✅ Categorical variables এর তালিকা
cat_vars <- c("gender", "race", "diabetes", "kidney", "smoking", "alcohol", "physical","hsCRP_quartile")
# ✅ Chi-square test results সংরক্ষণের জন্য list বানানো
chi_results <- list()
# ✅ Loop করে প্রতিটা categorical variable এর জন্য chi-square test করা
for (var in cat_vars) {
tbl <- table(nhanes_analysis[[var]], nhanes_analysis$hypertension, useNA = "ifany")
chi <- chisq.test(tbl)
chi_results[[var]] <- list(
variable = var,
table = tbl,
chi_sq = chi$statistic,
df = chi$parameter,
p_value = chi$p.value
)
}
# ✅ Chi-square test এর summary দেখতে
chi_results_summary <- lapply(chi_results, function(x) {
data.frame(
Variable = x$variable,
Chi_sq = x$chi_sq,
df = x$df,
p_value = x$p_value
)
})
# ✅ Summary print করা
chi_results_summary <- do.call(rbind, chi_results_summary)
print(chi_results_summary)
## Variable Chi_sq df p_value
## gender gender 14.28064 1 1.574765e-04
## race race 140.14515 5 1.666542e-28
## diabetes diabetes 16913.33057 1 0.000000e+00
## kidney kidney 143.27985 1 5.105558e-33
## smoking smoking 1868.45276 1 0.000000e+00
## alcohol alcohol 400.48528 1 4.318117e-89
## physical physical 121.74766 1 2.621468e-28
## hsCRP_quartile hsCRP_quartile 1514.09552 3 0.000000e+00
cont_vars <- c("age", "PIR", "BMI", "waist", "TC", "HDL", "HbA1c", "hsCRP_cap", "hsCRP_log")