merged_data <- read.csv("merged2_data.csv")
head(merged_data)
##   Sequence.no Blood.Pressure HsCRP Age Gender               Race Poverty.Ratio
## 1      130378            Yes  1.78  43   Male Non-Hispanic Asian          5.00
## 2      130379            Yes  2.03  66   Male Non-Hispanic White          5.00
## 3      130380             No  5.62  44 Female     Other Hispanic          1.41
## 4      130386             No  1.05  34   Male   Mexican American          1.33
## 5      130387            Yes  3.96  68 Female Non-Hispanic White          1.32
## 6      130390             No 11.20  31 Female Non-Hispanic White          2.16
##    BMI Waist Diabetes Kidney.Disease Smoking Cholesterol HDLC HbA1c
## 1 27.0  98.3       No             No     Yes        6.83   45   5.6
## 2 33.5 114.7       No             No     Yes        5.53   60   5.6
## 3 29.7  93.5      Yes             No      No        4.84   49   6.2
## 4 30.2 106.1       No             No     Yes        4.73   46   5.1
## 5 42.6 122.0       No             No      No        5.25   42   5.9
## 6 46.0 131.0       No             No      No        4.11   39   5.9
library(dplyr)
library(gtsummary)
library(gt)
library(tableone)
library(kableExtra)

# --- prepare data & variable lists -----------------------------------------
# Use the dataset you already loaded: merged_data
# Make sure exposure and categorical vars are factors
merged_data2 <- merged_data %>%
  mutate(
    Blood.Pressure = factor(Blood.Pressure, levels = c("No","Yes")),
    Gender = factor(Gender),
    Race = factor(Race),
    Diabetes = factor(Diabetes),
    Kidney.Disease = factor(Kidney.Disease),
    Smoking = factor(Smoking)
  )

# Define variables
exposure <- "Blood.Pressure"
outcome <- "HsCRP"   # just kept for reference (we will include in the table)
covariates <- c("Age", "Gender", "Race", "Poverty.Ratio", "BMI", "Waist",
                "Diabetes", "Kidney.Disease", "Smoking", "Cholesterol", "HDLC", "HbA1c")

# A) Pretty summary table with p-values and overall column (gtsummary -> gt)
tbl <- merged_data2 %>%
  dplyr::select(all_of(c(exposure, outcome, covariates))) %>%
  tbl_summary(
    by = !!rlang::sym(exposure),                     # stratify by Blood.Pressure
    include = all_of(c(outcome, covariates)),        # include HsCRP + covariates
    missing = "no",
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",            # mean (sd) for continuous
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 2
  ) %>%
  add_overall() %>%                                  # include Overall column
  add_n() %>%                                        # show N in headers
  add_p(test = list(
    all_continuous() ~ "t.test",                     # default tests (can be adjusted)
    all_categorical() ~ "chisq.test"
  )) %>%
  modify_header(label = "**Characteristic**") %>%
  bold_labels() %>%
  modify_caption("**Table 1. Baseline / Demographic Characteristics **")
## 31 missing rows in the "Blood.Pressure" column have been removed.
## The following warnings were returned during `modify_caption()`:
## ! For variable `Diabetes` (`Blood.Pressure`) and "statistic", "p.value", and
##   "parameter" statistics: Chi-squared approximation may be incorrect
## ! For variable `Smoking` (`Blood.Pressure`) and "statistic", "p.value", and
##   "parameter" statistics: Chi-squared approximation may be incorrect
# Render nice GT table in R Markdown
tbl_gt <- as_gt(tbl)
tbl_gt
**Table 1. Baseline / Demographic Characteristics **
Characteristic N Overall
N = 25,766
1
No
N = 16,791
1
Yes
N = 8,975
1
p-value2
HsCRP 25,766 4.00 (8.11) 3.36 (6.39) 5.19 (10.50) <0.001
Age 25,766 48.44 (19.32) 42.07 (18.46) 60.36 (14.68) <0.001
Gender 25,766


0.029
    Female
13,495 (52%) 8,878 (53%) 4,617 (51%)
    Male
12,271 (48%) 7,913 (47%) 4,358 (49%)
Race 25,766


<0.001
    Mexican American
3,367 (13%) 2,508 (15%) 859 (9.6%)
    Non-Hispanic Asian
2,781 (11%) 2,034 (12%) 747 (8.3%)
    Non-Hispanic Black
5,250 (20%) 2,925 (17%) 2,325 (26%)
    Non-Hispanic White
10,232 (40%) 6,541 (39%) 3,691 (41%)
    Other Hispanic
2,782 (11%) 1,885 (11%) 897 (10.0%)
    Other Race - Including Multi-Racial
1,354 (5.3%) 898 (5.3%) 456 (5.1%)
Poverty.Ratio 22,556 2.60 (1.63) 2.63 (1.65) 2.55 (1.61) 0.002
BMI 25,397 29.52 (7.39) 28.35 (7.01) 31.72 (7.59) <0.001
Waist 24,522 99.54 (17.35) 95.82 (16.67) 106.67 (16.37) <0.001
Diabetes 25,766


<0.001
    Borderline
732 (2.8%) 322 (1.9%) 410 (4.6%)
    Don't know
14 (<0.1%) 11 (<0.1%) 3 (<0.1%)
    No
21,450 (83%) 15,382 (92%) 6,068 (68%)
    Yes
3,570 (14%) 1,076 (6.4%) 2,494 (28%)
Kidney.Disease 23,469


<0.001
    Don't know
39 (0.2%) 16 (0.1%) 23 (0.3%)
    No
22,489 (96%) 14,339 (98%) 8,150 (92%)
    Yes
941 (4.0%) 224 (1.5%) 717 (8.1%)
Smoking 24,541


<0.001
    Don't know
14 (<0.1%) 7 (<0.1%) 7 (<0.1%)
    No
14,640 (60%) 10,019 (64%) 4,621 (52%)
    Refused
6 (<0.1%) 2 (<0.1%) 4 (<0.1%)
    Yes
9,881 (40%) 5,579 (36%) 4,302 (48%)
Cholesterol 25,651 4.79 (1.08) 4.79 (1.06) 4.77 (1.11) 0.093
HDLC 25,651 53.69 (15.74) 54.16 (15.46) 52.83 (16.23) <0.001
HbA1c 25,743 5.79 (1.10) 5.58 (0.93) 6.17 (1.27) <0.001
1 Mean (SD); n (%)
2 Welch Two Sample t-test; Pearson’s Chi-squared test
# B) TableOne with Standardized Mean Differences (SMD) for quick balance check
catVars <- c("Gender", "Race", "Diabetes", "Kidney.Disease", "Smoking")

# Create TableOne
tab1 <- CreateTableOne(
  vars = c(outcome, covariates),
  strata = exposure,
  data = merged_data2,
  factorVars = catVars,
  includeNA = FALSE
)

# Print with SMD (as a data frame for nicer rendering)
tab1_print <- print(tab1, showAllLevels = TRUE, smd = TRUE)
##                            Stratified by Blood.Pressure
##                             level                               No           
##   n                                                             16791        
##   HsCRP (mean (SD))                                              3.36 (6.39) 
##   Age (mean (SD))                                               42.07 (18.46)
##   Gender (%)                Female                               8878 (52.9) 
##                             Male                                 7913 (47.1) 
##   Race (%)                  Mexican American                     2508 (14.9) 
##                             Non-Hispanic Asian                   2034 (12.1) 
##                             Non-Hispanic Black                   2925 (17.4) 
##                             Non-Hispanic White                   6541 (39.0) 
##                             Other Hispanic                       1885 (11.2) 
##                             Other Race - Including Multi-Racial   898 ( 5.3) 
##   Poverty.Ratio (mean (SD))                                      2.63 (1.65) 
##   BMI (mean (SD))                                               28.35 (7.01) 
##   Waist (mean (SD))                                             95.82 (16.67)
##   Diabetes (%)              Borderline                            322 ( 1.9) 
##                             Don't know                             11 ( 0.1) 
##                             No                                  15382 (91.6) 
##                             Yes                                  1076 ( 6.4) 
##   Kidney.Disease (%)        Don't know                             16 ( 0.1) 
##                             No                                  14339 (98.4) 
##                             Yes                                   224 ( 1.5) 
##   Smoking (%)               Don't know                              7 ( 0.0) 
##                             No                                  10019 (64.2) 
##                             Refused                                 2 ( 0.0) 
##                             Yes                                  5579 (35.7) 
##   Cholesterol (mean (SD))                                        4.79 (1.06) 
##   HDLC (mean (SD))                                              54.16 (15.46)
##   HbA1c (mean (SD))                                              5.58 (0.93) 
##                            Stratified by Blood.Pressure
##                             Yes            p      test SMD   
##   n                           8975                           
##   HsCRP (mean (SD))           5.19 (10.50) <0.001       0.211
##   Age (mean (SD))            60.36 (14.68) <0.001       1.097
##   Gender (%)                  4617 (51.4)   0.029       0.029
##                               4358 (48.6)                    
##   Race (%)                     859 ( 9.6)  <0.001       0.274
##                                747 ( 8.3)                    
##                               2325 (25.9)                    
##                               3691 (41.1)                    
##                                897 (10.0)                    
##                                456 ( 5.1)                    
##   Poverty.Ratio (mean (SD))   2.55 (1.61)   0.002       0.044
##   BMI (mean (SD))            31.72 (7.59)  <0.001       0.461
##   Waist (mean (SD))         106.67 (16.37) <0.001       0.657
##   Diabetes (%)                 410 ( 4.6)  <0.001       0.630
##                                  3 ( 0.0)                    
##                               6068 (67.6)                    
##                               2494 (27.8)                    
##   Kidney.Disease (%)            23 ( 0.3)  <0.001       0.311
##                               8150 (91.7)                    
##                                717 ( 8.1)                    
##   Smoking (%)                    7 ( 0.1)  <0.001       0.255
##                               4621 (51.7)                    
##                                  4 ( 0.0)                    
##                               4302 (48.2)                    
##   Cholesterol (mean (SD))     4.77 (1.11)   0.089       0.022
##   HDLC (mean (SD))           52.83 (16.23) <0.001       0.084
##   HbA1c (mean (SD))           6.17 (1.27)  <0.001       0.525
# Convert to data.frame for kable
tab1_df <- as.data.frame.matrix(tab1_print)
# Add rownames as first column
tab1_df <- tibble::rownames_to_column(tab1_df, var = "Variable")

# Show the SMD table (selecting the SMD column usually named "Std. Mean Diff" or "SMD")
# Depending on tableone version the column name might differ; try to find SMD column
smd_col <- grep("Std.*Mean|SMD|std", names(tab1_df), ignore.case = TRUE, value = TRUE)[1]
if (is.na(smd_col)) smd_col <- NULL

# Print via kable
tab1_df %>%
  kable(caption = "Table 2. TableOne output including standardized mean differences (SMD)") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover","condensed"))
Table 2. TableOne output including standardized mean differences (SMD)
Variable level No Yes p test SMD
n 16791 8975
HsCRP..mean..SD.. 3.36 (6.39) 5.19 (10.50) <0.001 0.211
Age..mean..SD.. 42.07 (18.46) 60.36 (14.68) <0.001 1.097
Gender…. Female 8878 (52.9) 4617 (51.4) 0.029 0.029
X Male 7913 (47.1) 4358 (48.6)
Race…. Mexican American 2508 (14.9) 859 ( 9.6) <0.001 0.274
X.1 Non-Hispanic Asian 2034 (12.1) 747 ( 8.3)
X.2 Non-Hispanic Black 2925 (17.4) 2325 (25.9)
X.3 Non-Hispanic White 6541 (39.0) 3691 (41.1)
X.4 Other Hispanic 1885 (11.2) 897 (10.0)
X.5 Other Race - Including Multi-Racial 898 ( 5.3) 456 ( 5.1)
Poverty.Ratio..mean..SD.. 2.63 (1.65) 2.55 (1.61) 0.002 0.044
BMI..mean..SD.. 28.35 (7.01) 31.72 (7.59) <0.001 0.461
Waist..mean..SD.. 95.82 (16.67) 106.67 (16.37) <0.001 0.657
Diabetes…. Borderline 322 ( 1.9) 410 ( 4.6) <0.001 0.630
X.6 Don’t know 11 ( 0.1) 3 ( 0.0)
X.7 No 15382 (91.6) 6068 (67.6)
X.8 Yes 1076 ( 6.4) 2494 (27.8)
Kidney.Disease…. Don’t know 16 ( 0.1) 23 ( 0.3) <0.001 0.311
X.9 No 14339 (98.4) 8150 (91.7)
X.10 Yes 224 ( 1.5) 717 ( 8.1)
Smoking…. Don’t know 7 ( 0.0) 7 ( 0.1) <0.001 0.255
X.11 No 10019 (64.2) 4621 (51.7)
X.12 Refused 2 ( 0.0) 4 ( 0.0)
X.13 Yes 5579 (35.7) 4302 (48.2)
Cholesterol..mean..SD.. 4.79 (1.06) 4.77 (1.11) 0.089 0.022
HDLC..mean..SD.. 54.16 (15.46) 52.83 (16.23) <0.001 0.084
HbA1c..mean..SD.. 5.58 (0.93) 6.17 (1.27) <0.001 0.525
# Optionally show only key columns (Variable, No, Yes, SMD) if SMD found:
if (!is.null(smd_col)) {
  tab1_df %>%
    dplyr::select(Variable, starts_with("No"), starts_with("Yes"), all_of(smd_col)) %>%
    kable(caption = "Table 2. Key columns with SMD (No vs Yes)") %>%
    kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover","condensed"))
}
Table 2. Key columns with SMD (No vs Yes)
Variable No Yes SMD
n 16791 8975
HsCRP..mean..SD.. 3.36 (6.39) 5.19 (10.50) 0.211
Age..mean..SD.. 42.07 (18.46) 60.36 (14.68) 1.097
Gender…. 8878 (52.9) 4617 (51.4) 0.029
X 7913 (47.1) 4358 (48.6)
Race…. 2508 (14.9) 859 ( 9.6) 0.274
X.1 2034 (12.1) 747 ( 8.3)
X.2 2925 (17.4) 2325 (25.9)
X.3 6541 (39.0) 3691 (41.1)
X.4 1885 (11.2) 897 (10.0)
X.5 898 ( 5.3) 456 ( 5.1)
Poverty.Ratio..mean..SD.. 2.63 (1.65) 2.55 (1.61) 0.044
BMI..mean..SD.. 28.35 (7.01) 31.72 (7.59) 0.461
Waist..mean..SD.. 95.82 (16.67) 106.67 (16.37) 0.657
Diabetes…. 322 ( 1.9) 410 ( 4.6) 0.630
X.6 11 ( 0.1) 3 ( 0.0)
X.7 15382 (91.6) 6068 (67.6)
X.8 1076 ( 6.4) 2494 (27.8)
Kidney.Disease…. 16 ( 0.1) 23 ( 0.3) 0.311
X.9 14339 (98.4) 8150 (91.7)
X.10 224 ( 1.5) 717 ( 8.1)
Smoking…. 7 ( 0.0) 7 ( 0.1) 0.255
X.11 10019 (64.2) 4621 (51.7)
X.12 2 ( 0.0) 4 ( 0.0)
X.13 5579 (35.7) 4302 (48.2)
Cholesterol..mean..SD.. 4.79 (1.06) 4.77 (1.11) 0.022
HDLC..mean..SD.. 54.16 (15.46) 52.83 (16.23) 0.084
HbA1c..mean..SD.. 5.58 (0.93) 6.17 (1.27) 0.525