Load in packages and dataset

# Packages

library(tidyverse)
library(dplyr)
library(purrr)
library(readr)
library(ggplot2)
library(ggforce)
library(broom)
library(forcats)
library(corrplot)
library(lubridate)
library(reshape2)

# Load in the Data
SSI_Raw <- read_csv("SSI_Raw.csv", 
    col_types = cols(id = col_character(), Age = col_number(), 
        SmokHx = col_character(), Dx = col_character(), 
        Surgeon = col_character(), Location = col_character(), 
        Defect_Size = col_number(), Closure = col_character(), 
        Primary_Closure = col_number(), `Flaps/Grafts _cm2` = col_number()))

Data Cleaning

# Convert column names to lowercase
colnames(SSI_Raw) <- tolower(colnames(SSI_Raw))

# Convert string values to lowercase
SSI_Clean <- SSI_Raw %>%
  mutate(across(where(is.character), tolower))

# Trim the trailing white spaces
SSI_Clean <- SSI_Clean %>%
  mutate(across(where(is.character), str_trim))

#Converting the variables
SSI_Clean <- SSI_Clean %>%
  mutate(
    sex = dplyr::recode(sex,
                  `0` = "male",
                  `1` = "female",
             .default = "NA"),
    smokhx = recode(smokhx,
                  `0` = "never",
                  `1` = "current",
                  `2` = "former",
             .default = "NA"),
    dm_hx = recode(dm_hx,
                  `0` = "no",
                  `1` = "yes",
                  `2` = "yes", #T1DM
             .default = "NA"),
    immuno_hx = recode(immuno_hx,
                  `0` = "no",
                  `1` = "yes",
                  `2` = "yes",
             .default = "NA"),
    ppx_abx = recode(ppx_abx,
                  `0` = "no",
                  `1` = "yes",
                  `2` = "yes",
             .default = "NA"),
    site = recode(site,
                  #Oddities
                  "hands" = "hand",
                  "feet"  = "foot",
                  "ears"  = "ear",
                  "lips"  = "lip",
                  "2"     = "face"),
    dx = recode(dx,
                  `1` = "SCC",
                  `2` = "BCC",
                  `3` = "Melanoma",
                  `4` = "Other_Malignancies",
                  `5` = "Lipomas",
                  `6` = "Cysts",
                  `7` = "Nevi",
                  `8` = "Other_Non-Malignancies",
             .default = "NA"),
    surgery = recode(surgery,
                  `0` = "Mohs",
                  `1` = "Excision",
             .default = "NA"),
    surgeon = recode(surgeon,
                  `1` = "Mohs_Surgeon",
                  `2` = "Non-Mohs_Derm",
                  `3` = "Plastics",
                  `4` = "ENT",
                  `5` = "Family_Med",
                  `6` = "Internal_Med",
                  `7` = "Surgical_Onc",
                  `8` = "OBGYN",
                  `9` = "Combined_Mohs +",
                 `10` = "Other",
             .default = "NA"),
    location = recode(location,
                  `0` = "inpatient",
                  `1` = "outpatient",
                  `2` = "mohs_out_+_OR",
              .default = "NA"),
    closure = recode(closure,
                  `1` = "Primary",
                  `2` = "Secondary",
                  `3` = "Flap",
                  `4` = "Flap",
                  `5` = "Flap",
                  `6` = "Flap",
                  `7` = "Graft",
                  `8` = "Other",
             .default = "NA"),
    ssi = recode(ssi,
                  `0` = "no",
                  `1` = "yes",
             .default = "NA"))

# Removed Other closure types
SSI_Clean <- SSI_Clean |> 
  filter(!closure == "Other")

SSI_Clean <- SSI_Clean |> 
  select(-c(primary_closure, `flaps/grafts _cm2`))

SSI_Clean_Unbinned <- SSI_Clean

# Binning Continuous Data
SSI_Clean <- SSI_Clean %>%
  mutate(
  age = case_when(
    age >= 19 & age <= 59 ~ "<60",
    age >= 60 & age <= 69 ~ "60-69",
    age >= 70 & age <= 79 ~ "70-79",
    age >= 80 ~ "80+",
           TRUE ~ NA_character_),
  defect_size = case_when(
    defect_size <= 1 ~ "<1.00_cm",
    defect_size >= 1 & defect_size < 1.50 ~ "1.00-1.49_cm",
    defect_size >= 1.50 & defect_size < 2.0 ~ "1.50-1.99_cm",
    defect_size >= 2 & defect_size < 2.5 ~ "2.00-2.49_cm",
    defect_size >= 2.5 ~ "≥2.50_cm",
          TRUE ~ NA_character_
  ))

Plot the Distribution per Closure

str(SSI_Clean_Unbinned)
## tibble [2,435 × 15] (S3: tbl_df/tbl/data.frame)
##  $ id         : num [1:2435] 2 4 5 7 8 9 10 11 12 13 ...
##  $ age        : num [1:2435] 81 67 67 64 71 61 60 72 69 70 ...
##  $ sex        : chr [1:2435] "female" "male" "male" "female" ...
##  $ smokhx     : chr [1:2435] "former" "former" "former" "never" ...
##  $ dm_hx      : chr [1:2435] "yes" "no" "no" "no" ...
##  $ immuno_hx  : chr [1:2435] "yes" "yes" "yes" "no" ...
##  $ ppx_abx    : chr [1:2435] "no" "no" "yes" "no" ...
##  $ site       : chr [1:2435] "face" "face" "groin" "ue" ...
##  $ dx         : chr [1:2435] "BCC" "SCC" "SCC" "Cysts" ...
##  $ surgery    : chr [1:2435] "Mohs" "Mohs" "Mohs" "Excision" ...
##  $ surgeon    : chr [1:2435] "Mohs_Surgeon" "Mohs_Surgeon" "Mohs_Surgeon" "Plastics" ...
##  $ location   : chr [1:2435] "outpatient" "outpatient" "outpatient" "inpatient" ...
##  $ defect_size: num [1:2435] 1.5 2 4 1 1 2 3 1.5 2.5 0.5 ...
##  $ closure    : chr [1:2435] "Primary" "Primary" "Graft" "Primary" ...
##  $ ssi        : chr [1:2435] "no" "no" "no" "no" ...
unique(SSI_Clean_Unbinned$defect_size)
##  [1]  1.5  2.0  4.0  1.0  3.0  2.5  0.5  1.1  2.7  2.3  0.8  0.4  0.7  1.2  1.8
## [16]  5.5  0.6  1.6  2.1  2.6  2.2  1.9  2.8  3.5  6.0  3.8  1.4  1.7  1.3  4.5
## [31]  0.9  2.4  3.2  8.0  0.2  3.1  4.1  3.7  5.0  3.3  2.9  5.1  3.6  4.6  6.5
## [46]  7.5  3.4  9.0  6.8  4.8 11.0 12.0 10.0  5.6  4.2  5.3  7.0  6.1  6.7 10.5
## [61]  7.2 18.0  4.7  0.0  0.3  4.3 13.7  5.2  4.9  3.9  5.4 36.0 38.0  5.7  6.2
unique(SSI_Clean_Unbinned$closure)
## [1] "Primary"   "Graft"     "Secondary" "Flap"
SSI_Clean_Unbinned <- SSI_Clean_Unbinned |> 
  filter(!defect_size >8)



SSI_Clean_Unbinned$closure <- factor(SSI_Clean_Unbinned$closure, 
                                     levels = c("Primary", 
                                                "Secondary", 
                                                "Flap", 
                                                "Graft"))


ggplot(SSI_Clean_Unbinned, aes(x = defect_size, fill = closure)) +
  geom_histogram(position = "identity", alpha = 1, binwidth = .3) +
  theme_minimal() +
  xlim(0, 8) +
  scale_fill_viridis_d(option = "plasma", direction = 1) +
  labs(title = "Distribution of Defect Size by Closure Technique",
       x = "Defect Size (cm)",
       y = "Number of Cases",
       fill = "Closure")
## Warning: Removed 8 rows containing missing values (`geom_bar()`).

Data Analysis

Frequency Table

Demographic Frequency

library(dplyr)
library(tidyr)

# Define the columns to be evaluated
columns_to_evaluate <- c("age", "sex", "smokhx", "dm_hx", "immuno_hx", "ppx_abx", "site", "dx", "surgeon", "location", "defect_size", "ssi")

# Reshape the data to a long format
long_df <- SSI_Clean %>%
  select(all_of(columns_to_evaluate), closure) %>%
  pivot_longer(cols = -closure, names_to = "variable", values_to = "level")

# Calculate frequencies for each closure level
freq_table_closure <- long_df %>%
  group_by(variable, level, closure) %>%
  summarise(n = n()) %>%
  pivot_wider(names_from = closure, values_from = n, names_prefix = "n_", names_sep = "")

# Calculate closure totals for each level
closure_totals <- long_df %>%
  group_by(variable, level, closure) %>%
  summarise(Total_per_closure = n())

# Calculate percentages within each closure group
percent_table_closure <- long_df %>%
  group_by(variable, closure) %>%
  summarise(Total_per_closure_group = n()) %>%
  left_join(closure_totals, by = c("variable", "closure")) %>%
  mutate(percent = Total_per_closure / Total_per_closure_group * 100) %>%
  pivot_wider(names_from = closure, values_from = percent, names_prefix = "percent_", names_sep = "") %>%
  select(-Total_per_closure, -Total_per_closure_group)

# Join the count and percentage tables
closure_combined <- left_join(freq_table_closure, percent_table_closure, by = c("variable", "level"))

# Calculate total frequencies and percentages for each variable level
freq_table_total <- long_df %>%
  group_by(variable, level) %>%
  summarise(Total_n = n(),
            Total_Percent = Total_n / sum(Total_n) * 100)

# Join the tables to form the final frequency table
final_freq_table <- left_join(freq_table_total, closure_combined, by = c("variable", "level"))

# View the final frequency table
print(final_freq_table)
## # A tibble: 186 × 12
## # Groups:   variable [12]
##    variable level Total_n Total_Percent n_Flap n_Graft n_Primary n_Secondary
##    <chr>    <chr>   <int>         <dbl>  <int>   <int>     <int>       <int>
##  1 age      60-69     741           100     75      31       490         145
##  2 age      60-69     741           100     75      31       490         145
##  3 age      60-69     741           100     75      31       490         145
##  4 age      60-69     741           100     75      31       490         145
##  5 age      70-79     708           100     96      28       414         170
##  6 age      70-79     708           100     96      28       414         170
##  7 age      70-79     708           100     96      28       414         170
##  8 age      70-79     708           100     96      28       414         170
##  9 age      80+       450           100     70      24       235         121
## 10 age      80+       450           100     70      24       235         121
## # ℹ 176 more rows
## # ℹ 4 more variables: percent_Flap <dbl>, percent_Graft <dbl>,
## #   percent_Primary <dbl>, percent_Secondary <dbl>

SSI Frequency

# Define the columns to be evaluated
columns_to_evaluate <- c("age", "sex", "smokhx", "dm_hx", "immuno_hx", "ppx_abx", "site", "dx", "surgeon", "location", "surgery", "defect_size", "closure")

# Reshape the data to a long format
long_df <- SSI_Clean %>%
  select(all_of(columns_to_evaluate), ssi) %>%
  pivot_longer(cols = -ssi, names_to = "variable", values_to = "level")

# Calculate frequencies for each closure level
freq_table_ssi <- long_df %>%
  group_by(variable, level, ssi) %>%
  count(name = "n") %>%
  ungroup()

# Make the new table of yes/no
freq_table_ssi_yes <- freq_table_ssi |> 
  filter(ssi == "yes")
freq_table_ssi_no <- freq_table_ssi |> 
  filter(ssi == "no")
freq_table_ssi <- left_join(freq_table_ssi_yes, freq_table_ssi_no, by = c("variable", "level")) |> 
  select(-c(ssi.x, ssi.y))

Chi-Square Analysis

# Chi-Square Data
SSI_Chi_Square <- SSI_Clean

# Make all values factors
SSI_Chi_Square[] <- lapply(SSI_Chi_Square, as.factor)

# Chi-Square Analysis
# Create an empty data frame to store results
chi_square_results <- data.frame(
  Variable = character(),
  Chi_Square = numeric(),
  Degrees_of_Freedom = numeric(),
  P_Value = numeric(),
  stringsAsFactors = FALSE
)

# Loop through each column in the data frame
for (col_name in colnames(SSI_Chi_Square)) {
  
  cat("Processing:", col_name, "\n")
  
  # Skip the 'ssi' and 'id' columns
  if (col_name != 'ssi' && col_name != 'id') {
    
    # Convert columns to factors for chi-square test
    temp_col <- as.factor(SSI_Chi_Square[[col_name]])
    temp_ssi <- as.factor(SSI_Chi_Square$ssi)
    
    # Perform the chi-square test
    test_result <- chisq.test(temp_col, temp_ssi)
    print(test_result)  
    
    # Store the results in the data frame
    chi_square_results <- rbind(
      chi_square_results,
      data.frame(
        Variable = col_name,
        Chi_Square = test_result$statistic,
        Degrees_of_Freedom = test_result$parameter,
        P_Value = test_result$p.value,
        stringsAsFactors = FALSE
      )
    )
    print(chi_square_results) 
  }
}
## Processing: id 
## Processing: age 
## 
##  Pearson's Chi-squared test
## 
## data:  temp_col and temp_ssi
## X-squared = 4.2815, df = 3, p-value = 0.2326
## 
##           Variable Chi_Square Degrees_of_Freedom   P_Value
## X-squared      age    4.28154                  3 0.2326239
## Processing: sex 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  temp_col and temp_ssi
## X-squared = 3.4375, df = 1, p-value = 0.06373
## 
##            Variable Chi_Square Degrees_of_Freedom    P_Value
## X-squared       age   4.281540                  3 0.23262390
## X-squared1      sex   3.437466                  1 0.06373336
## Processing: smokhx 
## 
##  Pearson's Chi-squared test
## 
## data:  temp_col and temp_ssi
## X-squared = 2.503, df = 2, p-value = 0.2861
## 
##            Variable Chi_Square Degrees_of_Freedom    P_Value
## X-squared       age   4.281540                  3 0.23262390
## X-squared1      sex   3.437466                  1 0.06373336
## X-squared2   smokhx   2.503042                  2 0.28606935
## Processing: dm_hx 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  temp_col and temp_ssi
## X-squared = 1.7904, df = 1, p-value = 0.1809
## 
##            Variable Chi_Square Degrees_of_Freedom    P_Value
## X-squared       age   4.281540                  3 0.23262390
## X-squared1      sex   3.437466                  1 0.06373336
## X-squared2   smokhx   2.503042                  2 0.28606935
## X-squared3    dm_hx   1.790356                  1 0.18088280
## Processing: immuno_hx 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  temp_col and temp_ssi
## X-squared = 0.0017427, df = 1, p-value = 0.9667
## 
##             Variable  Chi_Square Degrees_of_Freedom    P_Value
## X-squared        age 4.281540209                  3 0.23262390
## X-squared1       sex 3.437466262                  1 0.06373336
## X-squared2    smokhx 2.503042038                  2 0.28606935
## X-squared3     dm_hx 1.790355940                  1 0.18088280
## X-squared4 immuno_hx 0.001742664                  1 0.96670180
## Processing: ppx_abx 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  temp_col and temp_ssi
## X-squared = 3.6197e-28, df = 1, p-value = 1
## 
##             Variable   Chi_Square Degrees_of_Freedom    P_Value
## X-squared        age 4.281540e+00                  3 0.23262390
## X-squared1       sex 3.437466e+00                  1 0.06373336
## X-squared2    smokhx 2.503042e+00                  2 0.28606935
## X-squared3     dm_hx 1.790356e+00                  1 0.18088280
## X-squared4 immuno_hx 1.742664e-03                  1 0.96670180
## X-squared5   ppx_abx 3.619715e-28                  1 1.00000000
## Processing: site 
## 
##  Pearson's Chi-squared test
## 
## data:  temp_col and temp_ssi
## X-squared = 34.699, df = 11, p-value = 0.0002778
## 
##             Variable   Chi_Square Degrees_of_Freedom     P_Value
## X-squared        age 4.281540e+00                  3 0.232623905
## X-squared1       sex 3.437466e+00                  1 0.063733363
## X-squared2    smokhx 2.503042e+00                  2 0.286069349
## X-squared3     dm_hx 1.790356e+00                  1 0.180882801
## X-squared4 immuno_hx 1.742664e-03                  1 0.966701798
## X-squared5   ppx_abx 3.619715e-28                  1 1.000000000
## X-squared6      site 3.469900e+01                 11 0.000277771
## Processing: dx 
## 
##  Pearson's Chi-squared test
## 
## data:  temp_col and temp_ssi
## X-squared = 15.18, df = 7, p-value = 0.03376
## 
##             Variable   Chi_Square Degrees_of_Freedom     P_Value
## X-squared        age 4.281540e+00                  3 0.232623905
## X-squared1       sex 3.437466e+00                  1 0.063733363
## X-squared2    smokhx 2.503042e+00                  2 0.286069349
## X-squared3     dm_hx 1.790356e+00                  1 0.180882801
## X-squared4 immuno_hx 1.742664e-03                  1 0.966701798
## X-squared5   ppx_abx 3.619715e-28                  1 1.000000000
## X-squared6      site 3.469900e+01                 11 0.000277771
## X-squared7        dx 1.518019e+01                  7 0.033757704
## Processing: surgery 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  temp_col and temp_ssi
## X-squared = 1.1048, df = 1, p-value = 0.2932
## 
##             Variable   Chi_Square Degrees_of_Freedom     P_Value
## X-squared        age 4.281540e+00                  3 0.232623905
## X-squared1       sex 3.437466e+00                  1 0.063733363
## X-squared2    smokhx 2.503042e+00                  2 0.286069349
## X-squared3     dm_hx 1.790356e+00                  1 0.180882801
## X-squared4 immuno_hx 1.742664e-03                  1 0.966701798
## X-squared5   ppx_abx 3.619715e-28                  1 1.000000000
## X-squared6      site 3.469900e+01                 11 0.000277771
## X-squared7        dx 1.518019e+01                  7 0.033757704
## X-squared8   surgery 1.104798e+00                  1 0.293215451
## Processing: surgeon 
## 
##  Pearson's Chi-squared test
## 
## data:  temp_col and temp_ssi
## X-squared = 11.093, df = 8, p-value = 0.1965
## 
##             Variable   Chi_Square Degrees_of_Freedom     P_Value
## X-squared        age 4.281540e+00                  3 0.232623905
## X-squared1       sex 3.437466e+00                  1 0.063733363
## X-squared2    smokhx 2.503042e+00                  2 0.286069349
## X-squared3     dm_hx 1.790356e+00                  1 0.180882801
## X-squared4 immuno_hx 1.742664e-03                  1 0.966701798
## X-squared5   ppx_abx 3.619715e-28                  1 1.000000000
## X-squared6      site 3.469900e+01                 11 0.000277771
## X-squared7        dx 1.518019e+01                  7 0.033757704
## X-squared8   surgery 1.104798e+00                  1 0.293215451
## X-squared9   surgeon 1.109344e+01                  8 0.196460646
## Processing: location 
## 
##  Pearson's Chi-squared test
## 
## data:  temp_col and temp_ssi
## X-squared = 2.0576, df = 2, p-value = 0.3574
## 
##              Variable   Chi_Square Degrees_of_Freedom     P_Value
## X-squared         age 4.281540e+00                  3 0.232623905
## X-squared1        sex 3.437466e+00                  1 0.063733363
## X-squared2     smokhx 2.503042e+00                  2 0.286069349
## X-squared3      dm_hx 1.790356e+00                  1 0.180882801
## X-squared4  immuno_hx 1.742664e-03                  1 0.966701798
## X-squared5    ppx_abx 3.619715e-28                  1 1.000000000
## X-squared6       site 3.469900e+01                 11 0.000277771
## X-squared7         dx 1.518019e+01                  7 0.033757704
## X-squared8    surgery 1.104798e+00                  1 0.293215451
## X-squared9    surgeon 1.109344e+01                  8 0.196460646
## X-squared10  location 2.057629e+00                  2 0.357430496
## Processing: defect_size 
## 
##  Pearson's Chi-squared test
## 
## data:  temp_col and temp_ssi
## X-squared = 14.799, df = 4, p-value = 0.005137
## 
##                Variable   Chi_Square Degrees_of_Freedom     P_Value
## X-squared           age 4.281540e+00                  3 0.232623905
## X-squared1          sex 3.437466e+00                  1 0.063733363
## X-squared2       smokhx 2.503042e+00                  2 0.286069349
## X-squared3        dm_hx 1.790356e+00                  1 0.180882801
## X-squared4    immuno_hx 1.742664e-03                  1 0.966701798
## X-squared5      ppx_abx 3.619715e-28                  1 1.000000000
## X-squared6         site 3.469900e+01                 11 0.000277771
## X-squared7           dx 1.518019e+01                  7 0.033757704
## X-squared8      surgery 1.104798e+00                  1 0.293215451
## X-squared9      surgeon 1.109344e+01                  8 0.196460646
## X-squared10    location 2.057629e+00                  2 0.357430496
## X-squared11 defect_size 1.479889e+01                  4 0.005137038
## Processing: closure 
## 
##  Pearson's Chi-squared test
## 
## data:  temp_col and temp_ssi
## X-squared = 11.468, df = 3, p-value = 0.009447
## 
##                Variable   Chi_Square Degrees_of_Freedom     P_Value
## X-squared           age 4.281540e+00                  3 0.232623905
## X-squared1          sex 3.437466e+00                  1 0.063733363
## X-squared2       smokhx 2.503042e+00                  2 0.286069349
## X-squared3        dm_hx 1.790356e+00                  1 0.180882801
## X-squared4    immuno_hx 1.742664e-03                  1 0.966701798
## X-squared5      ppx_abx 3.619715e-28                  1 1.000000000
## X-squared6         site 3.469900e+01                 11 0.000277771
## X-squared7           dx 1.518019e+01                  7 0.033757704
## X-squared8      surgery 1.104798e+00                  1 0.293215451
## X-squared9      surgeon 1.109344e+01                  8 0.196460646
## X-squared10    location 2.057629e+00                  2 0.357430496
## X-squared11 defect_size 1.479889e+01                  4 0.005137038
## X-squared12     closure 1.146783e+01                  3 0.009447324
## Processing: ssi

Evaluate for Multicollinearity

multicol <- model.matrix(~ site + dx + ppx_abx + dm_hx + smokhx + immuno_hx + defect_size + closure - 1, data = SSI_Clean)

# Calculate the correlation matrix
cor_matrix <- cor(multicol)

# Convert the correlation matrix into a long format
long_cor_matrix <- melt(cor_matrix)

corrplot(cor_matrix, method = "circle", tl.cex = 0.6)

Multivariate Regression Analysis

SSI_Clean$defect_size <- factor(SSI_Clean$defect_size, 
                                levels = c("<1.00_cm", "1.00-1.49_cm", "1.50-1.99_cm", "2.00-2.49_cm", "≥2.50_cm"))

# Reorder closure
SSI_Clean$closure <- factor(SSI_Clean$closure, 
                            levels = c("Primary", "Secondary", "Flap", "Graft"))

# Setting the reference for dx
SSI_Clean$dx <- factor(SSI_Clean$dx, 
                       levels = c("BCC", "SCC", "Cysts", "Lipomas", "Nevi", "Other_Non-Malignancies", "Melanoma", "Other_Malignancies"))

# Setting the reference for site
SSI_Clean$site <- factor(SSI_Clean$site, 
                         levels = c("face", "groin", "ue", "ear", "trunk", "hand", "nose", "scalp", "neck", "lip", "le", "foot"))
SSI_Clean$smokhx <- factor(SSI_Clean$smokhx, 
                           levels = c("never", "former", "current"))
SSI_Clean$age <- factor(SSI_Clean$age, 
                           levels = c("<60", "60-69", "70-79", "80+"))
SSI_Clean$immuno_hx <- factor(SSI_Clean$immuno_hx, 
                           levels = c("no", "yes"))
SSI_Clean$dm_hx <- factor(SSI_Clean$dm_hx, 
                           levels = c("no", "yes"))
SSI_Clean$ppx_abx <- factor(SSI_Clean$ppx_abx, 
                           levels = c("no", "yes"))
SSI_Clean$surgery <- factor(SSI_Clean$surgery, 
                           levels = c("Mohs", "Excision"))

#Create a new data set for the analysis where SSI is numerical
SSI_Regression <- SSI_Clean
SSI_Regression$ssi <- ifelse(SSI_Regression$ssi == "yes", 1, 0)

# Perform Multivariate Logistic Regression
glm_model <- glm(ssi ~ 
                   site +
                   age +
                   immuno_hx +
                   smokhx +
                   dm_hx +
                   ppx_abx +
                   dx +
                   defect_size +
                   closure, 
              data = SSI_Regression, 
              family = binomial)
summary(glm_model)
## 
## Call:
## glm(formula = ssi ~ site + age + immuno_hx + smokhx + dm_hx + 
##     ppx_abx + dx + defect_size + closure, family = binomial, 
##     data = SSI_Regression)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -3.11474    0.35102  -8.873  < 2e-16 ***
## sitegroin                -0.10920    1.07647  -0.101  0.91920    
## siteue                    0.22508    0.27424   0.821  0.41180    
## siteear                   0.37894    0.30487   1.243  0.21389    
## sitetrunk                 0.08526    0.27953   0.305  0.76035    
## sitehand                 -0.19919    0.58092  -0.343  0.73168    
## sitenose                  0.71477    0.32401   2.206  0.02739 *  
## sitescalp                -1.12152    0.49885  -2.248  0.02456 *  
## siteneck                 -1.48692    0.73488  -2.023  0.04304 *  
## sitelip                   0.86789    0.52022   1.668  0.09526 .  
## sitele                    0.72942    0.31823   2.292  0.02190 *  
## sitefoot                  1.09162    0.70681   1.544  0.12248    
## age60-69                 -0.39404    0.22869  -1.723  0.08488 .  
## age70-79                 -0.35654    0.23653  -1.507  0.13171    
## age80+                   -0.14404    0.25352  -0.568  0.56993    
## immuno_hxyes             -0.09313    0.16289  -0.572  0.56749    
## smokhxformer              0.18578    0.16250   1.143  0.25292    
## smokhxcurrent            -0.27345    0.38793  -0.705  0.48088    
## dm_hxyes                  0.19223    0.18327   1.049  0.29422    
## ppx_abxyes               -0.20220    0.27005  -0.749  0.45401    
## dxSCC                     0.14147    0.20253   0.699  0.48486    
## dxCysts                  -0.38126    0.34437  -1.107  0.26823    
## dxLipomas                -0.20691    0.64339  -0.322  0.74776    
## dxNevi                   -0.68041    0.55213  -1.232  0.21782    
## dxOther_Non-Malignancies -0.27729    0.46257  -0.599  0.54887    
## dxMelanoma                0.32553    0.29527   1.102  0.27025    
## dxOther_Malignancies      0.01011    0.77699   0.013  0.98962    
## defect_size1.00-1.49_cm   0.23218    0.33912   0.685  0.49357    
## defect_size1.50-1.99_cm   0.80865    0.30477   2.653  0.00797 ** 
## defect_size2.00-2.49_cm   0.91721    0.32477   2.824  0.00474 ** 
## defect_size≥2.50_cm       0.90829    0.31332   2.899  0.00374 ** 
## closureSecondary          0.14348    0.22535   0.637  0.52430    
## closureFlap              -0.23222    0.29496  -0.787  0.43111    
## closureGraft              0.28355    0.39512   0.718  0.47299    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1334  on 2434  degrees of freedom
## Residual deviance: 1260  on 2401  degrees of freedom
## AIC: 1328
## 
## Number of Fisher Scoring iterations: 6
# Calculating Odds Ratios and Confidence Intervals
exp_coef <- exp(coef(glm_model))
conf_int <- exp(confint(glm_model))
result <- data.frame(OR = exp_coef, '2.5 %' = conf_int[, 1], '97.5 %' = conf_int[, 2])

#Create the table 
library(broom)
glm_summary <- tidy(glm_model, conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE)
colnames(glm_summary) <- c("Variable", "OR", "Std. Error", "Statistic", "p-value", "CI lower", "CI upper")

library(ggplot2)

p_values <- summary(glm_model)$coefficients[, 4]

# Modify the significant_mask to include "Size: 1.00 - 1.49 cm"
significant_mask <- p_values < 0.05 | names(p_values) == "defect_size1.00-1.49_cm"

significant_exp_coef <- exp_coef[significant_mask]
significant_conf_int <- conf_int[significant_mask, ]

# Create a data frame containing variable names, odds ratios, and confidence intervals
forest_data_significant <- data.frame(
  Term = names(significant_exp_coef),
  OR = as.numeric(significant_exp_coef),
  Lower = as.numeric(significant_conf_int[, 1]),
  Upper = as.numeric(significant_conf_int[, 2])
)

closure_terms <- c("closureSecondary", "closureFlap", "closureGraft")
closure_exp_coef <- exp_coef[closure_terms]
closure_conf_int <- conf_int[closure_terms, ]

# Create a data frame for the closure terms
closure_data <- data.frame(
  Term = closure_terms,
  OR = as.numeric(closure_exp_coef),
  Lower = as.numeric(closure_conf_int[, 1]),
  Upper = as.numeric(closure_conf_int[, 2])
)

# Bind the closure data to the original dataframe
forest_data_significant <- rbind(forest_data_significant, closure_data)

# Rename the variables 
forest_data_significant <- forest_data_significant %>%
  mutate(
    Term = dplyr::recode(Term, 
                  "sitescalp" = "Site: Scalp",
                  "siteneck" = "Site: Neck",
                  "sitenose" = "Site: Nose",
                  "sitele" = "Site: Lower Extremity",
                  "defect_size1.00-1.49_cm" = "Size: 1.00 - 1.49 cm",
                  "defect_size2.00-2.49_cm" = "Size: 2.00 - 2.49 cm", 
                  "defect_size1.50-1.99_cm" = "Size: 1.50 - 1.99 cm",
                  "defect_size≥2.50_cm" = "Size: ≥2.50 cm",
                  "closureSecondary" = "Closure: Secondary", 
                  "closureFlap" = "Closure: Flap", 
                  "closureGraft" = "Closure: Graft")) %>%
  filter(!Term == "(Intercept)")

# Add in the other variables for comparison
new_terms <- data.frame(
  Term = c("Site: Face", "Size: < 1.00 cm", "Closure: Primary"),
  OR = NA,
  Lower = NA,
  Upper = NA,
  stringsAsFactors = FALSE)
forest_data_significant <- rbind(forest_data_significant, new_terms)

# Order the variables
term_order <- rev(c(
                "Size: < 1.00 cm", 
  "Size: 1.00 - 1.49 cm", 
  "Size: 1.50 - 1.99 cm", 
  "Size: 2.00 - 2.49 cm", 
  "Size: ≥2.50 cm", 
                "Site: Face", 
  "Site: Scalp", 
  "Site: Neck", 
  "Site: Nose", 
  "Site: Lower Extremity", 
                  "Closure: Primary", 
  "Closure: Secondary", 
  "Closure: Flap", 
  "Closure: Graft"))

# Convert Term to a factor with the specified order
forest_data_significant$Term <- factor(forest_data_significant$Term, levels = term_order)

# Arrange the rows based on the factor levels
forest_data_significant <- forest_data_significant %>%
  arrange(Term)

# Create a new column for grouping
forest_data_significant$Group <- ifelse(grepl("^Site:", forest_data_significant$Term), "Site",
                                   ifelse(grepl("^Size:", forest_data_significant$Term), "Defect Size", "Closure"))

# Recode the variables for the graph
forest_data_significant <- forest_data_significant %>%
  mutate(
    Term = dplyr::recode(Term,
                  "Closure: Primary" = "Primary", 
                  "Closure: Secondary" = "Secondary",
                  "Closure: Flap" = "Flap", 
                  "Closure: Graft" = "Graft",
                  "Site: Face" = "Face",
                  "Site: Scalp" = "Scalp",
                  "Site: Nose" = "Nose", 
                  "Site: Neck" = "Neck",
                  "Site: Lower Extremity" = "Lower Extemity",
                  "Size: < 1.00 cm" = "< 1.00 cm",
                  "Size: 1.00 - 1.49 cm" = "1.00 - 1.49 cm*",
                  "Size: 1.50 - 1.99 cm" = "1.50 - 1.99 cm",
                  "Size: 2.00 - 2.49 cm" = "2.00 - 2.50 cm", 
                  "Size: ≥2.50 cm" = "≥ 2.50 cm"))
# Forest Plot with Facet
ggplot(forest_data_significant, aes(x = Term, y = OR, ymin = Lower, ymax = Upper)) +
  geom_pointrange(aes(color = ifelse(Lower <= 1 & Upper >= 1, "grey", 
                                     ifelse(OR > 1, "red", "blue")),
                      fill = ifelse(Lower <= 1 & Upper >= 1, "grey", 
                                    ifelse(OR > 1, "red", "blue")))) +
  geom_hline(yintercept = 1, linetype = "dashed") +
  coord_flip() +
  xlab("") +
  ylab("Odds Ratio (95% CI)") +
  scale_fill_manual(values = c("red" = "red", "blue" = "blue", "grey" = "grey"), guide = "none") +
  scale_color_manual(values = c("red" = "red", "blue" = "blue", "grey" = "grey"), guide = "none") +
  geom_text(data = subset(forest_data_significant, Term %in% c("Face", "< 1.00 cm", "Primary")), 
            aes(label = "[Reference]", y = 0.4, x = Term), size = 3, color = "grey40") +
  theme_minimal() +
  facet_grid(Group ~ ., scales = "free_y", space = "free_y", switch = "y") +
  theme(
    strip.background = element_rect(fill = "grey90", colour = "grey90", size = 1), 
    strip.text = element_text(size = 14, colour = "black"),
    strip.placement = "outside",
    axis.title.y = element_text(size = 12),
    axis.text.y = element_text(size = 10),
    axis.title.x = element_text(size = 12, vjust = -1),
    axis.text.x = element_text(size = 10),
    axis.ticks = element_line(colour = "gray10"),
    axis.text = element_text(size = 12, colour = "gray10")
  ) +
  labs(x = NULL)

Evaluation of Multivariate Model

# ANOVA
anova(glm_model, test = "Chisq")
# Model Fit
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(glm_model)
## fitting null model for pseudo-r2
##           llh       llhNull            G2      McFadden          r2ML 
## -629.99411375 -667.01538638   74.04254526    0.05550288    0.02994996 
##          r2CU 
##    0.07100305
# Multicollinearity Assessment
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
vif_result <- vif(glm_model)
print(vif_result)
##                 GVIF Df GVIF^(1/(2*Df))
## site        4.534579 11        1.071131
## age         1.337120  3        1.049611
## immuno_hx   1.124382  1        1.060369
## smokhx      1.100644  2        1.024264
## dm_hx       1.063055  1        1.031045
## ppx_abx     1.058033  1        1.028607
## dx          2.278550  7        1.060589
## defect_size 1.445961  4        1.047176
## closure     2.649060  3        1.176292
detach(package:car, unload = TRUE)

Predicted Probabilities of SSI by Defect Size

library(ggplot2)
library(ggforce)


# Calculate predicted probabilities and confidence intervals for each defect_size and set reference values
new_data <- expand.grid(
  age         = levels(SSI_Regression$age)[1],
  immuno_hx   = levels(SSI_Regression$immuno_hx)[1],
  dm_hx       = levels(SSI_Regression$dm_hx)[1],
  smokhx      = levels(SSI_Regression$smokhx)[1],
  ppx_abx     = levels(SSI_Regression$ppx_abx)[1],
  site        = levels(SSI_Regression$site)[1],
  dx          = levels(SSI_Regression$dx)[1],
  surgery     = levels(SSI_Regression$surgery)[1],
  defect_size = levels(SSI_Regression$defect_size),
  closure     = levels(SSI_Regression$closure)[1])

predicted_values <- predict(glm_model, newdata = new_data, type = "response", se.fit = TRUE)

# Compute Wald 95% CIs
z_value <- qnorm(0.975)
new_data$prob <- predicted_values$fit
new_data$CI_low <- predicted_values$fit - z_value * predicted_values$se.fit
new_data$CI_high <- predicted_values$fit + z_value * predicted_values$se.fit

new_data$defect_size <- factor(new_data$defect_size, 
                               levels = c("<1.00_cm", "1.00-1.49_cm", "1.50-1.99_cm", "2.00-2.49_cm", "≥2.50_cm"))
new_data <- new_data %>%
  mutate(
    prob = prob * 100,
    CI_low = CI_low * 100,
    CI_high = CI_high * 100,
    defect_size = fct_recode(defect_size, 
                         "<1.00_cm" = "< 1.00", 
                         "1.00-1.49_cm" = "1.00 - 1.49",
                         "1.50-1.99_cm" = "1.50 - 1.99", 
                         "2.00-2.49_cm" = "2.00 - 2.49", 
                         "≥2.50_cm" = "≥ 2.50"))
plot_data <- new_data %>% 
              filter(site == "face", dx == "BCC", closure == "Primary")

# Creat infection rates to compare aganist predicted
infection_rate <- SSI_Clean |> 
  group_by(defect_size) |> 
  dplyr::summarise(
    total = n(), 
    yes_count = sum(ssi == "yes", na.rm = TRUE), 
    infection_percentage = (yes_count / total) * 100)
plot_data$infection_rate <- infection_rate$infection_percentage


# Plot
library(ggplot2)

p <- ggplot(plot_data, aes(x = defect_size, y = prob)) +
  geom_line(group = 1, color = "#2C3E50", size = 1) +
  geom_pointrange(aes(ymin = CI_low, ymax = CI_high), color = "#3498DB", size = 0.5, fatten = 2) +
  xlab("Defect Size") +
  ylab("Predicted Probability of Surgical Site Infection") +
  ggtitle("Predicted Probability of Infection by Defect Size") +
  theme_light(base_size = 14, base_family = "Helvetica") +
  theme(plot.title = element_text(face = "bold", size = 20),
        axis.title = element_text(size = 14))



print(p)

Packages Citation

packages <- c("dplyr", "ggplot2", "tidyr", "tidyverse", "purrr", "broom", "lubridate", "forcats", "stringr", "tibble", "readr", "pscl", "car", "corrplot", "reshape2") 
package_versions <- sapply(packages, packageVersion)
package_versions
## $dplyr
## [1] 1 1 3
## 
## $ggplot2
## [1] 3 4 4
## 
## $tidyr
## [1] 1 3 0
## 
## $tidyverse
## [1] 2 0 0
## 
## $purrr
## [1] 1 0 2
## 
## $broom
## [1] 1 0 5
## 
## $lubridate
## [1] 1 9 3
## 
## $forcats
## [1] 1 0 0
## 
## $stringr
## [1] 1 5 0
## 
## $tibble
## [1] 3 2 1
## 
## $readr
## [1] 2 1 4
## 
## $pscl
## [1] 1 5 5 1
## 
## $car
## [1] 3 1 2
## 
## $corrplot
## [1]  0 92
## 
## $reshape2
## [1] 1 4 4