library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(summarytools)
## 
## Attaching package: 'summarytools'
## 
## The following object is masked from 'package:tibble':
## 
##     view
library(ggridges)
library(gplots)
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library(vcd)
## Loading required package: grid
library(BayesFactor)
## Loading required package: coda
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## ************
## Welcome to BayesFactor 0.9.12-4.7. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
## 
## Type BFManual() to open the manual.
## ************
library(ggpubr)
library(grid)
library(ggplot2)
library(dplyr)
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(RColorBrewer)
library(broom)
## Warning: package 'broom' was built under R version 4.5.1
# Set working directory
output_dir <- "F:/Surg Cdr Aksai Kannan/AnthroResults"
dir.create(output_dir, showWarnings = FALSE)
setwd(output_dir)

data <- read.csv("ANTHRO.csv")
colnames(data) <- str_trim(colnames(data))
data$MALOCCLUSION <- as.factor(data$MALOCCLUSION)
data$ETHNICITY <- as.factor(data$ETHNICITY)
data$GENDER <- as.factor(data$GENDER)
data$TREATMENT <- as.factor(data$TREATMENT)


sumdata <- summarytools::dfSummary(data)
write.csv(sumdata, "summary.csv")


# Load required libraries
library(summarytools)
library(dplyr)
library(ggplot2)
# Frequency tables
freq(data$MALOCCLUSION)
## Frequencies  
## data$MALOCCLUSION  
## Type: Factor  
## 
##                      Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ------------------ ------ --------- -------------- --------- --------------
##              CL I·    268     44.67          44.67     44.67          44.67
##        CL I BIMAX·     49      8.17          52.83      8.17          52.83
##         CL I DIV I      7      1.17          54.00      1.17          54.00
##             CL II·     29      4.83          58.83      4.83          58.83
##        CL II BIMAX      3      0.50          59.33      0.50          59.33
##        CL II DIV I    175     29.17          88.50     29.17          88.50
##       CL II DIV II     17      2.83          91.33      2.83          91.33
##            CL III·     48      8.00          99.33      8.00          99.33
##                CLP      4      0.67         100.00      0.67         100.00
##               <NA>      0                               0.00         100.00
##              Total    600    100.00         100.00    100.00         100.00
freq(data$ETHNICITY)
## Frequencies  
## data$ETHNICITY  
## Type: Factor  
## 
##                            Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ------------------------ ------ --------- -------------- --------- --------------
##             EAST INDIAN·     57      9.50           9.50      9.50           9.50
##       NORTH EAST INDIAN·      6      1.00          10.50      1.00          10.50
##            NORTH INDIAN·    382     63.67          74.17     63.67          74.17
##            SOUTH INDIAN·     81     13.50          87.67     13.50          87.67
##             WEST INDIAN·     74     12.33         100.00     12.33         100.00
##                     <NA>      0                               0.00         100.00
##                    Total    600    100.00         100.00    100.00         100.00
freq(data$GENDER)
## Frequencies  
## data$GENDER  
## Type: Factor  
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           F    304     50.67          50.67     50.67          50.67
##           M    296     49.33         100.00     49.33         100.00
##        <NA>      0                               0.00         100.00
##       Total    600    100.00         100.00    100.00         100.00
freq(data$TREATMENT)
## Frequencies  
## data$TREATMENT  
## Type: Factor  
## 
##                        Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## -------------------- ------ --------- -------------- --------- --------------
##                  FM·    573     95.50          95.50     95.50          95.50
##       ORTHO SURGICAL     27      4.50         100.00      4.50         100.00
##                 <NA>      0                               0.00         100.00
##                Total    600    100.00         100.00    100.00         100.00
# Function to generate pie-friendly frequency table
make_freq_df <- function(data, var_name) {
  data %>%
    filter(!is.na({{ var_name }})) %>%
    count({{ var_name }}) %>%
    mutate(Percent = round(100 * n / sum(n), 2),
           Label = paste0({{ var_name }}, "\n", Percent, "%"))
}

library(ggrepel)  # For smart label repulsion

# Prepare frequency data
maloc_df <- make_freq_df(data, MALOCCLUSION)

# Calculate cumulative midpoint for label angles
maloc_df <- maloc_df %>%
  arrange(desc(MALOCCLUSION)) %>%
  mutate(
    ymax = cumsum(Percent),
    ymin = c(0, head(ymax, -1)),
    label_pos = (ymax + ymin) / 2,
    angle = 90 - 360 * label_pos / 100
  )

# Plot
p1 <- ggplot(maloc_df, aes(x = "", y = Percent, fill = MALOCCLUSION)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text_repel(
    aes(y = label_pos, label = Label, angle = angle),
    nudge_x = 1.2,
    direction = "y",
    hjust = 0,
    segment.size = 0.2,
    size = 3.2,
    show.legend = FALSE
  ) +
  labs(title = "Malocclusion Distribution") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.position = "none")


library(ggrepel)

# Prepare ethnicity frequency data
eth_df <- make_freq_df(data, ETHNICITY) %>%
  arrange(desc(ETHNICITY)) %>%
  mutate(
    ymax = cumsum(Percent),
    ymin = c(0, head(ymax, -1)),
    label_pos = (ymax + ymin) / 2,
    angle = 90 - 360 * label_pos / 100  # For radial text rotation
  )

# Plot
p2 <- ggplot(eth_df, aes(x = "", y = Percent, fill = ETHNICITY)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text_repel(
    aes(y = label_pos, label = Label, angle = angle),
    nudge_x = 1.2,
    direction = "y",
    hjust = 0,
    segment.size = 0.3,
    size = 3,
    show.legend = FALSE
  ) +
  labs(title = "Ethnicity Distribution") +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
    legend.position = "none"
  )


gender_df <- make_freq_df(data, GENDER)

p3 <- ggplot(gender_df, aes(x = "", y = Percent, fill = GENDER)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(aes(label = Label),
            position = position_stack(vjust = 0.5),
            size = 4) +
  labs(title = "Gender Distribution") +
  theme_void()

treat_df <- make_freq_df(data, TREATMENT)

p4 <- ggplot(treat_df, aes(x = "", y = Percent, fill = TREATMENT)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(aes(label = Label),
            position = position_stack(vjust = 0.5),
            size = 3) +
  labs(title = "Treatment Distribution") +
  theme_void()

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(p1, p2, p3, p4, ncol = 2)
## Warning: ggrepel: Repulsion works correctly only for rotation angles multiple
## of 90 degrees
## Warning: ggrepel: Repulsion works correctly only for rotation angles multiple
## of 90 degrees

# Load required libraries
library(summarytools)
library(dplyr)

s <- data.frame(
  MALOCCLUSION = data$MALOCCLUSION,
  ETHNICITY = data$ETHNICITY,
  GENDER = data$GENDER,
  TREATMENT = data$TREATMENT
)


# Initialize an empty list to store frequency tables
freq_tables <- list()

# Specify the four variable names
var_names <- c("MALOCCLUSION", "ETHNICITY", "GENDER", "TREATMENT")  # Replace with your actual variable names

for (var_name in var_names) {
  # Generate frequency table for the variable
  f <- freq(
    s[[var_name]],
    report.nas = FALSE,
    headings = FALSE,
    plain.ascii = FALSE
  )
  
  # Convert to data frame and add variable name column
  df <- as.data.frame(f)
  df$Variable <- var_name
  
  # Append to list
  freq_tables[[var_name]] <- df
}

# Combine all frequency tables into one
combined_freq_table <- bind_rows(freq_tables)

# Reorder columns for clarity
combined_freq_table <- combined_freq_table %>%
  select(Variable, everything())

# View the result
print(combined_freq_table)
##                        Variable Freq     % Valid % Valid Cum.     % Total
## CL I·              MALOCCLUSION  268  44.6666667     44.66667  44.6666667
## CL I BIMAX·        MALOCCLUSION   49   8.1666667     52.83333   8.1666667
## CL I DIV I         MALOCCLUSION    7   1.1666667     54.00000   1.1666667
## CL II·             MALOCCLUSION   29   4.8333333     58.83333   4.8333333
## CL II BIMAX        MALOCCLUSION    3   0.5000000     59.33333   0.5000000
## CL II DIV I        MALOCCLUSION  175  29.1666667     88.50000  29.1666667
## CL II DIV II       MALOCCLUSION   17   2.8333333     91.33333   2.8333333
## CL III·            MALOCCLUSION   48   8.0000000     99.33333   8.0000000
## CLP                MALOCCLUSION    4   0.6666667    100.00000   0.6666667
## <NA>...10          MALOCCLUSION    0          NA           NA   0.0000000
## Total...11         MALOCCLUSION  600 100.0000000    100.00000 100.0000000
## EAST INDIAN·          ETHNICITY   57   9.5000000      9.50000   9.5000000
## NORTH EAST INDIAN·    ETHNICITY    6   1.0000000     10.50000   1.0000000
## NORTH INDIAN·         ETHNICITY  382  63.6666667     74.16667  63.6666667
## SOUTH INDIAN·         ETHNICITY   81  13.5000000     87.66667  13.5000000
## WEST INDIAN·          ETHNICITY   74  12.3333333    100.00000  12.3333333
## <NA>...17             ETHNICITY    0          NA           NA   0.0000000
## Total...18            ETHNICITY  600 100.0000000    100.00000 100.0000000
## F                        GENDER  304  50.6666667     50.66667  50.6666667
## M                        GENDER  296  49.3333333    100.00000  49.3333333
## <NA>...21                GENDER    0          NA           NA   0.0000000
## Total...22               GENDER  600 100.0000000    100.00000 100.0000000
## FM·                   TREATMENT  573  95.5000000     95.50000  95.5000000
## ORTHO SURGICAL        TREATMENT   27   4.5000000    100.00000   4.5000000
## <NA>...25             TREATMENT    0          NA           NA   0.0000000
## Total...26            TREATMENT  600 100.0000000    100.00000 100.0000000
##                    % Total Cum.
## CL I·                  44.66667
## CL I BIMAX·            52.83333
## CL I DIV I             54.00000
## CL II·                 58.83333
## CL II BIMAX            59.33333
## CL II DIV I            88.50000
## CL II DIV II           91.33333
## CL III·                99.33333
## CLP                   100.00000
## <NA>...10             100.00000
## Total...11            100.00000
## EAST INDIAN·            9.50000
## NORTH EAST INDIAN·     10.50000
## NORTH INDIAN·          74.16667
## SOUTH INDIAN·          87.66667
## WEST INDIAN·          100.00000
## <NA>...17             100.00000
## Total...18            100.00000
## F                      50.66667
## M                     100.00000
## <NA>...21             100.00000
## Total...22            100.00000
## FM·                    95.50000
## ORTHO SURGICAL        100.00000
## <NA>...25             100.00000
## Total...26            100.00000
write.csv(combined_freq_table, "freq.csv")

colnames(combined_freq_table)
## [1] "Variable"     "Freq"         "% Valid"      "% Valid Cum." "% Total"     
## [6] "% Total Cum."
# Cross-tabulation
ctable_gender <- ctable(data$MALOCCLUSION,
                        data$GENDER,
                        headings = FALSE,
                        plain.ascii = FALSE)
ctable_eth <- ctable(data$MALOCCLUSION,
                     data$ETHNICITY,
                     headings = FALSE,
                     plain.ascii = FALSE)
ctable_treat <- ctable(data$MALOCCLUSION,
                       data$TREATMENT,
                       headings = FALSE,
                       plain.ascii = FALSE)

write.csv(ctable_treat, "ctable_treat.csv")
## Warning in (function (..., row.names = NULL, check.rows = FALSE, check.names =
## TRUE, : row names were found from a short variable and have been discarded
write.csv(ctable_eth, "ctable_eth.csv")
## Warning in (function (..., row.names = NULL, check.rows = FALSE, check.names =
## TRUE, : row names were found from a short variable and have been discarded
write.csv(ctable_gender, "ctable_gender.csv")
## Warning in (function (..., row.names = NULL, check.rows = FALSE, check.names =
## TRUE, : row names were found from a short variable and have been discarded
# Descriptive summary
summary_data <- dfSummary(data)

# Perform chi-square tests
gender_test    <- chisq.test(table(data$MALOCCLUSION, data$GENDER))
## Warning in chisq.test(table(data$MALOCCLUSION, data$GENDER)): Chi-squared
## approximation may be incorrect
ethnicity_test <- chisq.test(table(data$MALOCCLUSION, data$ETHNICITY))
## Warning in chisq.test(table(data$MALOCCLUSION, data$ETHNICITY)): Chi-squared
## approximation may be incorrect
treatment_test <- chisq.test(table(data$MALOCCLUSION, data$TREATMENT))
## Warning in chisq.test(table(data$MALOCCLUSION, data$TREATMENT)): Chi-squared
## approximation may be incorrect
# Extract results into a data frame
chi_results <- data.frame(
  Variable      = c("GENDER", "ETHNICITY", "TREATMENT"),
  Chi_Square    = c(
    gender_test$statistic,
    ethnicity_test$statistic,
    treatment_test$statistic
  ),
  df            = c(
    gender_test$parameter,
    ethnicity_test$parameter,
    treatment_test$parameter
  ),
  p_value       = c(
    gender_test$p.value,
    ethnicity_test$p.value,
    treatment_test$p.value
  ),
  stringsAsFactors = FALSE
)
# Optional: Round the numeric columns for clarity
chi_results$Chi_Square <- round(chi_results$Chi_Square, 3)
chi_results$p_value    <- signif(chi_results$p_value, 3)

# View the final table
print(chi_results)
##    Variable Chi_Square df  p_value
## 1    GENDER     14.073  8 7.99e-02
## 2 ETHNICITY     48.156 32 3.33e-02
## 3 TREATMENT     53.782  8 7.61e-09
# Install & load knitr if needed
if (!require(knitr))
  install.packages("knitr")
## Loading required package: knitr
library(knitr)

kable(chi_results, caption = "Chi-Square Test Summary for Malocclusion vs. Demographic Variables")
Chi-Square Test Summary for Malocclusion vs. Demographic Variables
Variable Chi_Square df p_value
GENDER 14.073 8 0.0799
ETHNICITY 48.156 32 0.0333
TREATMENT 53.782 8 0.0000
##Safe Chi-square Analyzer Function with simulated P value
analyze_chisq <- function(varname,
                          predictor,
                          response = data$MALOCCLUSION) {
  tab <- table(response, predictor)
  
  # Decide whether simulation is needed
  expected_counts <- suppressWarnings(chisq.test(tab)$expected)
  use_simulation <- any(expected_counts < 5)
  
  # Perform Chi-square test
  chisq <- suppressWarnings(if (use_simulation) {
    chisq.test(tab, simulate.p.value = TRUE, B = 10000)
  } else {
    chisq.test(tab)
  })
  
  # Extract residuals and contribution only if not simulated
  if (!use_simulation) {
    residuals <- round(chisq$stdres, 2)
    contrib <- round((chisq$stdres^2 / sum(chisq$statistic)) * 100, 2)
  } else {
    residuals <- NA
    contrib <- NA
  }
  
  # Return structured list
  return(
    list(
      variable     = varname,
      chi_sq       = round(chisq$statistic, 3),
      df           = chisq$parameter,
      p_value      = signif(chisq$p.value, 3),
      method       = chisq$method,
      expected     = round(chisq$expected, 2),
      observed     = tab,
      residuals    = residuals,
      contribution = contrib
    )
  )
}

# Apply to all three variables
gender_analysis    <- analyze_chisq("GENDER", data$GENDER)
ethnicity_analysis <- analyze_chisq("ETHNICITY", data$ETHNICITY)
treatment_analysis <- analyze_chisq("TREATMENT", data$TREATMENT)

##Summary Table
chi_summary <- data.frame(
  Variable   = c(
    gender_analysis$variable,
    ethnicity_analysis$variable,
    treatment_analysis$variable
  ),
  Chi_Square = c(
    gender_analysis$chi_sq,
    ethnicity_analysis$chi_sq,
    treatment_analysis$chi_sq
  ),
  df         = c(
    gender_analysis$df,
    ethnicity_analysis$df,
    treatment_analysis$df
  ),
  p_value    = c(
    gender_analysis$p_value,
    ethnicity_analysis$p_value,
    treatment_analysis$p_value
  ),
  Method     = c(
    gender_analysis$method,
    ethnicity_analysis$method,
    treatment_analysis$method
  )
)

print(chi_summary)
##    Variable Chi_Square df p_value
## 1    GENDER     14.073 NA  0.0705
## 2 ETHNICITY     48.156 NA  0.0836
## 3 TREATMENT     53.782 NA  0.0007
##                                                                              Method
## 1 Pearson's Chi-squared test with simulated p-value\n\t (based on 10000 replicates)
## 2 Pearson's Chi-squared test with simulated p-value\n\t (based on 10000 replicates)
## 3 Pearson's Chi-squared test with simulated p-value\n\t (based on 10000 replicates)
##Interpretation Add-on (APA Style)
cat(
  sprintf(
    "A %s was conducted to examine the relationship between malocclusion and gender. The test yielded χ²(%d) = %.2f, p = %.3f.\n",
    gender_analysis$method,
    gender_analysis$df,
    gender_analysis$chi_sq,
    gender_analysis$p_value
  )
)
## A Pearson's Chi-squared test with simulated p-value
##   (based on 10000 replicates) was conducted to examine the relationship between malocclusion and gender. The test yielded χ²(NA) = 14.07, p = 0.070.
# Bar plot
ggplot(data, aes(x = MALOCCLUSION, fill = GENDER)) +
  geom_bar(position = "dodge") +
  labs(title = "Malocclusion by Gender", y = "Count") +
  theme_minimal()

# Mosaic plot
library(vcd)
mosaic(
  ~ MALOCCLUSION + GENDER,
  data = data,
  shade = TRUE,
  labeling_args = list(
    rot_labels = c(left = 45, top = 0),
    # 0° rotation for better alignment
    offset_varnames = c(left = 0.9, top = 1),
    gp_labels = gpar(fontsize = 4)     # Smaller label size to reduce clutter
  )
)

# Balloon plot

library(gplots)
# Create the contingency table first
ethnicity_tab <- table(data$ETHNICITY, data$MALOCCLUSION)
gender_tab <- table(data$GENDER, data$MALOCCLUSION)
treatment_tab <- table(data$TREATMENT, data$MALOCCLUSION)
# Now load gplots and create the balloon plot
library(gplots)

# balloonplot
balloonplot(
  ethnicity_tab,
  main = "Malocclusion vs Ethnicity",
  xlab = "Malocclusion",
  ylab = "Ethnicity",
  label = TRUE,
  show.margins = FALSE
)

balloonplot(
  gender_tab,
  main = "Malocclusion vs Gender",
  xlab = "Malocclusion",
  ylab = "Gender",
  label = TRUE,
  show.margins = FALSE
)

balloonplot(
  treatment_tab,
  main = "Malocclusion vs Treatment",
  xlab = "Malocclusion",
  ylab = "Treatment",
  label = TRUE,
  show.margins = FALSE
)

##Balloon Plot Styled combined
library(ggplot2)
library(dplyr)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Helper to extract % and p-value
prep_plot_data <- function(group_var, varname) {
  tab <- table(group_var, data$MALOCCLUSION)
  df <- as.data.frame(tab)
  colnames(df) <- c("Group", "Malocclusion", "Freq")
  df$Total <- ave(df$Freq, df$Group, FUN = sum)
  df$Perc <- df$Freq / df$Total
  
  # Chi-square p-value
  chi <- chisq.test(tab,
                    simulate.p.value = any(chisq.test(tab)$expected < 5),
                    B = 9999)
  df$p_label <- paste0("Chi² p = ", signif(chi$p.value, 3))
  df$Variable <- varname
  return(df)
}

# Create all
df_gender    <- prep_plot_data(data$GENDER, "GENDER")
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
df_ethnicity <- prep_plot_data(data$ETHNICITY, "ETHNICITY")
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
df_treatment <- prep_plot_data(data$TREATMENT, "TREATMENT")
## Warning in chisq.test(tab): Chi-squared approximation may be incorrect
# Combine
styled_data <- bind_rows(df_gender, df_ethnicity, df_treatment)
p_styled <- ggplot(styled_data, aes(x = Malocclusion, y = Group)) +
  geom_point(
    aes(size = Perc, fill = Perc),
    shape = 21,
    color = "black",
    stroke = 0.3
  ) +
  geom_text(
    aes(label = percent(Perc, accuracy = 1)),
    vjust = -0.8,
    size = 3.5,
    fontface = "bold"
  ) +
  facet_wrap( ~ Variable + p_label, scales = "free_y") +
  scale_size(range = c(2, 15), name = "Proportion") +
  scale_fill_gradient(low = "#f2f0f7",
                      high = "#54278f",
                      name = "Proportion") +
  labs(
    title = "Malocclusion Patterns by Group (Balloon Plot)",
    subtitle = "Circle size & color represent proportion within group.\nChi-square p-values annotated in facet headers.",
    x = "Malocclusion Type",
    y = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(
    strip.text = element_text(face = "bold", size = 09),
    legend.position = "right",
    axis.text.x = element_text(angle = 45, hjust = 0.5),
    panel.grid.major = element_line(color = "grey90")
  )
print(p_styled)

# Optional: Save it!
ggsave(
  "Malocclusion_BalloonPlot_Styled.png",
  p_styled,
  width = 12,
  height = 7,
  dpi = 300
)

## Ridgeline Plot
ggplot(data, aes(x = MALOCCLUSION, y = ETHNICITY, fill = ETHNICITY)) +
  geom_density_ridges(
    stat = "binline",
    binwidth = 1,
    scale = 0.9,
    draw_baseline = FALSE
  ) +
  theme_ridges() +
  labs(title = "Ridgeline Plot of Malocclusion by Ethnicity")

##Load Required Libraries
library(ggridges)
library(dplyr)
library(ggplot2)
##Prepare Long-format Data
# Reshape data into long format
ridgeline_data <- bind_rows(
  data %>% select(Group = GENDER, MALOCCLUSION)     %>% mutate(Variable = "Gender"),
  data %>% select(Group = ETHNICITY, MALOCCLUSION)  %>% mutate(Variable = "Ethnicity"),
  data %>% select(Group = TREATMENT, MALOCCLUSION)  %>% mutate(Variable = "Treatment")
)

# Ensure factors are treated cleanly
ridgeline_data$MALOCCLUSION <- as.factor(ridgeline_data$MALOCCLUSION)
ridgeline_data$Group <- as.factor(ridgeline_data$Group)
##Plot the Combined Ridgeline Panel
ggplot(ridgeline_data, aes(x = MALOCCLUSION, y = Group, fill = Group)) +
  geom_density_ridges(
    stat = "binline",
    binwidth = 1,
    scale = 0.9,
    alpha = 0.7,
    draw_baseline = FALSE,
    color = "white"
  ) +
  facet_wrap( ~ Variable, scales = "free_y") +
  theme_ridges(center_axis_labels = TRUE) +
  labs(title = "Distribution of Malocclusion Types by Gender, Ethnicity, and Treatment", x = "Malocclusion Type", y = "Group") +
  theme(
    strip.text = element_text(face = "bold", size = 12),
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.text.x = element_text(
      size = 8,
      angle = 45,
      hjust = 1
    )  # 👈 Make X-axis labels smaller
  )

##Violin plot

# Load required libraries
library(ggplot2)
library(dplyr)
##Mode Function
# Mode calculation function
get_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
# Interpretation function
interpret_violin_stats <- function(df, group_var, group_label = "Group") {
  df %>%
    filter(!is.na({{ group_var }}), !is.na(MALOCCLUSION)) %>%
    mutate(Malocclusion_Ordinal = as.numeric(factor(MALOCCLUSION))) %>%
    group_by(Group = {{ group_var }}) %>%
    summarise(
      n = n(),
      Mean = round(mean(Malocclusion_Ordinal), 2),
      Median = round(median(Malocclusion_Ordinal), 2),
      Mode = get_mode(Malocclusion_Ordinal),
      SD = round(sd(Malocclusion_Ordinal), 2),
      .groups = "drop"
    ) %>%
    mutate(
      Interpretation = paste0(
        "In ",
        group_label,
        " '",
        Group,
        "': ",
        "most common malocclusion class is ",
        Mode,
        " (mode), ",
        "with a median of ",
        Median,
        " and mean of ",
        Mean,
        ". ",
        "Standard deviation is ",
        SD,
        " (",
        ifelse(
          SD < 0.5,
          "low spread",
          ifelse(SD < 1, "moderate spread", "high variability")
        ),
        "); based on n = ",
        n,
        " cases."
      )
    ) %>%
    pull(Interpretation) %>%
    paste(collapse = "\n\n")
}

# Gender Interpretation
gender_interpret <- interpret_violin_stats(data, GENDER, "Gender")
cat("🔍 Gender-wise Interpretation:\n", gender_interpret, "\n\n")
## 🔍 Gender-wise Interpretation:
##  In Gender 'F': most common malocclusion class is 1 (mode), with a median of 2 and mean of 3.32. Standard deviation is 2.58 (high variability); based on n = 304 cases.
## 
## In Gender 'M': most common malocclusion class is 1 (mode), with a median of 2.5 and mean of 3.71. Standard deviation is 2.71 (high variability); based on n = 296 cases.
# Treatment Interpretation
treatment_interpret <- interpret_violin_stats(data, TREATMENT, "Treatment")
cat("🔍 Treatment-wise Interpretation:\n",
    treatment_interpret,
    "\n\n")
## 🔍 Treatment-wise Interpretation:
##  In Treatment 'FM ': most common malocclusion class is 1 (mode), with a median of 2 and mean of 3.4. Standard deviation is 2.6 (high variability); based on n = 573 cases.
## 
## In Treatment 'ORTHO SURGICAL': most common malocclusion class is 6 (mode), with a median of 6 and mean of 5.89. Standard deviation is 2.52 (high variability); based on n = 27 cases.
# Ethnicity Interpretation
ethnicity_interpret <- interpret_violin_stats(data, ETHNICITY, "Ethnicity")
cat("🔍 Ethnicity-wise Interpretation:\n",
    ethnicity_interpret,
    "\n")
## 🔍 Ethnicity-wise Interpretation:
##  In Ethnicity 'EAST INDIAN ': most common malocclusion class is 1 (mode), with a median of 2 and mean of 3.18. Standard deviation is 2.51 (high variability); based on n = 57 cases.
## 
## In Ethnicity 'NORTH EAST INDIAN ': most common malocclusion class is 1 (mode), with a median of 1 and mean of 2.67. Standard deviation is 2.58 (high variability); based on n = 6 cases.
## 
## In Ethnicity 'NORTH INDIAN ': most common malocclusion class is 1 (mode), with a median of 2 and mean of 3.57. Standard deviation is 2.68 (high variability); based on n = 382 cases.
## 
## In Ethnicity 'SOUTH INDIAN ': most common malocclusion class is 1 (mode), with a median of 2 and mean of 3.17. Standard deviation is 2.46 (high variability); based on n = 81 cases.
## 
## In Ethnicity 'WEST INDIAN ': most common malocclusion class is 1 (mode), with a median of 4 and mean of 3.92. Standard deviation is 2.77 (high variability); based on n = 74 cases.
##Violin Plot with Interpretation Underneath
library(gridExtra)
library(grid)
library(ggplot2)

shared_theme <- theme_classic() +
  theme(
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    plot.title = element_text(size = 8, face = "bold", hjust = 0.5),
    legend.title = element_blank(),
    # Remove legend titles
    legend.text = element_text(size = 8)
  )

##Gender Plot with Mode
# Prepare data
data_gender <- data %>%
  filter(!is.na(GENDER), !is.na(MALOCCLUSION)) %>%
  mutate(Malocclusion_Ordinal = as.numeric(factor(MALOCCLUSION)))

# Calculate mode per gender
mode_gender <- data_gender %>%
  group_by(GENDER) %>%
  summarise(Mode = get_mode(Malocclusion_Ordinal), .groups = "drop")

# Plot with mode marker
violing <- ggplot(data_gender,
                  aes(x = GENDER, y = Malocclusion_Ordinal, fill = GENDER)) +
  geom_violin(trim = FALSE) +
  geom_point(
    data = mode_gender,
    aes(x = GENDER, y = Mode),
    shape = 17,
    size = 3,
    color = "red",
    inherit.aes = FALSE
  ) +
  labs(title = "Malocclusion by Gender", y = "Malocclusion (Ordinal)", x = "Gender") +
  shared_theme

## Treatment Plot with Mode
data_treat <- data %>%
  filter(!is.na(TREATMENT), !is.na(MALOCCLUSION)) %>%
  mutate(Malocclusion_Ordinal = as.numeric(factor(MALOCCLUSION)))

mode_treat <- data_treat %>%
  group_by(TREATMENT) %>%
  summarise(Mode = get_mode(Malocclusion_Ordinal), .groups = "drop")

violint <- ggplot(data_treat,
                  aes(x = TREATMENT, y = Malocclusion_Ordinal, fill = TREATMENT)) +
  geom_violin(trim = FALSE) +
  geom_point(
    data = mode_treat,
    aes(x = TREATMENT, y = Mode),
    shape = 17,
    size = 3,
    color = "red",
    inherit.aes = FALSE
  ) +
  labs(title = "Malocclusion by Treatment", y = "Malocclusion (Ordinal)", x = "Treatment") +
  shared_theme
## Ethnicity Plot with Facets and Mode Markers
data_eth <- data %>%
  filter(!is.na(ETHNICITY), !is.na(MALOCCLUSION)) %>%
  mutate(Malocclusion_Ordinal = as.numeric(factor(MALOCCLUSION)))

mode_eth <- data_eth %>%
  group_by(ETHNICITY) %>%
  summarise(Mode = get_mode(Malocclusion_Ordinal), .groups = "drop")

shared_theme_small_facet <- shared_theme +
  theme(
    strip.text = element_text(size = 4, face = "bold", angle = 0),
    strip.background = element_rect(fill = "gray95", color = "black")
  )  # Reduce facet label font size here

violine <- ggplot(data_eth,
                  aes(x = ETHNICITY, y = Malocclusion_Ordinal, fill = ETHNICITY)) +
  geom_violin(trim = FALSE) +
  geom_point(
    data = mode_eth,
    aes(x = ETHNICITY, y = Mode),
    shape = 17,
    size = 2.5,
    color = "red",
    inherit.aes = FALSE
  ) +
  labs(title = "Malocclusion by Ethnicity", y = "Malocclusion (Ordinal)", x = "Ethnicity") +
  facet_wrap( ~ ETHNICITY, scales = "free_x") +
  shared_theme_small_facet

##Compute Interpretation
# Assuming your interpretation function from earlier is already defined:
gender_interpret <- interpret_violin_stats(data, GENDER, "Gender")
treat_interpret <- interpret_violin_stats(data, TREATMENT, "Treatment")
ethn_interpret <- interpret_violin_stats(data, GENDER, "Ethnicity")

# Make the text into a grob (graphical object)
gender_text <- textGrob(
  label = gender_interpret,
  gp = gpar(fontsize = 9, fontface = "italic"),
  x = 0,
  hjust = 0,
  just = "left"
)
grid.arrange(
  violing,
  # Your ggplot violin plot
  gender_text,
  # Your interpretation
  ncol = 1,
  heights = c(3, 1.2)  # Adjust spacing ratio
)

treat_text <- textGrob(
  label = interpret_violin_stats(data, TREATMENT, "Treatment"),
  gp = gpar(fontsize = 9, fontface = "italic"),
  x = 0,
  hjust = 0
)
grid.arrange(violint,
             treat_text,
             ncol = 1,
             heights = c(3, 1.2))

# Ethnicity (may need tighter height for many facets)
eth_text <- textGrob(
  label = interpret_violin_stats(data, ETHNICITY, "Ethnicity"),
  gp = gpar(fontsize = 9, fontface = "italic"),
  x = 0,
  hjust = 0
)
grid.arrange(violine,
             eth_text,
             ncol = 1,
             heights = c(3.5, 1.5))

##Display All Plots
print(violing)

print(violint)

print(violine)

# Load required libraries
library(ggplot2)
library(dplyr)

# Consistent theme adjustments
shared_theme <- theme_classic() +
  theme(
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8),
    plot.title = element_text(size = 8, face = "bold", hjust = 0.5),
    legend.title = element_blank(),
    # Remove legend titles
    legend.text = element_text(size = 8)
  )

# GENDER plot
violing <- ggplot(data, aes(
  x = GENDER,
  y = as.numeric(factor(MALOCCLUSION)),
  fill = GENDER
)) +
  geom_violin(trim = FALSE) +
  labs(title = "Malocclusion by Gender", y = "Malocclusion(Ordinal)", x = "Gender") +
  shared_theme

# TREATMENT plot
violint <- ggplot(data, aes(
  x = TREATMENT,
  y = as.numeric(factor(MALOCCLUSION)),
  fill = TREATMENT
)) +
  geom_violin(trim = FALSE) +
  labs(title = "Malocclusion by Treatment", y = "Malocclusion(Ordinal)", x = "Treatment") +
  shared_theme

# ETHNICITY plot with facet wrap
# Assumption: You want to split across individual ethnic subgroups
# Extend shared theme for smaller facet strip text
shared_theme_small_facet <- shared_theme +
  theme(
    strip.text = element_text(size = 4, face = "bold", angle = 0),
    strip.background = element_rect(fill = "gray95", color = "black")
  )  # Reduce facet label font size here

# Updated ETHNICITY violin plot with reduced facet text
violine <- data %>%
  filter(!is.na(ETHNICITY)) %>%
  ggplot(aes(
    x = ETHNICITY,
    y = as.numeric(factor(MALOCCLUSION)),
    fill = ETHNICITY
  )) +
  geom_violin(trim = FALSE) +
  labs(title = "Malocclusion by Ethnicity", y = "Malocclusion (Ordinal)", x = "Ethnicity") +
  facet_wrap( ~ ETHNICITY, scales = "free_x") +
  shared_theme_small_facet

# Display
print(violine)

print(violing)

print(violint)

##Plot 1: Distribution of Malocclusion by Ethnicity and Gender
ggplot(data, aes(x = MALOCCLUSION, fill = GENDER)) +
  geom_bar(position = "dodge") +
  facet_wrap( ~ ETHNICITY) +
  labs(title = "Malocclusion Distribution by Ethnicity and Gender", x = "Malocclusion Type", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 5))  # Reduce font size

## Plot 2: Diagnostic Plot – Gender vs Malocclusion
ggplot(data, aes(x = GENDER, fill = MALOCCLUSION)) +
  geom_bar(position = "fill") +
  labs(title = "Proportional Malocclusion Distribution by Gender", y = "Proportion", x = "Gender") +
  theme_minimal()

##Plot 3: Malocclusion Types by Ethnicity
ggplot(data, aes(x = ETHNICITY, fill = MALOCCLUSION)) +
  geom_bar(position = "fill") +
  labs(title = "Malocclusion Types by Ethnicity", y = "Proportion", x = "Ethnicity") +
  theme_minimal()

##Plot 4: Malocclusion Types by Gender
ggplot(data, aes(x = GENDER, fill = MALOCCLUSION)) +
  geom_bar(position = "fill") +
  labs(title = "Malocclusion Types by Gender", y = "Proportion", x = "Gender") +
  theme_minimal()

##Plot 5: Gender vs Ethnicity Crossplot
ggplot(data, aes(x = ETHNICITY, fill = GENDER)) +
  geom_bar(position = "dodge") +
  labs(title = "Gender Distribution across Ethnicities", x = "Ethnicity", y = "Count") +
  theme_minimal()

# Load libraries
library(cluster)
library(factoextra)
library(dplyr)

# Step 1: Load and clean data
data <- read.csv("ANTHRO.csv")
colnames(data) <- trimws(colnames(data))  # Remove stray spaces
data$MALOCCLUSION <- as.factor(data$MALOCCLUSION)
data$GENDER <- as.factor(data$GENDER)
data$ETHNICITY <- as.factor(data$ETHNICITY)

# Step 2: Prepare only clustering variables
# Convert to dummy variables (numerical encoding) for scaling
data_clust <- data %>%
  select(MALOCCLUSION, GENDER, ETHNICITY) %>%
  mutate_all(as.factor) %>%
  model.matrix( ~ . - 1, data = .) %>%  # One-hot encoding, no intercept
  as.data.frame()

# Step 3: Gower distance is ideal for mixed data, but since we now have numeric dummies,
# Euclidean distance can also be used for visualization clustering
# Step 3: Compute distance (Euclidean or Gower)
dist_matrix <- dist(scale(data_clust))

# Step 4: Apply PAM clustering
pam_fit <- pam(dist_matrix, k = 3)
# Step 5: Generate Principal Coordinates (MDS)
pc_coords <- cmdscale(dist_matrix, k = 2)  # 2D representation
# Combine cluster assignments with coordinates
plot_data <- data.frame(pc_coords, cluster = as.factor(pam_fit$clustering))

# Now plot manually
ggplot(plot_data, aes(x = X1, y = X2, color = cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Clusters overlaid on Classical MDS (cmdscale)", x = "MDS Dimension 1", y = "MDS Dimension 2") +
  theme_minimal() +
  scale_color_brewer(palette = "Set2")

data$CLUSTER <- as.factor(pam_fit$clustering)

# View distribution by cluster
table(data$CLUSTER, data$MALOCCLUSION)
##    
##     CL I  CL I BIMAX  CL I DIV I CL II  CL II BIMAX CL II DIV I CL II DIV II
##   1   122          26          5     11           3           0            7
##   2     0           0          0      0           0         175            0
##   3   146          23          2     18           0           0           10
##    
##     CL III  CLP
##   1      32   2
##   2       0   0
##   3      16   2
table(data$CLUSTER, data$ETHNICITY)
##    
##     EAST INDIAN  NORTH EAST INDIAN  NORTH INDIAN  SOUTH INDIAN  WEST INDIAN 
##   1           17                  1           129            33           28
##   2           15                  2           119            17           22
##   3           25                  3           134            31           24
table(data$CLUSTER, data$TREATMENT)
##    
##     FM  ORTHO SURGICAL
##   1 196             12
##   2 166              9
##   3 211              6
table(data$CLUSTER, data$GENDER)
##    
##       F   M
##   1   0 208
##   2  87  88
##   3 217   0
##Assign Cluster Labels to Data
# Add cluster labels to original dataset
data$CLUSTER <- as.factor(pam_fit$clustering)
##Summary Table for Each Variable by Cluster
# Load necessary package
library(dplyr)

# Summary counts
summary_table <- data %>%
  group_by(CLUSTER) %>%
  summarise(
    Count = n(),
    Most_Common_Gender = names(which.max(table(GENDER))),
    Most_Common_Ethnicity = names(which.max(table(ETHNICITY))),
    Most_Common_Malocclusion = names(which.max(table(MALOCCLUSION)))
  )

print(summary_table)
## # A tibble: 3 × 5
##   CLUSTER Count Most_Common_Gender Most_Common_Ethnicity Most_Common_Malocclus…¹
##   <fct>   <int> <chr>              <chr>                 <chr>                  
## 1 1         208 M                  "NORTH INDIAN "       "CL I "                
## 2 2         175 M                  "NORTH INDIAN "       "CL II DIV I"          
## 3 3         217 F                  "NORTH INDIAN "       "CL I "                
## # ℹ abbreviated name: ¹​Most_Common_Malocclusion
##Full Cross Tabulations (Proportions)
# Gender by Cluster
cat("\n▶ Gender Distribution by Cluster:\n")
## 
## ▶ Gender Distribution by Cluster:
print(prop.table(table(data$CLUSTER, data$GENDER), margin = 1))
##    
##             F         M
##   1 0.0000000 1.0000000
##   2 0.4971429 0.5028571
##   3 1.0000000 0.0000000
# Ethnicity by Cluster
cat("\n▶ Ethnicity Distribution by Cluster:\n")
## 
## ▶ Ethnicity Distribution by Cluster:
print(prop.table(table(data$CLUSTER, data$ETHNICITY), margin = 1))
##    
##     EAST INDIAN  NORTH EAST INDIAN  NORTH INDIAN  SOUTH INDIAN  WEST INDIAN 
##   1  0.081730769        0.004807692   0.620192308   0.158653846  0.134615385
##   2  0.085714286        0.011428571   0.680000000   0.097142857  0.125714286
##   3  0.115207373        0.013824885   0.617511521   0.142857143  0.110599078
# Malocclusion by Cluster
cat("\n▶ Malocclusion Types by Cluster:\n")
## 
## ▶ Malocclusion Types by Cluster:
print(prop.table(table(data$CLUSTER, data$MALOCCLUSION), margin = 1))
##    
##           CL I  CL I BIMAX   CL I DIV I      CL II  CL II BIMAX CL II DIV I
##   1 0.586538462 0.125000000 0.024038462 0.052884615 0.014423077 0.000000000
##   2 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 1.000000000
##   3 0.672811060 0.105990783 0.009216590 0.082949309 0.000000000 0.000000000
##    
##     CL II DIV II     CL III          CLP
##   1  0.033653846 0.153846154 0.009615385
##   2  0.000000000 0.000000000 0.000000000
##   3  0.046082949 0.073732719 0.009216590
##Smart Verbal Interpretation (Auto-Narration)
for (k in levels(data$CLUSTER)) {
  sub <- data[data$CLUSTER == k, ]
  cat("\n🧠 Cluster", k, "Summary:\n")
  cat(" - Total Individuals:", nrow(sub), "\n")
  cat(" - Most common Gender:", names(which.max(table(sub$GENDER))), "\n")
  cat(" - Most common Ethnicity:", names(which.max(table(sub$ETHNICITY))), "\n")
  cat(" - Most common Malocclusion:", names(which.max(table(sub$MALOCCLUSION))), "\n")
}
## 
## 🧠 Cluster 1 Summary:
##  - Total Individuals: 208 
##  - Most common Gender: M 
##  - Most common Ethnicity: NORTH INDIAN  
##  - Most common Malocclusion: CL I  
## 
## 🧠 Cluster 2 Summary:
##  - Total Individuals: 175 
##  - Most common Gender: M 
##  - Most common Ethnicity: NORTH INDIAN  
##  - Most common Malocclusion: CL II DIV I 
## 
## 🧠 Cluster 3 Summary:
##  - Total Individuals: 217 
##  - Most common Gender: F 
##  - Most common Ethnicity: NORTH INDIAN  
##  - Most common Malocclusion: CL I
##Silhouette Widths Table
library(cluster)
library(factoextra)

# View silhouette widths
sil <- silhouette(pam_fit)

# Convert to data frame
sil_df <- as.data.frame(sil[, 1:3])
colnames(sil_df) <- c("Cluster", "Neighbor", "Sil_Width")

# Summary table: average silhouette width per cluster
sil_summary <- aggregate(Sil_Width ~ Cluster, data = sil_df, mean)
colnames(sil_summary) <- c("Cluster", "Average_Silhouette_Width")

print(sil_summary)
##   Cluster Average_Silhouette_Width
## 1       1               0.07997759
## 2       2               0.48919933
## 3       3               0.16275523
##Plot Silhouette Widths
# Plot silhouette
fviz_silhouette(
  sil,
  palette = "Set2",
  label = FALSE,
  # Disable row labels
  print.summary = TRUE,
  # Show summary at bottom
  ggtheme = theme_minimal()
) +
  labs(title = "Silhouette Plot for PAM Clustering", x = "Silhouette Width", y = "Cluster") +
  theme(
    axis.text.y = element_blank(),
    # Remove y-axis text
    axis.ticks.y = element_blank(),
    # Remove y-axis ticks
    panel.grid.major.y = element_blank(),
    # Remove horizontal gridlines
    panel.grid.minor = element_blank()
  )
##   cluster size ave.sil.width
## 1       1  208          0.08
## 2       2  175          0.49
## 3       3  217          0.16

##Chi-Square Tests (Trait vs Cluster)
# Gender vs Cluster
cat("Chi-Square Test: Gender ~ Cluster\n")
## Chi-Square Test: Gender ~ Cluster
print(chisq.test(table(data$GENDER, data$CLUSTER)))
## 
##  Pearson's Chi-squared test
## 
## data:  table(data$GENDER, data$CLUSTER)
## X-squared = 424.97, df = 2, p-value < 2.2e-16
# Ethnicity vs Cluster
cat("\nChi-Square Test: Ethnicity ~ Cluster\n")
## 
## Chi-Square Test: Ethnicity ~ Cluster
print(chisq.test(table(data$ETHNICITY, data$CLUSTER)))
## Warning in chisq.test(table(data$ETHNICITY, data$CLUSTER)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(data$ETHNICITY, data$CLUSTER)
## X-squared = 6.448, df = 8, p-value = 0.5972
# Malocclusion vs Cluster
cat("\nChi-Square Test: Malocclusion ~ Cluster\n")
## 
## Chi-Square Test: Malocclusion ~ Cluster
print(chisq.test(table(data$MALOCCLUSION, data$CLUSTER)))
## Warning in chisq.test(table(data$MALOCCLUSION, data$CLUSTER)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(data$MALOCCLUSION, data$CLUSTER)
## X-squared = 619.75, df = 16, p-value < 2.2e-16
##Overlay of Clusters on CMDscale (PCA-style)
# Classical multidimensional scaling from Gower dissimilarity
cmd <- cmdscale(dist_matrix, k = 3)
cmd_df <- data.frame(
  Dim1 = cmd[, 1],
  Dim2 = cmd[, 2],
  Dim3 = cmd[, 3],
  Cluster = as.factor(pam_fit$clustering)
)

# Plot
library(ggplot2)

ggplot(cmd_df, aes(x = Dim1, y = Dim2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "PAM Clustering (Gower Distance) - CMDscale Projection", x = "Component 1", y = "Component 2") +
  scale_color_brewer(palette = "Set2") +
  theme_minimal()

# --- Load Libraries ---
library(BayesFactor)
library(dplyr)
library(ggplot2)

# --- Load and Clean Data ---
data <- read.csv("ANTHRO.csv")
colnames(data) <- trimws(colnames(data))  # Remove trailing/leading spaces

# Convert to factors
data$MALOCCLUSION <- as.factor(data$MALOCCLUSION)
data$ETHNICITY <- as.factor(data$ETHNICITY)
data$GENDER <- as.factor(data$GENDER)

# --- Create Proper Contingency Tables ---
malocclusion_ethnicity_table <- table(data$MALOCCLUSION, data$ETHNICITY)
malocclusion_gender_table <- table(data$MALOCCLUSION, data$GENDER)
malocclusion_treat_table <- table(data$MALOCCLUSION, data$TREATMENT)
# --- Bayesian Contingency Table Analysis ---

## Malocclusion vs Ethnicity
bf_ethnicity <- contingencyTableBF(
  as.matrix(malocclusion_ethnicity_table),
  sampleType = "jointMulti"  # corrected sampleType
)
print("Bayes Factor (Malocclusion vs Ethnicity):")
## [1] "Bayes Factor (Malocclusion vs Ethnicity):"
print(bf_ethnicity)
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 8.479476e-20 ±0%
## 
## Against denominator:
##   Null, independence, a = 1 
## ---
## Bayes factor type: BFcontingencyTable, joint multinomial
## Malocclusion vs Gender
bf_gender <- contingencyTableBF(
  as.matrix(malocclusion_gender_table),
  sampleType = "jointMulti"  # corrected sampleType
)
print("Bayes Factor (Malocclusion vs Gender):")
## [1] "Bayes Factor (Malocclusion vs Gender):"
print(bf_gender)
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 0.0002191475 ±0%
## 
## Against denominator:
##   Null, independence, a = 1 
## ---
## Bayes factor type: BFcontingencyTable, joint multinomial
## Malocclusion vs Treatment
bf_treat <- contingencyTableBF(
  as.matrix(malocclusion_treat_table),
  sampleType = "jointMulti"  # corrected sampleType
)
print("Bayes Factor (Malocclusion vs Treatment):")
## [1] "Bayes Factor (Malocclusion vs Treatment):"
print(bf_treat)
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 0.0485917 ±0%
## 
## Against denominator:
##   Null, independence, a = 1 
## ---
## Bayes factor type: BFcontingencyTable, joint multinomial
# --- Interpretation Function ---
interpret_bayes_factor <- function(bf_value) {
  if (bf_value < 1 / 10) {
    return("Strong evidence for the null hypothesis (no association)")
  } else if (bf_value < 1 / 3) {
    return("Moderate evidence for the null hypothesis")
  } else if (bf_value < 1) {
    return("Anecdotal evidence for the null hypothesis")
  } else if (bf_value < 3) {
    return("Anecdotal evidence for the alternative hypothesis")
  } else if (bf_value < 10) {
    return("Moderate evidence for the alternative hypothesis")
  } else {
    return("Strong evidence for the alternative hypothesis (association exists)")
  }
}

# --- Extract Bayes Factor Values ---
bf_ethnicity_value <- as.numeric(bf_ethnicity@bayesFactor$bf)
bf_gender_value <- as.numeric(bf_gender@bayesFactor$bf)
bf_treat_value <- as.numeric(bf_treat@bayesFactor$bf)

# --- Print & Interpret Results ---
cat("Bayes Factor (Malocclusion × Ethnicity):",
    round(bf_ethnicity_value, 3),
    "\n")
## Bayes Factor (Malocclusion × Ethnicity): -43.914
cat("Interpretation:",
    interpret_bayes_factor(bf_ethnicity_value),
    "\n\n")
## Interpretation: Strong evidence for the null hypothesis (no association)
cat("Bayes Factor (Malocclusion × Gender):",
    round(bf_gender_value, 3),
    "\n")
## Bayes Factor (Malocclusion × Gender): -8.426
cat("Interpretation:",
    interpret_bayes_factor(bf_gender_value),
    "\n\n")
## Interpretation: Strong evidence for the null hypothesis (no association)
cat("Bayes Factor (Malocclusion × Treatment):",
    round(bf_treat_value, 3),
    "\n")
## Bayes Factor (Malocclusion × Treatment): -3.024
cat("Interpretation:",
    interpret_bayes_factor(bf_treat_value),
    "\n\n")
## Interpretation: Strong evidence for the null hypothesis (no association)
# Step 2: Bayesian analysis
table_ethnicity <- table(data$MALOCCLUSION, data$ETHNICITY)
bf_ethnicity <- contingencyTableBF(as.matrix(table_ethnicity), sampleType = "jointMulti")

# Step 3: Extract posterior samples
posterior_matrix <- posterior(bf_ethnicity, iterations = 10000)

# Debug: Confirm it's a matrix and has content
str(posterior_matrix)  # Should show num [1:10000, 1]
## Formal class 'BFmcmc' [package "BayesFactor"] with 3 slots
##   ..@ model   :Formal class 'BFcontingencyTable' [package "BayesFactor"] with 8 slots
##   .. .. ..@ type      : chr "joint multinomial"
##   .. .. ..@ identifier:List of 1
##   .. .. .. ..$ formula: chr "non-independence"
##   .. .. ..@ prior     :List of 1
##   .. .. .. ..$ a: num 1
##   .. .. ..@ dataTypes : chr(0) 
##   .. .. ..@ shortName : chr "Non-indep. (a=1)"
##   .. .. ..@ longName  : chr "Alternative, non-independence, a = 1"
##   .. .. ..@ analysis  :List of 1
##   .. .. .. ..$ method: chr "analytic"
##   .. .. ..@ version   : chr "0.9.12-4.7"
##   ..@ data    :'data.frame': 9 obs. of  5 variables:
##   .. ..$ V1: int [1:9] 28 5 0 4 0 15 3 2 0
##   .. ..$ V2: int [1:9] 4 0 0 0 0 2 0 0 0
##   .. ..$ V3: int [1:9] 174 24 5 14 1 119 10 34 1
##   .. ..$ V4: int [1:9] 32 17 1 5 2 17 1 5 1
##   .. ..$ V5: int [1:9] 30 3 1 6 0 22 3 7 2
##   ..@ .S3Class: chr "mcmc"
##   ..$ dim     : int [1:2] 10000 45
##   ..$ dimnames:List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:45] "pi[1,1]" "pi[2,1]" "pi[3,1]" "pi[4,1]" ...
##   ..$ mcpar   : num [1:3] 1 10000 1
head(posterior_matrix)
## Markov Chain Monte Carlo (MCMC) output:
## Start = 1 
## End = 7 
## Thinning interval = 1 
##         pi[1,1]     pi[2,1]      pi[3,1]     pi[4,1]      pi[5,1]    pi[6,1]
## [1,] 0.04467075 0.007232017 0.0023789760 0.012598443 0.0004683903 0.01487399
## [2,] 0.04743539 0.002715842 0.0005012635 0.006785004 0.0048066252 0.02614559
## [3,] 0.04546815 0.007595841 0.0017085808 0.004684165 0.0028439592 0.03876142
## [4,] 0.04500886 0.007133561 0.0004829695 0.005557561 0.0007833725 0.02319275
## [5,] 0.04619606 0.002560562 0.0012291396 0.005151500 0.0008656031 0.01263938
## [6,] 0.05555378 0.012455859 0.0002783344 0.009474202 0.0022412642 0.02148199
## [7,] 0.03823097 0.007780329 0.0008923808 0.016145536 0.0011045555 0.03043249
##          pi[7,1]     pi[8,1]      pi[9,1]     pi[1,2]      pi[2,2]      pi[3,2]
## [1,] 0.008926226 0.003726529 0.0011858543 0.009350448 0.0016795427 0.0004915563
## [2,] 0.003015541 0.004552194 0.0004388486 0.006567114 0.0054726059 0.0006899992
## [3,] 0.015168856 0.003803358 0.0020845446 0.004397231 0.0004341208 0.0017142058
## [4,] 0.005506818 0.004061765 0.0011150372 0.009321570 0.0014015517 0.0040546745
## [5,] 0.004150922 0.011229820 0.0027713201 0.013435885 0.0002693300 0.0051651008
## [6,] 0.001126521 0.004809876 0.0010888151 0.005944310 0.0004707878 0.0018716697
## [7,] 0.002908096 0.011704367 0.0003162289 0.005824144 0.0065512502 0.0090160697
##           pi[4,2]      pi[5,2]      pi[6,2]      pi[7,2]      pi[8,2]
## [1,] 0.0002201978 8.481469e-05 0.0033479154 0.0001134706 0.0007435003
## [2,] 0.0028777809 2.057532e-03 0.0017907007 0.0005664103 0.0010736300
## [3,] 0.0010239330 1.105793e-03 0.0077407782 0.0005849702 0.0007688038
## [4,] 0.0006018412 1.591925e-04 0.0019353897 0.0015552415 0.0027936634
## [5,] 0.0029956089 2.114368e-03 0.0048124918 0.0009678965 0.0001278214
## [6,] 0.0004157191 2.399204e-03 0.0009828319 0.0011439222 0.0007276862
## [7,] 0.0005555079 5.346976e-03 0.0034186886 0.0030033650 0.0002779343
##           pi[9,2]   pi[1,3]    pi[2,3]     pi[3,3]    pi[4,3]      pi[5,3]
## [1,] 2.428626e-03 0.2816386 0.03534155 0.004801879 0.01931275 0.0056096543
## [2,] 1.135811e-05 0.2238387 0.05156628 0.009530673 0.02220307 0.0029612366
## [3,] 1.094363e-03 0.2546263 0.03264734 0.014809648 0.03179104 0.0008538234
## [4,] 9.033403e-05 0.2757067 0.04720699 0.018017844 0.02022593 0.0080481739
## [5,] 1.041153e-03 0.2786867 0.03908407 0.009098981 0.02587536 0.0008143451
## [6,] 1.652050e-03 0.2738784 0.04998194 0.009313980 0.02121528 0.0021367294
## [7,] 1.925333e-05 0.2794057 0.04805363 0.011098237 0.02246062 0.0048027341
##        pi[6,3]    pi[7,3]    pi[8,3]      pi[9,3]    pi[1,4]    pi[2,4]
## [1,] 0.1971533 0.01262212 0.06177814 0.0006517831 0.05102318 0.03085498
## [2,] 0.1824437 0.01820069 0.07048601 0.0031590004 0.05764337 0.03434490
## [3,] 0.1798728 0.02430852 0.06142698 0.0074964575 0.04209938 0.02553727
## [4,] 0.1710431 0.02927359 0.04142154 0.0142563392 0.06133450 0.02249175
## [5,] 0.1742367 0.01569367 0.06086296 0.0011261391 0.06794239 0.02487571
## [6,] 0.1596931 0.01035464 0.05746156 0.0033300064 0.04974848 0.03292671
## [7,] 0.1788195 0.01896988 0.04829157 0.0047163114 0.05121993 0.01511000
##          pi[3,4]     pi[4,4]     pi[5,4]    pi[6,4]      pi[7,4]     pi[8,4]
## [1,] 0.002265613 0.011608344 0.005215125 0.02500129 0.0061607972 0.002624438
## [2,] 0.007943078 0.012106542 0.019387883 0.02131933 0.0030713520 0.011580553
## [3,] 0.002348865 0.013183351 0.010923221 0.03161693 0.0025046267 0.010246693
## [4,] 0.001790400 0.005288363 0.002006238 0.02288664 0.0004471323 0.011023235
## [5,] 0.004364747 0.004607334 0.010035510 0.02478762 0.0041417892 0.005445511
## [6,] 0.008635012 0.008107757 0.002345627 0.02059213 0.0050448488 0.011619843
## [7,] 0.001152507 0.007951308 0.005196716 0.02168162 0.0039974981 0.008314156
##           pi[9,4]    pi[1,5]     pi[2,5]      pi[3,5]     pi[4,5]      pi[5,5]
## [1,] 0.0034822698 0.04674759 0.007275792 0.0034581308 0.017643232 8.143187e-05
## [2,] 0.0006202369 0.06491231 0.003306842 0.0037528251 0.006369522 3.191055e-03
## [3,] 0.0056574016 0.03894929 0.004316455 0.0019365668 0.015337722 7.394475e-04
## [4,] 0.0177499421 0.04542248 0.005225127 0.0027440702 0.010650927 1.165991e-03
## [5,] 0.0089812130 0.04527892 0.004109269 0.0020604390 0.007992444 1.028095e-03
## [6,] 0.0020906841 0.04799866 0.005108444 0.0027355110 0.009096453 1.885871e-03
## [7,] 0.0027224356 0.05675042 0.003757187 0.0004477821 0.010277563 1.598320e-03
##         pi[6,5]     pi[7,5]     pi[8,5]     pi[9,5]
## [1,] 0.02788295 0.004832044 0.013446272 0.006965482
## [2,] 0.03024459 0.002756852 0.011106995 0.004449037
## [3,] 0.03148838 0.004524643 0.006224973 0.003548774
## [4,] 0.03285398 0.002487937 0.013411511 0.001063415
## [5,] 0.02875252 0.009192953 0.017458209 0.005742369
## [6,] 0.05192599 0.009646873 0.011981915 0.007024748
## [7,] 0.03204294 0.002873220 0.012972550 0.001787521
# Step 4: Convert to data frame with correct column name
library(dplyr)
posterior_df <- data.frame(posterior_matrix[, 1])

colnames(posterior_df) <- "effect_size"
# Debug: Confirm column name exists
print(names(posterior_df))  # Should print "effect_size"
## [1] "effect_size"
head(posterior_df)
##   effect_size
## 1  0.04467075
## 2  0.04743539
## 3  0.04546815
## 4  0.04500886
## 5  0.04619606
## 6  0.05555378
str(posterior_df)
## 'data.frame':    10000 obs. of  1 variable:
##  $ effect_size: num  0.0447 0.0474 0.0455 0.045 0.0462 ...
# Step 5: Plot posterior distribution (no missing aesthetics!)
ggplot(data = posterior_df, aes(x = effect_size)) +
  geom_density(fill = "skyblue",
               color = "black",
               alpha = 0.5) +
  geom_vline(
    xintercept = 0,
    linetype = "dashed",
    color = "red",
    linewidth = 1
  ) +
  labs(title = "Posterior Distribution of Ethnicity Effect on Malocclusion", x = "Log Odds Ratio (Effect Size)", y = "Density") +
  theme_minimal()

# Step 2: Bayesian analysis---gender
table_gender <- table(data$MALOCCLUSION, data$GENDER)
bf_gender <- contingencyTableBF(as.matrix(table_gender), sampleType = "jointMulti")

# Step 3: Extract posterior samples
posterior_matrixg <- posterior(bf_gender, iterations = 10000)

# Debug: Confirm it's a matrix and has content
str(posterior_matrixg)  # Should show num [1:10000, 1]
## Formal class 'BFmcmc' [package "BayesFactor"] with 3 slots
##   ..@ model   :Formal class 'BFcontingencyTable' [package "BayesFactor"] with 8 slots
##   .. .. ..@ type      : chr "joint multinomial"
##   .. .. ..@ identifier:List of 1
##   .. .. .. ..$ formula: chr "non-independence"
##   .. .. ..@ prior     :List of 1
##   .. .. .. ..$ a: num 1
##   .. .. ..@ dataTypes : chr(0) 
##   .. .. ..@ shortName : chr "Non-indep. (a=1)"
##   .. .. ..@ longName  : chr "Alternative, non-independence, a = 1"
##   .. .. ..@ analysis  :List of 1
##   .. .. .. ..$ method: chr "analytic"
##   .. .. ..@ version   : chr "0.9.12-4.7"
##   ..@ data    :'data.frame': 9 obs. of  2 variables:
##   .. ..$ V1: int [1:9] 146 23 2 18 0 87 10 16 2
##   .. ..$ V2: int [1:9] 122 26 5 11 3 88 7 32 2
##   ..@ .S3Class: chr "mcmc"
##   ..$ dim     : int [1:2] 10000 18
##   ..$ dimnames:List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:18] "pi[1,1]" "pi[2,1]" "pi[3,1]" "pi[4,1]" ...
##   ..$ mcpar   : num [1:3] 1 10000 1
head(posterior_matrixg)
## Markov Chain Monte Carlo (MCMC) output:
## Start = 1 
## End = 7 
## Thinning interval = 1 
##        pi[1,1]    pi[2,1]     pi[3,1]    pi[4,1]      pi[5,1]   pi[6,1]
## [1,] 0.2330422 0.03157631 0.006471548 0.02695446 0.0010469147 0.1263945
## [2,] 0.2281346 0.03797951 0.012873791 0.02991465 0.0024502254 0.1455859
## [3,] 0.2347534 0.04058812 0.010701276 0.03815796 0.0001395716 0.1370483
## [4,] 0.2300829 0.03379091 0.004014648 0.02239511 0.0016728411 0.1449671
## [5,] 0.2262169 0.04807099 0.003118728 0.03503294 0.0021305805 0.1155445
## [6,] 0.2070459 0.04651617 0.004291142 0.03375876 0.0005613322 0.1326892
## [7,] 0.2101271 0.03405134 0.004421892 0.02664506 0.0007345579 0.1424416
##         pi[7,1]    pi[8,1]     pi[9,1]   pi[1,2]    pi[2,2]     pi[3,2]
## [1,] 0.01559235 0.01991293 0.006990940 0.1769772 0.05285370 0.016418274
## [2,] 0.01441075 0.02986918 0.003429537 0.1788818 0.05101802 0.009581189
## [3,] 0.01652977 0.02531413 0.008396523 0.1933437 0.04852743 0.009172760
## [4,] 0.02153197 0.02866619 0.004474488 0.1933032 0.03236991 0.007181474
## [5,] 0.02293132 0.02550228 0.002396257 0.2126709 0.03903755 0.012010859
## [6,] 0.02037808 0.02913509 0.003727350 0.2371379 0.03357523 0.014547169
## [7,] 0.01810141 0.02539438 0.005328587 0.2207028 0.03580081 0.007415855
##         pi[4,2]     pi[5,2]   pi[6,2]     pi[7,2]    pi[8,2]     pi[9,2]
## [1,] 0.02186326 0.005440247 0.1852740 0.005632816 0.06372041 0.003837827
## [2,] 0.02453948 0.007895749 0.1574119 0.011278720 0.05157265 0.003172375
## [3,] 0.02367762 0.003944769 0.1379960 0.019012025 0.04931656 0.003380165
## [4,] 0.02191057 0.005253732 0.1534943 0.009331531 0.07708464 0.008474491
## [5,] 0.02284035 0.005822629 0.1580313 0.008510956 0.05570579 0.004425167
## [6,] 0.01922141 0.003655979 0.1346743 0.023278710 0.05345166 0.002354657
## [7,] 0.02110247 0.003345163 0.1819833 0.010551139 0.04516984 0.006682701
# Step 4: Convert to data frame with correct column name
library(dplyr)
posterior_dfg <- data.frame(posterior_matrixg[, 1])

colnames(posterior_dfg) <- "effect_sizeg"

# Debug: Confirm column name exists
print(names(posterior_dfg))  # Should print "effect_sizeg"
## [1] "effect_sizeg"
head(posterior_dfg)
##   effect_sizeg
## 1    0.2330422
## 2    0.2281346
## 3    0.2347534
## 4    0.2300829
## 5    0.2262169
## 6    0.2070459
str(posterior_dfg)
## 'data.frame':    10000 obs. of  1 variable:
##  $ effect_sizeg: num  0.233 0.228 0.235 0.23 0.226 ...
# Step 5: Plot posterior distribution (no missing aesthetics!)
ggplot(data = posterior_dfg, aes(x = effect_sizeg)) +
  geom_density(fill = "yellow",
               color = "black",
               alpha = 0.5) +
  geom_vline(
    xintercept = 0,
    linetype = "dashed",
    color = "brown",
    linewidth = 1
  ) +
  labs(title = "Posterior Distribution of Gender Effect on Malocclusion", x = "Log Odds Ratio (Effect Size)", y = "Density") +
  theme_minimal()

# Step 2: Bayesian analysis---treatment
table_treat <- table(data$MALOCCLUSION, data$TREATMENT)
bf_treat <- contingencyTableBF(as.matrix(table_treat), sampleType = "jointMulti")

# Step 3: Extract posterior samples
posterior_matrixt <- posterior(bf_treat, iterations = 10000)

# Debug: Confirm it's a matrix and has content
str(posterior_matrixt)  # Should show num [1:10000, 1]
## Formal class 'BFmcmc' [package "BayesFactor"] with 3 slots
##   ..@ model   :Formal class 'BFcontingencyTable' [package "BayesFactor"] with 8 slots
##   .. .. ..@ type      : chr "joint multinomial"
##   .. .. ..@ identifier:List of 1
##   .. .. .. ..$ formula: chr "non-independence"
##   .. .. ..@ prior     :List of 1
##   .. .. .. ..$ a: num 1
##   .. .. ..@ dataTypes : chr(0) 
##   .. .. ..@ shortName : chr "Non-indep. (a=1)"
##   .. .. ..@ longName  : chr "Alternative, non-independence, a = 1"
##   .. .. ..@ analysis  :List of 1
##   .. .. .. ..$ method: chr "analytic"
##   .. .. ..@ version   : chr "0.9.12-4.7"
##   ..@ data    :'data.frame': 9 obs. of  2 variables:
##   .. ..$ V1: int [1:9] 266 46 6 28 3 166 17 39 2
##   .. ..$ V2: int [1:9] 2 3 1 1 0 9 0 9 2
##   ..@ .S3Class: chr "mcmc"
##   ..$ dim     : int [1:2] 10000 18
##   ..$ dimnames:List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:18] "pi[1,1]" "pi[2,1]" "pi[3,1]" "pi[4,1]" ...
##   ..$ mcpar   : num [1:3] 1 10000 1
head(posterior_matrixt)
## Markov Chain Monte Carlo (MCMC) output:
## Start = 1 
## End = 7 
## Thinning interval = 1 
##        pi[1,1]    pi[2,1]     pi[3,1]    pi[4,1]     pi[5,1]   pi[6,1]
## [1,] 0.4711209 0.07726343 0.014932191 0.02988985 0.005488805 0.2646496
## [2,] 0.4322628 0.06078560 0.013412450 0.04691278 0.006431956 0.2833374
## [3,] 0.4196881 0.06839856 0.008312653 0.04051314 0.006920589 0.2769759
## [4,] 0.4603681 0.06012401 0.006354823 0.04131767 0.006403932 0.2546652
## [5,] 0.4740160 0.06323183 0.017067870 0.04612799 0.004994309 0.2544012
## [6,] 0.4385132 0.08903452 0.012908245 0.03759176 0.003891229 0.2496812
## [7,] 0.4393959 0.07234168 0.010703086 0.04714585 0.005230195 0.2673555
##         pi[7,1]    pi[8,1]     pi[9,1]     pi[1,2]     pi[2,2]      pi[3,2]
## [1,] 0.03645095 0.05391721 0.003816376 0.004413119 0.004530382 0.0008475143
## [2,] 0.02396227 0.06404609 0.002542070 0.005692074 0.010774689 0.0004645691
## [3,] 0.02990208 0.07301874 0.009053376 0.007857195 0.011280577 0.0050808576
## [4,] 0.03159295 0.06484240 0.002778602 0.002281714 0.016897970 0.0029947287
## [5,] 0.02542927 0.05625392 0.001579528 0.005590100 0.003497910 0.0032092719
## [6,] 0.04694701 0.06107346 0.005381577 0.001545576 0.004895429 0.0029257223
## [7,] 0.01687010 0.06836791 0.004943718 0.001502134 0.006732149 0.0010886928
##           pi[4,2]      pi[5,2]     pi[6,2]      pi[7,2]    pi[8,2]     pi[9,2]
## [1,] 0.0048557040 0.0000963384 0.015789803 0.0014379138 0.00592872 0.004571124
## [2,] 0.0039036782 0.0039295210 0.013381105 0.0009376535 0.02060105 0.006622242
## [3,] 0.0002914481 0.0034194993 0.017059243 0.0048567251 0.01373572 0.003635590
## [4,] 0.0044298934 0.0027952603 0.019133932 0.0012186127 0.01494430 0.006855906
## [5,] 0.0044009705 0.0003479374 0.016397245 0.0017045080 0.01848860 0.003261556
## [6,] 0.0045578593 0.0065048084 0.009747491 0.0038865738 0.01669670 0.004217666
## [7,] 0.0025428941 0.0011692156 0.030538621 0.0015416033 0.01745036 0.005080393
# Step 4: Convert to data frame with correct column name
library(dplyr)
posterior_dft <- data.frame(posterior_matrixt[, 1])

colnames(posterior_dft) <- "effect_sizet"

# Debug: Confirm column name exists
print(names(posterior_dft))  # Should print "effect_sizet"
## [1] "effect_sizet"
head(posterior_dft)
##   effect_sizet
## 1    0.4711209
## 2    0.4322628
## 3    0.4196881
## 4    0.4603681
## 5    0.4740160
## 6    0.4385132
str(posterior_dft)
## 'data.frame':    10000 obs. of  1 variable:
##  $ effect_sizet: num  0.471 0.432 0.42 0.46 0.474 ...
# Step 5: Plot posterior distribution (no missing aesthetics!)
ggplot(data = posterior_dft, aes(x = effect_sizet)) +
  geom_density(fill = "green", color = "black", alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "blue", linewidth = 1) +
  labs(
    title = "Posterior Distribution of Gender Effect on Treatment",
    x = "Log Odds Ratio (Effect Size)",
    y = "Density"
  ) +
  theme_minimal()

# Add labels for clarity
posterior_dfg$variable <- "Gender"
posterior_df$variable <- "Ethnicity"
posterior_dft$variable<- "Treatment"

# Combine into one dataframe
posterior_combined <- rbind(
  data.frame(effect_size = posterior_dfg$effect_sizeg, variable = "Gender"),
  data.frame(effect_size = posterior_df$effect_size, variable = "Ethnicity"),
  data.frame(effect_size = posterior_dft$effect_sizet, variable = "Treatment")
)

# Combined ggplot
ggplot(data = posterior_combined, aes(x = effect_size, fill = variable, color = variable)) +
  geom_density(alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 1) +
  labs(
    title = "Posterior Distribution: Gender vs Ethnicity Effect vs Treatment on Malocclusion",
    x = "Log Odds Ratio (Effect Size)",
    y = "Density",
    fill = "Group",
    color = "Group"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

## Bayesian Analysis

# Step 4: Bayesian Analysis
# Association between ETHNICITY and MALOCCLUSION
bf_ethnicity <- contingencyTableBF(
  table(data$ETHNICITY, data$MALOCCLUSION),
  sampleType = "indepMulti",
  fixedMargin = "rows"
)

# Association between GENDER and MALOCCLUSION
library(BayesFactor)

# Create BayesFactor object: Gender vs Malocclusion
bf_gender <- contingencyTableBF(
  x = table(data$GENDER, data$MALOCCLUSION),
  sampleType = "indepMulti",
  fixedMargin = "rows"  # assumes row totals (i.e., Gender) are fixed
)

bf_ethnicity <- contingencyTableBF(
  x = table(data$ETHNICITY, data$MALOCCLUSION),
  sampleType = "indepMulti",
  fixedMargin = "rows"
)

bf_treat <- contingencyTableBF(
  x = table(data$TREATMENT, data$MALOCCLUSION),
  sampleType = "indepMulti",
  fixedMargin = "rows"
)



# Print Bayes Factors
print("Bayes Factor for Ethnicity vs Malocclusion:")
## [1] "Bayes Factor for Ethnicity vs Malocclusion:"
print(bf_ethnicity)
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 1.069445e-11 ±0%
## 
## Against denominator:
##   Null, independence, a = 1 
## ---
## Bayes factor type: BFcontingencyTable, independent multinomial
print("Bayes Factor for Gender vs Malocclusion:")
## [1] "Bayes Factor for Gender vs Malocclusion:"
print(bf_gender)
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 6.660233e-05 ±0%
## 
## Against denominator:
##   Null, independence, a = 1 
## ---
## Bayes factor type: BFcontingencyTable, independent multinomial
print("Bayes Factor for Treatment vs Malocclusion:")
## [1] "Bayes Factor for Treatment vs Malocclusion:"
print(bf_treat)
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 6859.253 ±0%
## 
## Against denominator:
##   Null, independence, a = 1 
## ---
## Bayes factor type: BFcontingencyTable, independent multinomial