library(glmmTMB)
## Warning: package 'glmmTMB' was built under R version 4.4.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(lme4)
## Loading required package: Matrix
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(betareg)
## Warning: package 'betareg' was built under R version 4.4.2
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(MuMIn)
## Warning: package 'MuMIn' was built under R version 4.4.2
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(multcomp)
## Warning: package 'multcomp' was built under R version 4.4.2
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.4.2
## Loading required package: survival
## Loading required package: TH.data
## Warning: package 'TH.data' was built under R version 4.4.2
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
BreastData_FullSVI <- read.csv("C:\\Users\\micha\\PIPCHE Dropbox\\Michael Cajigal\\U54-3 Biostats\\LOUIS\\Students\\Michael\\AACR 2025 Conference\\Incidence and Mortality\\Breast Cancer Project - IR by Village MBC\\DataSet Creation\\1998 to 2022 GCR breast data (final) with full SVI.csv", stringsAsFactors = TRUE)
filtered_BreastData_FullSVI <- BreastData_FullSVI[BreastData_FullSVI$Sex == 2 & 
                      BreastData_FullSVI$DxCity_num_recode != 20 & 
                      BreastData_FullSVI$BehaviorICDO3 == 3, ]
filtered_BreastData_valid_vill <- filtered_BreastData_FullSVI %>%
  filter(DxCity_num_recode != 20)
#nrow(filtered_BreastData_valid_vill)
#Unknown_Vill_Count <- nrow(filtered_BreastData_FullSVI) - nrow(filtered_BreastData_valid_vill)
#Unknown_Vill_Count

# Filtering out Missing Values in Combined Stage
#filtered_BreastData_valid_vill_stage <- filtered_BreastData_valid_vill %>%
  #filter(CombinedStage != " ")

#summary(filtered_BreastData_valid_vill_stage$CombinedStage)
filtered_BreastData_valid_vill$CombinedStage <- dplyr::recode(
  filtered_BreastData_valid_vill$CombinedStage,
  " " = "Missing",
  "2" = "Early",
  "3" = "Late",
  "4" = "Unstaged"
)

filtered_BreastData_valid_vill$CombinedStage <-  dplyr::recode(filtered_BreastData_valid_vill$CombinedStage,
                        " " = "Missing",                 
                        "2" = "Early",
                        "3" = "Late",
                        "4" = "Unstaged")

# print(filtered_BreastData_valid_vill_stage$CombinedStage)

filtered_BreastData_valid_vill$DxCity_num_recode <-  dplyr::recode(filtered_BreastData_valid_vill$DxCity_num_recode,
                        `7` = "Hagatna",
                        `1` = "Agana Heights",
                        `2` = "Agat",
                        `3` = "Asan",
                        `4` = "Barrigada",
                        `6` = "Dededo",
                        `8` = "Inarajan",
                        `10` = "Merizo",
                        `9` = "Mangilao",
                        `11` = "Mongmong-Toto-Maite",
                        `5` = "Ordot / Chalan Pago",
                        `12` = "Piti",
                        `13` = "Santa Rita",
                        `14` = "Sinajana",
                        `15` = "Talofofo",
                        `16` = "Tamuning",
                        `17` = "Umatac",
                        `18` = "Yigo",
                        `19` = "Yona")
table(filtered_BreastData_valid_vill$DxCity_num_recode)
## 
##       Agana Heights                Agat                Asan           Barrigada 
##                  30                  27                  14                  82 
##              Dededo             Hagatna            Inarajan            Mangilao 
##                 317                  37                  14                  91 
##              Merizo Mongmong-Toto-Maite Ordot / Chalan Pago                Piti 
##                  15                  32                  37                  13 
##          Santa Rita            Sinajana            Talofofo            Tamuning 
##                  47                  22                  20                 123 
##              Umatac                Yigo                Yona 
##                   5                 126                  52
filtered_BreastData_valid_vill$EthnicGroups <- dplyr::recode(filtered_BreastData_valid_vill$EthnicGroups,
                        `1` = "Chamorro",
                        `2` = "Filipino",
                        `3` = "Micronesian",
                        `4` = "Asian",
                        `5` = "Caucasian",
                        `6` = "Pacific Islander",
                        `7` = "Other or Unknown")
table(filtered_BreastData_valid_vill$EthnicGroups)
## 
##            Asian        Caucasian         Chamorro         Filipino 
##               64               63              499              320 
##      Micronesian Other or Unknown Pacific Islander 
##               57               78               23
filtered_BreastData_valid_vill$AgeGroups2 <-  dplyr::recode(filtered_BreastData_valid_vill$AgeGroups2,
                        `1` = "0-24",
                        `2` = "25-34",
                        `3` = "35-44",
                        `4` = "45-54",
                        `5` = "55-64",
                        `6` = "65-74",
                        `7` = "75+")
table(filtered_BreastData_valid_vill$AgeGroups2)
## 
##  0-24 25-34 35-44 45-54 55-64 65-74   75+ 
##     1    33   140   267   298   223   142
# Calculate late-stage diagnosis percentage by village
village_stage_distribution <- filtered_BreastData_valid_vill %>%
  filter(CombinedStage %in% c("Unstaged", "Early", "Late")) %>% 
  group_by(DxCity_num_recode, CombinedStage) %>%
  summarise(Count = n(), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = CombinedStage, values_from = Count, values_fill = 0) %>%
  mutate(
    Total = Unstaged + Early + Late,
    Late_Percentage = (Late / Total) * 100
  ) %>%
  arrange(desc(Late_Percentage))

# View results
print(village_stage_distribution)
## # A tibble: 19 × 6
##    DxCity_num_recode   Early  Late Unstaged Total Late_Percentage
##    <chr>               <int> <int>    <int> <int>           <dbl>
##  1 Umatac                  1     2        0     3            66.7
##  2 Asan                    2     5        1     8            62.5
##  3 Merizo                  7     6        1    14            42.9
##  4 Talofofo                7     6        1    14            42.9
##  5 Inarajan                6     5        1    12            41.7
##  6 Agat                   11     9        2    22            40.9
##  7 Yigo                   51    41       11   103            39.8
##  8 Mangilao               40    30        7    77            39.0
##  9 Barrigada              35    27        8    70            38.6
## 10 Yona                   24    17        4    45            37.8
## 11 Tamuning               55    33        4    92            35.9
## 12 Dededo                123    86       38   247            34.8
## 13 Mongmong-Toto-Maite    13     7        1    21            33.3
## 14 Ordot / Chalan Pago    17     9        1    27            33.3
## 15 Agana Heights           8     7        8    23            30.4
## 16 Hagatna                20    10        3    33            30.3
## 17 Santa Rita             23    10        3    36            27.8
## 18 Piti                    8     2        1    11            18.2
## 19 Sinajana               13     3        2    18            16.7
#write.csv(village_stage_distribution, "village_stage_distribution.csv", row.names = FALSE)

# Plotting the results (bar chart of Late-Stage Percentages by Village without missing)
library(ggplot2)

ggplot(village_stage_distribution, aes(x = reorder(DxCity_num_recode, -Late_Percentage), y = Late_Percentage)) +
  geom_bar(stat = "identity") +
  labs(title = "Percentage of Late-Stage Diagnoses by Village",
       x = "Village",
       y = "Late-Stage Percentage (%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot densities on GIS map of Guam.
# Read in shape
sf_data <- st_read("C:\\Users\\micha\\Downloads\\guam_village_shape\\village.shp")
## Reading layer `village' from data source 
##   `C:\Users\micha\Downloads\guam_village_shape\village.shp' using driver `ESRI Shapefile'
## Simple feature collection with 19 features and 6 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 85741.2 ymin: 170580.6 xmax: 122375.7 ymax: 217087.4
## Projected CRS: 1993_Guam_Geodetic_Network
# Rename villages in the 'DxCity_num_recode' column
village_stage_distribution_vill_corr <- village_stage_distribution  %>%
  mutate(
    DxCity_num_recode = case_when(
      DxCity_num_recode == "Agana Heights (Hagåtña Heights)" ~ "Agana Heights",
      DxCity_num_recode == "Agat (HÃ¥gat)" ~ "Agat",
      DxCity_num_recode == "Asan-Maina" ~ "Asan",
      DxCity_num_recode == "Chalan Pago-Ordot" ~ "Ordot / Chalan Pago",
      DxCity_num_recode == "Hagåtña (Agana)" ~ "Hagatna",
      DxCity_num_recode == "Inarajan (Inalåhan)" ~ "Inarajan",
      DxCity_num_recode == "Tamuning-Tumon-Harmon" ~ "Tamuning",
      DxCity_num_recode == "Merizo (Malesso’)" ~ "Merizo",
      DxCity_num_recode == "Umatac (Humåtak)" ~ "Umatac",
      DxCity_num_recode == "Talofofo (Talo’fo’fo)" ~ "Talofofo",
      TRUE ~ DxCity_num_recode
    )
  )

data <- left_join(sf_data, village_stage_distribution_vill_corr, join_by("MUNICIPAL" == "DxCity_num_recode"))

 
g1 <- ggplot(data=data)+
  geom_sf(aes(fill = Late_Percentage), color="white")+
  theme_void() + scale_fill_gradient(low="#f2cbd6", high = "#990c58") +
  geom_sf_text(aes(label = paste0(MUNICIPAL, "\n", Late)), size=1) +
  labs(title = "Breast Cancer Cases by Village from 1989 to 2021",
  caption = "Hue represents percentage of \n late-diagnises counts per village (denominator is total cases from that respective), \n while labels display number of cases.", fill = "Percentage of Late-Diagnosis Cases") + 
  theme(plot.caption = element_text(size = 11, hjust=1, face= "italic"))
g1

# Creating new dataset to include SVI variables only for each village.
summary_filtered_BreastData_valid_vill <- filtered_BreastData_valid_vill %>%
  group_by(DxCity_num_recode) %>%
  summarise(
    Early = sum(CombinedStage == "Early", na.rm = TRUE),
    Late = sum(CombinedStage == "Late", na.rm = TRUE),
    Unstaged = sum(CombinedStage == "Unstaged", na.rm = TRUE),
    Missing = sum(is.na(CombinedStage) | CombinedStage == "Missing"),
    # Summarise all variables after CDP_NAME
    across(CDP_NAME:ncol(filtered_BreastData_valid_vill), ~ mean(.x, na.rm = TRUE), .names = "{col}")
  )

# Calculating stage totals for each village and late-stage diagnosis proportions (including missing).
summary_filtered_BreastData_valid_vill_wmissing <- summary_filtered_BreastData_valid_vill %>%
  mutate(
    Total = Early + Late + Unstaged + Missing,   # Calculate the total count
    late_prop = Late / Total                     # Calculate the proportion of late diagnoses
  )

# Save the datset.
write.csv(summary_filtered_BreastData_valid_vill_wmissing, "summary_filtered_BreastData_valid_vill_wmissing_BetaR.csv", row.names = FALSE)



# Calculating stage totals for each village and late-stage diagnosis proportions (excluding missing).
summary_filtered_BreastData_valid_vill_womissing <- summary_filtered_BreastData_valid_vill %>%
  mutate(
    Total = Early + Late + Unstaged,   # Calculate the total count
    late_prop = Late / Total           # Calculate the proportion of late diagnoses
  )

# Save the datset.
write.csv(summary_filtered_BreastData_valid_vill_womissing, "summary_filtered_BreastData_valid_vill_womissing_BetaR.csv", row.names = FALSE)
# List variables to exclude
exclude_vars <- c("DxCity_num_recode", "Early", "Late", "Unstaged", "Missing", "CDP_NAME", "VILLAGE_POP", "CDP_POP", "CIV_POP", "POP_25OVER", "POP_5OVER", "HOUSE_UNITS", "HOUSEHOLDS", "CIV_NONINST", "V93", "V94", "Total", "late_prop")

# Get all column names except those to exclude and the dependent variable
predictor_vars <- setdiff(names(summary_filtered_BreastData_valid_vill_wmissing), c("late_prop", exclude_vars))

# Create the formula dynamically
formula <- as.formula(paste("late_prop ~", paste(predictor_vars, collapse = " + ")))

print(predictor_vars)
##  [1] "A1_P"        "A1_R"        "A2_P"        "A2_R"        "A3_P"       
##  [6] "A3_R"        "A4_P"        "A4_R"        "A5_SUM"      "A5_R"       
## [11] "B1_P"        "B1_R"        "B2_P"        "B2_R"        "B3_P"       
## [16] "B3_R"        "B4_P"        "B4_R"        "B5_SUM"      "B5_R"       
## [21] "C1_P"        "C1_R"        "C1_NEW_P"    "C1_NEW_R"    "C2_P"       
## [26] "C2_R"        "C3_SUM"      "C3_R"        "C3_NEW_SUM"  "C3_NEW_R"   
## [31] "D1_P"        "D1_R"        "D2_P"        "D2_R"        "D3_P"       
## [36] "D3_R"        "D4_P"        "D4_R"        "D5_SUM"      "D5_P"       
## [41] "D5_R"        "D6_SUM"      "D6_R"        "SVI_CDC_SUM" "SVI_CDC_R"  
## [46] "H1_P"        "H1_R"        "H2_P"        "H2_R"        "H3A_P"      
## [51] "H3A_R"       "H3B_P"       "H3B_R"       "H3C_P"       "H3C_R"      
## [56] "H4_P"        "H4_R"        "H5_P"        "H5_R"        "H6_P"       
## [61] "H6_R"        "HA_SUM"      "HA_R"        "H7_P"        "H7_R"       
## [66] "H8_P"        "H8_R"        "H9_P"        "H9_R"        "HB_SUM"     
## [71] "HB_R"        "P1_P"        "P1_R"        "P2_P"        "P2_R"       
## [76] "P3_P"        "P3_R"        "P4_P"        "P4_R"        "P5_SUM"     
## [81] "P5_R"        "SVI_ALL_SUM" "SVI_ALL_R"
# Fit the beta regression model
#b1 <- betareg(formula, data = summary_filtered_BreastData_valid_vill_wmissing, link = "logit")
(summary_filtered_BreastData_valid_vill_wmissing$DxCity_num_recode)
##  [1] "Agana Heights"       "Agat"                "Asan"               
##  [4] "Barrigada"           "Dededo"              "Hagatna"            
##  [7] "Inarajan"            "Mangilao"            "Merizo"             
## [10] "Mongmong-Toto-Maite" "Ordot / Chalan Pago" "Piti"               
## [13] "Santa Rita"          "Sinajana"            "Talofofo"           
## [16] "Tamuning"            "Umatac"              "Yigo"               
## [19] "Yona"
# Define the regions for each village
north_villages <- c("Dededo", "Yigo", "Tamuning")
central_villages <- c("Mangilao", "Barrigada", "Agana Heights", "Asan", "Ordot / Chalan Pago", "Hagatna", "Mongmong-Toto-Maite", "Piti", "Sinajana")
south_villages <- c("Agat", "Merizo", "Umatac", "Inarajan", "Talofofo", "Santa Rita", "Yona")

# Add a region column based on village
summary_filtered_BreastData_valid_vill_wmissing <- summary_filtered_BreastData_valid_vill_wmissing %>%
  mutate(
    Region = case_when(
      DxCity_num_recode %in% north_villages ~ "North",
      DxCity_num_recode %in% central_villages ~ "Central",
      DxCity_num_recode %in% south_villages ~ "South",
      TRUE ~ "Unknown" # For villages not explicitly listed
    )
  )

# View the updated dataset
print(summary_filtered_BreastData_valid_vill_wmissing)
## # A tibble: 19 × 102
##    DxCity_num_recode   Early  Late Unstaged Missing CDP_NAME VILLAGE_POP CDP_POP
##    <chr>               <dbl> <int>    <int>   <int>    <dbl>       <dbl>   <dbl>
##  1 Agana Heights           8     7        8       7      NaN        3808     NaN
##  2 Agat                   11     9        2       5      NaN        4917     NaN
##  3 Asan                    2     5        1       6      NaN        2137     NaN
##  4 Barrigada              35    27        8      12      NaN        8875     NaN
##  5 Dededo                123    86       38      70      NaN       44943     NaN
##  6 Hagatna                20    10        3       4      NaN        1051     NaN
##  7 Inarajan                6     5        1       2      NaN        2273     NaN
##  8 Mangilao               40    30        7      14      NaN       15191     NaN
##  9 Merizo                  7     6        1       1      NaN        1850     NaN
## 10 Mongmong-Toto-Maite    13     7        1      11      NaN        6825     NaN
## 11 Ordot / Chalan Pago    17     9        1      10      NaN        6822     NaN
## 12 Piti                    8     2        1       2      NaN        1454     NaN
## 13 Santa Rita             23    10        3      11      NaN        6084     NaN
## 14 Sinajana               13     3        2       4      NaN        2592     NaN
## 15 Talofofo                7     6        1       6      NaN        3050     NaN
## 16 Tamuning               55    33        4      31      NaN       19685     NaN
## 17 Umatac                  1     2        0       2      NaN         782     NaN
## 18 Yigo                   51    41       11      23      NaN       20539     NaN
## 19 Yona                   24    17        4       7      NaN        6480     NaN
## # ℹ 94 more variables: CIV_POP <dbl>, POP_25OVER <dbl>, POP_5OVER <dbl>,
## #   HOUSE_UNITS <dbl>, HOUSEHOLDS <dbl>, CIV_NONINST <dbl>, A1_P <dbl>,
## #   A1_R <dbl>, A2_P <dbl>, A2_R <dbl>, A3_P <dbl>, A3_R <dbl>, A4_P <dbl>,
## #   A4_R <dbl>, A5_SUM <dbl>, A5_R <dbl>, B1_P <dbl>, B1_R <dbl>, B2_P <dbl>,
## #   B2_R <dbl>, B3_P <dbl>, B3_R <dbl>, B4_P <dbl>, B4_R <dbl>, B5_SUM <dbl>,
## #   B5_R <dbl>, C1_P <dbl>, C1_R <dbl>, C1_NEW_P <dbl>, C1_NEW_R <dbl>,
## #   C2_P <dbl>, C2_R <dbl>, C3_SUM <dbl>, C3_R <dbl>, C3_NEW_SUM <dbl>, …
summary_filtered_BreastData_valid_vill_wmissing$Region <- factor(summary_filtered_BreastData_valid_vill_wmissing$Region)
# Wated to see if we can find an ideal model utilizing our null and full models to converge into a "best model" based off AIC. Note: I couldn't do it)

# Fit the initial (full) model
full_model <-  glmmTMB(late_prop ~ SVI_ALL_R + A1_P + A1_R + A2_P + A2_R + A3_P + A3_R + A4_P + A4_R + A5_SUM + A5_R + B1_P + B1_R + B2_P + B2_R + B3_P + B3_R + B4_P + B4_R + B5_SUM + B5_R + C1_P + C1_R + C1_NEW_P + C1_NEW_R + C2_P + C2_R + C3_SUM + C3_R + C3_NEW_SUM + C3_NEW_R + D1_P + D1_R + D2_P + D2_R + D3_P + D3_R + D4_P + D4_R + D5_SUM + D5_P + D5_R + D6_SUM + D6_R + SVI_CDC_SUM + SVI_CDC_R + H1_P + H1_R + H2_P + H2_R + H3A_P + H3A_R + H3B_P + H3B_R + H3C_P + H3C_R + H4_P + H4_R + H5_P + H5_R + H6_P + H6_R + HA_SUM + HA_R + H7_P + H7_R + H8_P + H8_R + H9_P + H9_R + HB_SUM + HB_R + P1_P + P1_R + P2_P + P2_R + P3_P + P3_R + P4_P + P4_R + P5_SUM + P5_R + SVI_ALL_SUM + SVI_ALL_R + (1 | DxCity_num_recode), 
                 family = beta_family(link = "logit"), 
                 data = summary_filtered_BreastData_valid_vill_wmissing)
## dropping columns from rank-deficient conditional model: A5_SUM, B5_SUM, B5_R, C1_P, C1_R, C1_NEW_P, C1_NEW_R, C2_P, C2_R, C3_SUM, C3_R, C3_NEW_SUM, C3_NEW_R, D1_P, D1_R, D2_P, D2_R, D3_P, D3_R, D4_P, D4_R, D5_SUM, D5_P, D5_R, D6_SUM, D6_R, SVI_CDC_SUM, SVI_CDC_R, H1_P, H1_R, H2_P, H2_R, H3A_P, H3A_R, H3B_P, H3B_R, H3C_P, H3C_R, H4_P, H4_R, H5_P, H5_R, H6_P, H6_R, HA_SUM, HA_R, H7_P, H7_R, H8_P, H8_R, H9_P, H9_R, HB_SUM, HB_R, P1_P, P1_R, P2_P, P2_R, P3_P, P3_R, P4_P, P4_R, P5_SUM, P5_R, SVI_ALL_SUM
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in finalizeTMB(TMBStruc, obj, fit, h, data.tmb.old): Model convergence
## problem; non-positive-definite Hessian matrix. See vignette('troubleshooting')
## Warning in finalizeTMB(TMBStruc, obj, fit, h, data.tmb.old): Model convergence
## problem; function evaluation limit reached without convergence (9). See
## vignette('troubleshooting'), help('diagnose')
# Fit the null model (intercept-only model)
null_model <-  glmmTMB(late_prop ~ 1, 
                 family = beta_family(link = "logit"), 
                 data = summary_filtered_BreastData_valid_vill_wmissing)

summary(null_model)
##  Family: beta  ( logit )
## Formula:          late_prop ~ 1
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -40.7    -38.8     22.4    -44.7       17 
## 
## 
## Dispersion parameter for beta family (): 35.1 
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.90871    0.08432  -10.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####################################### BETA REGRESSION ##################################
#Note: When choosing a model to use, if we are not considering random intercepts we use betarag and if we are, we need to use glmmTMB(beta_family(link = "logit")).

#null model (nested: simplified model)
b0<-betareg(late_prop ~ 1, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b0)
## 
## Call:
## betareg(formula = late_prop ~ 1, data = summary_filtered_BreastData_valid_vill_wmissing, 
##     link = "logit")
## 
## Quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2580 -0.6090  0.2173  0.6126  1.4353 
## 
## Coefficients (mean model with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.90871    0.08432  -10.78   <2e-16 ***
## 
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)   
## (phi)    35.13      11.27   3.118  0.00182 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 22.36 on 2 Df
## Number of iterations: 92 (BFGS) + 2 (Fisher scoring)
# We first want to see if variables of SVI are even considered to be predictors before choosing specific SVI variables. So we use the variable SVI_ALL_SUM as our only predictor to see if theres significance in the sum of all rankings for CDC defined SVI variables as a factor towards late-diagnosis of breast cancer. 
b1 <- betareg(late_prop ~ SVI_ALL_SUM, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b1)
## 
## Call:
## betareg(formula = late_prop ~ SVI_ALL_SUM, data = summary_filtered_BreastData_valid_vill_wmissing, 
##     link = "logit")
## 
## Quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2319 -0.7015  0.0323  0.7043  1.8380 
## 
## Coefficients (mean model with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.56180    0.30029  -5.201 1.98e-07 ***
## SVI_ALL_SUM  0.04329    0.01908   2.268   0.0233 *  
## 
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)   
## (phi)    45.08      14.50    3.11  0.00187 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 24.73 on 3 Df
## Pseudo R-squared: 0.2178
## Number of iterations: 47 (BFGS) + 2 (Fisher scoring)
# Now we include region to see if this significance effect of SVI_ALL_SUM is also significant between regions. 
b2 <- betareg(late_prop ~ SVI_ALL_SUM + Region, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b2)
## 
## Call:
## betareg(formula = late_prop ~ SVI_ALL_SUM + Region, data = summary_filtered_BreastData_valid_vill_wmissing, 
##     link = "logit")
## 
## Quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0883 -0.7368  0.0492  0.5256  2.5315 
## 
## Coefficients (mean model with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.66749    0.26472  -6.299    3e-10 ***
## SVI_ALL_SUM  0.04027    0.01736   2.320   0.0203 *  
## RegionNorth  0.07366    0.19730   0.373   0.7089    
## RegionSouth  0.35913    0.14284   2.514   0.0119 *  
## 
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)   
## (phi)    61.00      19.66   3.103  0.00192 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 27.58 on 5 Df
## Pseudo R-squared: 0.4001
## Number of iterations: 56 (BFGS) + 3 (Fisher scoring)
# Since we know that overall SVI variables have an effect, we can now investigate further with those specific SVI variables, by adding those as possible predictors seeing the effect of each one on the outcome variable.
b3 <- betareg(late_prop ~ Region + A1_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b3)
## 
## Call:
## betareg(formula = late_prop ~ Region + A1_P, data = summary_filtered_BreastData_valid_vill_wmissing, 
##     link = "logit")
## 
## Quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9934 -0.4806 -0.0210  0.7511  2.0295 
## 
## Coefficients (mean model with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.72874    0.34593  -4.997 5.81e-07 ***
## RegionNorth  0.16587    0.19630   0.845  0.39814    
## RegionSouth  0.40446    0.14621   2.766  0.00567 ** 
## A1_P         0.02928    0.01537   1.904  0.05685 .  
## 
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)   
## (phi)    56.64      18.25   3.104  0.00191 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 26.88 on 5 Df
## Pseudo R-squared: 0.3597
## Number of iterations: 45 (BFGS) + 4 (Fisher scoring)
# As we add more predictors, the significance of each predictor starts to converge and become insignificant.
b4 <- betareg(late_prop ~ Region + A1_P + A2_P + A3_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b4)
## 
## Call:
## betareg(formula = late_prop ~ Region + A1_P + A2_P + A3_P, data = summary_filtered_BreastData_valid_vill_wmissing, 
##     link = "logit")
## 
## Quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1658 -0.5629 -0.0034  0.4303  2.3346 
## 
## Coefficients (mean model with logit link):
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -6.383e-01  9.390e-01  -0.680   0.4966  
## RegionNorth  8.593e-02  2.140e-01   0.401   0.6881  
## RegionSouth  3.644e-01  2.027e-01   1.798   0.0722 .
## A1_P         1.721e-02  2.602e-02   0.661   0.5083  
## A2_P        -1.426e-02  5.557e-02  -0.257   0.7975  
## A3_P        -3.813e-05  3.127e-05  -1.219   0.2227  
## 
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)   
## (phi)    61.45      19.81   3.103  0.00192 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 27.65 on 7 Df
## Pseudo R-squared: 0.4097
## Number of iterations: 30 (BFGS) + 2 (Fisher scoring)
b5 <- betareg(late_prop ~ Region + A1_P + A2_P + A3_P + A4_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b5)
## 
## Call:
## betareg(formula = late_prop ~ Region + A1_P + A2_P + A3_P + A4_P, data = summary_filtered_BreastData_valid_vill_wmissing, 
##     link = "logit")
## 
## Quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1418 -0.5453 -0.0220  0.4074  2.3491 
## 
## Coefficients (mean model with logit link):
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -4.436e-01  1.438e+00  -0.308   0.7577  
## RegionNorth  7.678e-02  2.198e-01   0.349   0.7269  
## RegionSouth  3.658e-01  2.029e-01   1.802   0.0715 .
## A1_P         1.971e-02  2.975e-02   0.662   0.5077  
## A2_P        -1.754e-02  5.875e-02  -0.299   0.7653  
## A3_P        -4.354e-05  4.357e-05  -0.999   0.3176  
## A4_P        -6.176e-03  3.482e-02  -0.177   0.8592  
## 
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)   
## (phi)    61.55      19.84   3.103  0.00192 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 27.67 on 8 Df
## Pseudo R-squared: 0.4112
## Number of iterations: 35 (BFGS) + 2 (Fisher scoring)
# Until no predictors are significant anymore...
b6 <- betareg(late_prop ~ Region + A1_P + A2_P + A3_P + A4_P + B1_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b6)
## 
## Call:
## betareg(formula = late_prop ~ Region + A1_P + A2_P + A3_P + A4_P + B1_P, 
##     data = summary_filtered_BreastData_valid_vill_wmissing, link = "logit")
## 
## Quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9641 -0.5110  0.0652  0.3913  2.4291 
## 
## Coefficients (mean model with logit link):
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.745e-01  1.417e+00  -0.335    0.738
## RegionNorth  8.787e-02  2.182e-01   0.403    0.687
## RegionSouth  3.238e-01  2.096e-01   1.545    0.122
## A1_P         1.278e-02  3.127e-02   0.409    0.683
## A2_P        -5.000e-03  6.108e-02  -0.082    0.935
## A3_P        -3.281e-05  4.626e-05  -0.709    0.478
## A4_P         4.977e-04  3.568e-02   0.014    0.989
## B1_P        -3.516e-02  5.502e-02  -0.639    0.523
## 
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)   
## (phi)    62.92      20.28   3.102  0.00192 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 27.88 on 9 Df
## Pseudo R-squared: 0.4212
## Number of iterations: 29 (BFGS) + 3 (Fisher scoring)
####Comparing Models Without Random Effects (Betareg Models)###
# The lrtest() function in R is used to perform a Likelihood Ratio Test (LRT), which is a statistical test to compare two nested models. Nested models are models where one is a simpler version (i.e., a subset) of the other. This test evaluates whether the additional complexity of the more complicated model significantly improves the fit to the data.
#H0: The simpler model is sufficient to explain the data.
#HA: The more complex model provides a significantly better fit.
lrtest(b1)
## Likelihood ratio test
## 
## Model 1: late_prop ~ SVI_ALL_SUM
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   3 24.726                       
## 2   2 22.365 -1 4.7231    0.02976 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b2)
## Likelihood ratio test
## 
## Model 1: late_prop ~ SVI_ALL_SUM + Region
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   5 27.582                       
## 2   2 22.365 -3 10.435    0.01521 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b3)
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   5 26.884                       
## 2   2 22.365 -3 9.0378    0.02879 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b4)
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   7 27.653                       
## 2   2 22.365 -5 10.576    0.06045 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b5)
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1   8 27.669                     
## 2   2 22.365 -6 10.608     0.1013
lrtest(b6)
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P + B1_P
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1   9 27.876                     
## 2   2 22.365 -7 11.023     0.1376
lrtest(b0, b1) # Compare null model to model with SVI_ALL_SUM
## Likelihood ratio test
## 
## Model 1: late_prop ~ 1
## Model 2: late_prop ~ SVI_ALL_SUM
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   2 22.365                       
## 2   3 24.726  1 4.7231    0.02976 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b1, b2) # Compare SVI_ALL_SUM to SVI_ALL_SUM + Region
## Likelihood ratio test
## 
## Model 1: late_prop ~ SVI_ALL_SUM
## Model 2: late_prop ~ SVI_ALL_SUM + Region
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   3 24.726                       
## 2   5 27.582  2 5.7117    0.05751 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b2, b3) # Compare SVI_ALL_SUM + Region to Region + A1_P
## Likelihood ratio test
## 
## Model 1: late_prop ~ SVI_ALL_SUM + Region
## Model 2: late_prop ~ Region + A1_P
##   #Df LogLik Df  Chisq Pr(>Chisq)    
## 1   5 27.582                         
## 2   5 26.884  0 1.3971  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b3, b4) # Compare Region to Region + A1_P to Region + A1_P + A2_P + A3_P
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P
## Model 2: late_prop ~ Region + A1_P + A2_P + A3_P
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1   5 26.884                     
## 2   7 27.653  2 1.5388     0.4633
lrtest(b4, b5) # Compare Region + A1_P + A2_P + A3_P + Region to Region + A1_P + A2_P + A3_P + A4_P
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P
## Model 2: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1   7 27.653                     
## 2   8 27.669  1 0.0319     0.8583
lrtest(b5, b6) # Compare Region + A1_P + A2_P + A3_P + A4_P + Region to Region + A1_P + A2_P + A3_P + A4_P + B1_P
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P
## Model 2: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P + B1_P
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1   8 27.669                     
## 2   9 27.876  1 0.4142     0.5199
# Comparing AIC
AIC(b0, b1, b2, b3, b4, b5, b6)
##    df       AIC
## b0  2 -40.72941
## b1  3 -43.45254
## b2  5 -45.16423
## b3  5 -43.76717
## b4  7 -41.30595
## b5  8 -39.33783
## b6  9 -37.75201
tab_model(b1, b2, b3, b4, b5, b6, 
          show.ci = FALSE, # remove CI
          show.aic = TRUE, # display AIC
          p.style = "numeric" # display p-values
)
  late_prop late_prop late_prop late_prop late_prop late_prop
Predictors Estimates p Estimates p Estimates p Estimates p Estimates p Estimates p
(Intercept) 0.21 <0.001 0.19 <0.001 0.18 <0.001 0.53 0.497 0.64 0.758 0.62 0.738
SVI ALL SUM 1.04 0.023 1.04 0.020
Region [North] 1.08 0.709 1.18 0.398 1.09 0.688 1.08 0.727 1.09 0.687
Region [South] 1.43 0.012 1.50 0.006 1.44 0.072 1.44 0.072 1.38 0.122
A1 P 1.03 0.057 1.02 0.508 1.02 0.508 1.01 0.683
A2 P 0.99 0.797 0.98 0.765 1.00 0.935
A3 P 1.00 0.223 1.00 0.318 1.00 0.478
A4 P 0.99 0.859 1.00 0.989
B1 P 0.97 0.523
Observations 19 19 19 19 19 19
R2 0.218 0.400 0.360 0.410 0.411 0.421
AIC -43.453 -45.164 -43.767 -41.306 -39.338 -37.752
###########################################################################################
# We will now run the same model (same predictors) but now with a random intercept for each village to account for clustering by allowing the intercept to vary by village and adjusting to account for dependencies within villages (e.g., individuals within the same village sharing similar characteristics).
b7 <-  glmmTMB(late_prop ~ SVI_ALL_SUM + (1 | DxCity_num_recode), 
                 family = beta_family(link = "logit"), 
                 data = summary_filtered_BreastData_valid_vill_wmissing)
summary(b7)
##  Family: beta  ( logit )
## Formula:          late_prop ~ SVI_ALL_SUM + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -41.5    -37.7     24.7    -49.5       15 
## 
## Random effects:
## 
## Conditional model:
##  Groups            Name        Variance  Std.Dev. 
##  DxCity_num_recode (Intercept) 3.174e-09 5.634e-05
## Number of obs: 19, groups:  DxCity_num_recode, 19
## 
## Dispersion parameter for beta family (): 45.1 
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.56180    0.29780  -5.245 1.57e-07 ***
## SVI_ALL_SUM  0.04329    0.01893   2.287   0.0222 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b8 <-  glmmTMB(late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode), 
                 family = beta_family(link = "logit"), 
                 data = summary_filtered_BreastData_valid_vill_wmissing)
summary(b8)
##  Family: beta  ( logit )
## Formula:          late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -43.2    -37.5     27.6    -55.2       13 
## 
## Random effects:
## 
## Conditional model:
##  Groups            Name        Variance  Std.Dev. 
##  DxCity_num_recode (Intercept) 2.506e-10 1.583e-05
## Number of obs: 19, groups:  DxCity_num_recode, 19
## 
## Dispersion parameter for beta family ():   61 
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.66749    0.26456  -6.303 2.92e-10 ***
## SVI_ALL_SUM  0.04027    0.01723   2.337   0.0195 *  
## RegionNorth  0.07366    0.19769   0.373   0.7095    
## RegionSouth  0.35913    0.14215   2.526   0.0115 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b9 <-  glmmTMB(late_prop ~ Region + A1_P + (1 | DxCity_num_recode), 
                 family = beta_family(link = "logit"), 
                 data = summary_filtered_BreastData_valid_vill_wmissing)
summary(b9)
##  Family: beta  ( logit )
## Formula:          late_prop ~ Region + A1_P + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -41.8    -36.1     26.9    -53.8       13 
## 
## Random effects:
## 
## Conditional model:
##  Groups            Name        Variance  Std.Dev. 
##  DxCity_num_recode (Intercept) 5.309e-10 2.304e-05
## Number of obs: 19, groups:  DxCity_num_recode, 19
## 
## Dispersion parameter for beta family (): 56.6 
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.72874    0.33770  -5.119 3.07e-07 ***
## RegionNorth  0.16587    0.19642   0.844  0.39842    
## RegionSouth  0.40446    0.14607   2.769  0.00563 ** 
## A1_P         0.02928    0.01491   1.964  0.04952 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b10 <-  glmmTMB(late_prop ~ Region + A1_P + A2_P + (1 | DxCity_num_recode), 
                 family = beta_family(link = "logit"), 
                 data = summary_filtered_BreastData_valid_vill_wmissing)
summary(b10)
##  Family: beta  ( logit )
## Formula:          late_prop ~ Region + A1_P + A2_P + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -39.8    -33.2     26.9    -53.8       12 
## 
## Random effects:
## 
## Conditional model:
##  Groups            Name        Variance  Std.Dev. 
##  DxCity_num_recode (Intercept) 5.815e-10 2.411e-05
## Number of obs: 19, groups:  DxCity_num_recode, 19
## 
## Dispersion parameter for beta family (): 56.9 
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.72264    0.33723  -5.108 3.25e-07 ***
## RegionNorth  0.18568    0.20782   0.893   0.3716    
## RegionSouth  0.36166    0.20956   1.726   0.0844 .  
## A1_P         0.02333    0.02563   0.910   0.3626    
## A2_P         0.01486    0.05213   0.285   0.7756    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Again it converges to insignificant...
b11 <-  glmmTMB(late_prop ~ Region + A1_P + A2_P + A3_P + (1 | DxCity_num_recode), 
                 family = beta_family(link = "logit"), 
                 data = summary_filtered_BreastData_valid_vill_wmissing)
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in finalizeTMB(TMBStruc, obj, fit, h, data.tmb.old): failed to invert
## Hessian from numDeriv::jacobian(), falling back to internal vcov estimate
summary(b11)
##  Family: beta  ( logit )
## Formula:          
## late_prop ~ Region + A1_P + A2_P + A3_P + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -39.3    -31.8     27.7    -55.3       11 
## 
## Random effects:
## 
## Conditional model:
##  Groups            Name        Variance  Std.Dev.
##  DxCity_num_recode (Intercept) 3.687e-10 1.92e-05
## Number of obs: 19, groups:  DxCity_num_recode, 19
## 
## Dispersion parameter for beta family (): 61.4 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.383e-01        NaN     NaN      NaN
## RegionNorth  8.593e-02        NaN     NaN      NaN
## RegionSouth  3.644e-01        NaN     NaN      NaN
## A1_P         1.721e-02        NaN     NaN      NaN
## A2_P        -1.426e-02        NaN     NaN      NaN
## A3_P        -3.813e-05        NaN     NaN      NaN
###Comparing Models With Random Effects (glmmTMB Models)###
lrtest(b7)
## Likelihood ratio test
## 
## Model 1: late_prop ~ SVI_ALL_SUM + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   4 24.726                       
## 2   2 22.365 -2 4.7231    0.09427 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b8)
## Likelihood ratio test
## 
## Model 1: late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   6 27.582                       
## 2   2 22.365 -4 10.435    0.03371 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b9)
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   6 26.884                       
## 2   2 22.365 -4 9.0378    0.06016 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b10)
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P + A2_P + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
##   #Df LogLik Df Chisq Pr(>Chisq)
## 1   7 26.924                    
## 2   2 22.365 -5 9.119     0.1044
lrtest(b11)
## Likelihood ratio test
## 
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1   8 27.653                     
## 2   2 22.365 -6 10.576     0.1024
AIC(b7, b8, b9, b10, b11)
##     df       AIC
## b7   4 -41.45254
## b8   6 -43.16423
## b9   6 -41.76717
## b10  7 -39.84838
## b11  8 -39.30595
tab_model(b7, b8, b9, b10, b11, 
          show.ci = FALSE, # remove CI
          show.aic = TRUE, # display AIC
          p.style = "numeric" # display p-values
)
  late_prop late_prop late_prop late_prop late_prop
Predictors Estimates p Estimates p Estimates p Estimates p Estimates p
(Intercept) 0.21 <0.001 0.19 <0.001 0.18 <0.001 0.18 <0.001 0.53 NaN
SVI ALL SUM 1.04 0.022 1.04 0.019
Region [North] 1.08 0.709 1.18 0.398 1.20 0.372 1.09 NaN
Region [South] 1.43 0.012 1.50 0.006 1.44 0.084 1.44 NaN
A1 P 1.03 0.050 1.02 0.363 1.02 NaN
A2 P 1.01 0.776 0.99 NaN
A3 P 1.00 NaN
Random Effects
σ2 0.05 0.04 0.04 0.04 0.04
τ00 0.00 DxCity_num_recode 0.00 DxCity_num_recode 0.00 DxCity_num_recode 0.00 DxCity_num_recode 0.00 DxCity_num_recode
N 19 DxCity_num_recode 19 DxCity_num_recode 19 DxCity_num_recode 19 DxCity_num_recode 19 DxCity_num_recode
Observations 19 19 19 19 19
Marginal R2 / Conditional R2 0.372 / NA 0.613 / NA 0.563 / NA 0.565 / NA 0.616 / NA
AIC -41.453 -43.164 -41.767 -39.848 -39.306
###########################################################################################
# We then move into regional-level by using a nesting model with random intercepts for village nested within region.
b12 <- glmmTMB(
  late_prop ~ SVI_ALL_SUM + (1 | Region/DxCity_num_recode),
  family = beta_family(link = "logit"),
  data = summary_filtered_BreastData_valid_vill_wmissing
)
summary(b12)
##  Family: beta  ( logit )
## Formula:          late_prop ~ SVI_ALL_SUM + (1 | Region/DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -40.3    -35.6     25.1    -50.3       14 
## 
## Random effects:
## 
## Conditional model:
##  Groups                   Name        Variance  Std.Dev. 
##  DxCity_num_recode:Region (Intercept) 5.277e-10 2.297e-05
##  Region                   (Intercept) 1.405e-02 1.185e-01
## Number of obs: 19, groups:  DxCity_num_recode:Region, 19; Region, 3
## 
## Dispersion parameter for beta family (): 52.3 
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.52943    0.29591  -5.169 2.36e-07 ***
## SVI_ALL_SUM  0.04112    0.01806   2.276   0.0228 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b13 <- glmmTMB(
  late_prop ~ A1_P + (1 | Region/DxCity_num_recode),
  family = beta_family(link = "logit"),
  data = summary_filtered_BreastData_valid_vill_wmissing
)
summary(b13)
##  Family: beta  ( logit )
## Formula:          late_prop ~ A1_P + (1 | Region/DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -38.6    -33.9     24.3    -48.6       14 
## 
## Random effects:
## 
## Conditional model:
##  Groups                   Name        Variance  Std.Dev. 
##  DxCity_num_recode:Region (Intercept) 1.560e-09 3.949e-05
##  Region                   (Intercept) 1.876e-02 1.370e-01
## Number of obs: 19, groups:  DxCity_num_recode:Region, 19; Region, 3
## 
## Dispersion parameter for beta family (): 48.8 
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.53997    0.36909  -4.172 3.01e-05 ***
## A1_P         0.02928    0.01605   1.825   0.0681 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b14 <- glmmTMB(
  late_prop ~ A1_P + A2_P + (1 | Region/DxCity_num_recode),
  family = beta_family(link = "logit"),
  data = summary_filtered_BreastData_valid_vill_wmissing
)
summary(b14)
##  Family: beta  ( logit )
## Formula:          late_prop ~ A1_P + A2_P + (1 | Region/DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -38.4    -32.8     25.2    -50.4       13 
## 
## Random effects:
## 
## Conditional model:
##  Groups                   Name        Variance  Std.Dev. 
##  DxCity_num_recode:Region (Intercept) 1.061e-02 1.030e-01
##  Region                   (Intercept) 8.443e-11 9.188e-06
## Number of obs: 19, groups:  DxCity_num_recode:Region, 19; Region, 3
## 
## Dispersion parameter for beta family (): 53.1 
## 
## Conditional model:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.624220   0.364165  -4.460 8.19e-06 ***
## A1_P         0.005382   0.021355   0.252   0.8010    
## A2_P         0.064839   0.035672   1.818   0.0691 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Converges insignificantly...
b15 <- glmmTMB(
  late_prop ~ A1_P + A2_P + A3_P + (1 | Region/DxCity_num_recode),
  family = beta_family(link = "logit"),
  data = summary_filtered_BreastData_valid_vill_wmissing
)
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in finalizeTMB(TMBStruc, obj, fit, h, data.tmb.old): failed to invert
## Hessian from numDeriv::jacobian(), falling back to internal vcov estimate
summary(b15)
##  Family: beta  ( logit )
## Formula:          
## late_prop ~ A1_P + A2_P + A3_P + (1 | Region/DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
## 
##      AIC      BIC   logLik deviance df.resid 
##    -38.2    -31.6     26.1    -52.2       12 
## 
## Random effects:
## 
## Conditional model:
##  Groups                   Name        Variance  Std.Dev. 
##  DxCity_num_recode:Region (Intercept) 3.228e-03 5.681e-02
##  Region                   (Intercept) 9.628e-11 9.812e-06
## Number of obs: 19, groups:  DxCity_num_recode:Region, 19; Region, 3
## 
## Dispersion parameter for beta family ():   54 
## 
## Conditional model:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.161e-01        NaN     NaN      NaN
## A1_P        -6.704e-03        NaN     NaN      NaN
## A2_P         4.280e-02        NaN     NaN      NaN
## A3_P        -4.227e-05        NaN     NaN      NaN
###Comparing Models Across Model Types (Betareg vs. glmmTMB)###
lrtest(b12)
## Likelihood ratio test
## 
## Model 1: late_prop ~ SVI_ALL_SUM + (1 | Region/DxCity_num_recode)
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1   5 25.142                     
## 2   2 22.365 -3 5.5543     0.1354
lrtest(b13)
## Likelihood ratio test
## 
## Model 1: late_prop ~ A1_P + (1 | Region/DxCity_num_recode)
## Model 2: late_prop ~ 1
##   #Df LogLik Df Chisq Pr(>Chisq)
## 1   5 24.309                    
## 2   2 22.365 -3 3.888     0.2738
lrtest(b14)
## Likelihood ratio test
## 
## Model 1: late_prop ~ A1_P + A2_P + (1 | Region/DxCity_num_recode)
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1   6 25.217                     
## 2   2 22.365 -4 5.7045     0.2223
lrtest(b15)
## Likelihood ratio test
## 
## Model 1: late_prop ~ A1_P + A2_P + A3_P + (1 | Region/DxCity_num_recode)
## Model 2: late_prop ~ 1
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1   7 26.095                     
## 2   2 22.365 -5 7.4603     0.1886
AIC(b12, b13, b14, b15)
##     df       AIC
## b12  5 -40.28370
## b13  5 -38.61745
## b14  6 -38.43395
## b15  7 -38.18967
tab_model(b12, b13, b14, b15, 
          show.ci = FALSE, # remove CI
          show.aic = TRUE, # display AIC
          p.style = "numeric" # display p-values
)
  late_prop late_prop late_prop late_prop
Predictors Estimates p Estimates p Estimates p Estimates p
(Intercept) 0.22 <0.001 0.21 <0.001 0.20 <0.001 0.66 NaN
SVI ALL SUM 1.04 0.023
A1 P 1.03 0.068 1.01 0.801 0.99 NaN
A2 P 1.07 0.069 1.04 NaN
A3 P 1.00 NaN
Random Effects
σ2 0.05 0.05 0.06 0.05
τ00 0.00 DxCity_num_recode:Region 0.00 DxCity_num_recode:Region 0.01 DxCity_num_recode:Region 0.00 DxCity_num_recode:Region
0.01 Region 0.02 Region 0.00 Region 0.00 Region
N 19 DxCity_num_recode 19 DxCity_num_recode 19 DxCity_num_recode 19 DxCity_num_recode
3 Region 3 Region 3 Region 3 Region
Observations 19 19 19 19
Marginal R2 / Conditional R2 0.384 / NA 0.261 / NA 0.396 / NA 0.502 / NA
AIC -40.284 -38.617 -38.434 -38.190
###Comparison across different models###
tab_model(b2, b8, b12, 
          show.ci = FALSE, # remove CI
          show.aic = TRUE, # display AIC
          p.style = "numeric" # display p-values
)
  late_prop late_prop late_prop
Predictors Estimates p Estimates p Estimates p
(Intercept) 0.19 <0.001 0.19 <0.001 0.22 <0.001
SVI ALL SUM 1.04 0.020 1.04 0.019 1.04 0.023
Region [North] 1.08 0.709 1.08 0.709
Region [South] 1.43 0.012 1.43 0.012
Random Effects
σ2   0.04 0.05
τ00   0.00 DxCity_num_recode 0.00 DxCity_num_recode:Region
    0.01 Region
N   19 DxCity_num_recode 19 DxCity_num_recode
    3 Region
Observations 19 19 19
R2 0.400 0.613 / NA 0.384 / NA
AIC -45.164 -43.164 -40.284
# Likelihood Ratio Test: Compare nested models to see if adding random effects significantly improves fit
lrtest(b2, b8) # Compare fixed effects vs. random intercepts for Village
## Warning in modelUpdate(objects[[i - 1]], objects[[i]]): original model was of
## class "betareg", updated model is of class "glmmTMB"
## Likelihood ratio test
## 
## Model 1: late_prop ~ SVI_ALL_SUM + Region
## Model 2: late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode)
##   #Df LogLik Df Chisq Pr(>Chisq)
## 1   5 27.582                    
## 2   6 27.582  1     0     0.9999
lrtest(b8, b12) # Compare random intercepts for Village vs. nested Region/Village
## Likelihood ratio test
## 
## Model 1: late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode)
## Model 2: late_prop ~ SVI_ALL_SUM + (1 | Region/DxCity_num_recode)
##   #Df LogLik Df  Chisq Pr(>Chisq)  
## 1   6 27.582                       
## 2   5 25.142 -1 4.8805    0.02716 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-Hoc Analysis for Region: Conduct pairwise comparisons for Region to test specific differences (can't run)
# summary(glht(b2, linfct = mcp(Region = "Tukey")))

### Top 3 Models Overall ###
tab_model(b1, b2, b3, 
          show.ci = FALSE, # remove CI
          show.aic = TRUE, # display AIC
          p.style = "numeric" # display p-values
)
  late_prop late_prop late_prop
Predictors Estimates p Estimates p Estimates p
(Intercept) 0.21 <0.001 0.19 <0.001 0.18 <0.001
SVI ALL SUM 1.04 0.023 1.04 0.020
Region [North] 1.08 0.709 1.18 0.398
Region [South] 1.43 0.012 1.50 0.006
A1 P 1.03 0.057
Observations 19 19 19
R2 0.218 0.400 0.360
AIC -43.453 -45.164 -43.767
###THIS MODEL?###
predictors_no_region <- betareg(late_prop ~ A1_P + A2_P + A3_P + A4_P + B1_P + B2_P + B3_P + B4_P + C1_NEW_P + C2_P + D1_P + D2_P + D3_P + D4_P + H1_P + H2_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(predictors_no_region)
## 
## Call:
## betareg(formula = late_prop ~ A1_P + A2_P + A3_P + A4_P + B1_P + B2_P + 
##     B3_P + B4_P + C1_NEW_P + C2_P + D1_P + D2_P + D3_P + D4_P + H1_P + 
##     H2_P, data = summary_filtered_BreastData_valid_vill_wmissing, link = "logit")
## 
## Quantile residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5164 -1.0231  0.0097  0.7886  1.9687 
## 
## Coefficients (mean model with logit link):
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -7.454e+00  3.971e+00  -1.877   0.0605 .
## A1_P         9.609e-02  6.587e-02   1.459   0.1447  
## A2_P        -1.476e-01  1.231e-01  -1.199   0.2304  
## A3_P         8.162e-05  6.087e-05   1.341   0.1800  
## A4_P        -1.086e-01  6.110e-02  -1.777   0.0755 .
## B1_P        -1.701e-01  2.188e-01  -0.777   0.4369  
## B2_P         5.698e-02  5.228e-02   1.090   0.2758  
## B3_P         3.253e-01  2.038e-01   1.596   0.1104  
## B4_P        -1.029e-01  1.148e-01  -0.896   0.3702  
## C1_NEW_P     4.542e-02  2.453e-02   1.852   0.0641 .
## C2_P        -7.444e-02  3.882e-01  -0.192   0.8479  
## D1_P        -3.586e-02  1.585e-02  -2.262   0.0237 *
## D2_P         2.775e-01  1.731e-01   1.603   0.1089  
## D3_P         1.029e-01  4.587e-02   2.242   0.0249 *
## D4_P         1.357e-01  9.598e-02   1.414   0.1573  
## H1_P        -2.184e-01  2.654e-01  -0.823   0.4106  
## H2_P         1.161e-02  7.896e-03   1.470   0.1416  
## 
## Phi coefficients (precision model with identity link):
##       Estimate Std. Error z value Pr(>|z|)   
## (phi)   169.66      54.91    3.09    0.002 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Type of estimator: ML (maximum likelihood)
## Log-likelihood:  37.3 on 18 Df
## Pseudo R-squared: 0.7894
## Number of iterations: 72 (BFGS) + 2 (Fisher scoring)
AIC(predictors_no_region)
## [1] -38.59955
# Chisquare Test for Independence
age_data <- matrix(c(10, 1, 51, 14, 63, 29, 94, 18, 53, 14, 36, 21), 
                   ncol = 2, byrow = TRUE)
colnames(age_data) <- c("Late", "Unstaged")
rownames(age_data) <- c("25-34", "35-44", "45-54", "55-64", "65-74", "75+")
age_data <- as.table(age_data)

age_data
##       Late Unstaged
## 25-34   10        1
## 35-44   51       14
## 45-54   63       29
## 55-64   94       18
## 65-74   53       14
## 75+     36       21
chisq.test(age_data)$expected
## Warning in chisq.test(age_data): Chi-squared approximation may be incorrect
##            Late  Unstaged
## 25-34  8.358911  2.641089
## 35-44 49.393564 15.606436
## 45-54 69.910891 22.089109
## 55-64 85.108911 26.891089
## 65-74 50.913366 16.086634
## 75+   43.314356 13.685644
chisq.test(age_data)
## Warning in chisq.test(age_data): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  age_data
## X-squared = 13.774, df = 5, p-value = 0.01711
# Because its statistically significant, there indicates a difference in the distribution between groups. We can further asses which groups are statistically significnat within age groups using a proportion test
# Compare proportions for 25-34 vs. 35-44
prop.test(x = c(10, 51), n = c(19, 106))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(10, 51) out of c(19, 106)
## X-squared = 0.012913, df = 1, p-value = 0.9095
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.2296782  0.3200456
## sample estimates:
##    prop 1    prop 2 
## 0.5263158 0.4811321
prop.test(x = c(51, 63), n = c(106, 198))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(51, 63) out of c(106, 198)
## X-squared = 7.1418, df = 1, p-value = 0.007531
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.04057287 0.28532765
## sample estimates:
##    prop 1    prop 2 
## 0.4811321 0.3181818
prop.test(x = c(63, 94), n = c(198, 237))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(63, 94) out of c(198, 237)
## X-squared = 2.5478, df = 1, p-value = 0.1104
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.17301047  0.01612516
## sample estimates:
##    prop 1    prop 2 
## 0.3181818 0.3966245
prop.test(x = c(94, 53), n = c(237, 179))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(94, 53) out of c(237, 179)
## X-squared = 4.0816, df = 1, p-value = 0.04335
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.004243908 0.196826266
## sample estimates:
##    prop 1    prop 2 
## 0.3966245 0.2960894
prop.test(x = c(53, 36), n = c(179, 116))
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(53, 36) out of c(179, 116)
## X-squared = 0.017089, df = 1, p-value = 0.896
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.1288798  0.1003689
## sample estimates:
##    prop 1    prop 2 
## 0.2960894 0.3103448
ethnic_data <- matrix(c(146, 37, 85, 28, 23, 6, 22, 4, 14, 8), 
                   ncol = 2, byrow = TRUE)
colnames(ethnic_data) <- c("Late", "Unstaged")
rownames(ethnic_data) <- c("CHamoru", "FIlipino", "Micronesian", "Asian", "White")
ethnic_data <- as.table(ethnic_data)

ethnic_data
##             Late Unstaged
## CHamoru      146       37
## FIlipino      85       28
## Micronesian   23        6
## Asian         22        4
## White         14        8
# Perform chi-square test
# But first let's assess the assumptions for chisquare
chisq.test(ethnic_data)$expected
## Warning in chisq.test(ethnic_data): Chi-squared approximation may be incorrect
##                  Late  Unstaged
## CHamoru     142.27882 40.721180
## FIlipino     87.85523 25.144772
## Micronesian  22.54692  6.453083
## Asian        20.21448  5.785523
## White        17.10456  4.895442
chisq.test(ethnic_data)
## Warning in chisq.test(ethnic_data): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  ethnic_data
## X-squared = 4.1364, df = 4, p-value = 0.3879
# Example logistic regression for Age predicting Late
data <- data.frame(
  Stage = c(rep("Late", 6), rep("Unstaged", 6)), 
  Age = rep(c("25-34", "35-44", "45-54", "55-64", "65-74", "75+"), 2),
  Count = c(10, 51, 63, 94, 53, 36, 1, 14, 29, 18, 14, 21)
)

data$Stage <- factor(data$Stage, levels = c("Unstaged", "Late"))
data$Age <- factor(data$Age, levels = unique(data$Age))

model <- glm(Stage ~ Age, family = binomial, weights = Count, data = data)
summary(model)
## 
## Call:
## glm(formula = Stage ~ Age, family = binomial, data = data, weights = Count)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   2.3026     1.0488   2.195   0.0281 *
## Age35-44     -1.0098     1.0913  -0.925   0.3548  
## Age45-54     -1.5267     1.0725  -1.423   0.1546  
## Age55-64     -0.6497     1.0799  -0.602   0.5474  
## Age65-74     -0.9714     1.0910  -0.890   0.3733  
## Age75+       -1.7636     1.0842  -1.627   0.1038  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 445.36  on 11  degrees of freedom
## Residual deviance: 431.56  on  6  degrees of freedom
## AIC: 443.56
## 
## Number of Fisher Scoring iterations: 5
library(ggplot2)
age_data_prop <- data.frame(
  Age = c("25-34", "35-44", "45-54", "55-64", "65-74", "75+"),
  Late = c(52.6, 48.1, 31.8, 39.7, 29.6, 31.0),
  Unstaged = c(5.3, 13.2, 14.6, 7.6, 7.8, 18.1)
)

age_data_prop_long <- tidyr::pivot_longer(age_data_prop, cols = c(Late, Unstaged), names_to = "Stage", values_to = "Percentage")

ggplot(age_data_prop_long, aes(x = Age, y = Percentage, fill = Stage)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Late and Unstaged Cases by Age Group", y = "Percentage", x = "Age Group") +
  theme_minimal()

library(ggplot2)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
# Data for ethnicity
ethnic_data <- data.frame(
  Ethnicity = c("CHamoru", "Filipino", "Micronesian", "Asian", "White"),
  Late = c(36.4, 34.0, 46.9, 43.1, 31.1),
  Unstaged = c(9.2, 11.2, 12.2, 7.8, 17.8)
)

# Convert data to long format
ethnic_data_long <- tidyr::pivot_longer(ethnic_data, cols = c(Late, Unstaged), 
                                 names_to = "Stage", values_to = "Percentage")

# Plot
ggplot(ethnic_data_long, aes(x = Ethnicity, y = Percentage, fill = Stage)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Late and Unstaged Cases by Ethnicity", 
       y = "Percentage", x = "Ethnicity") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))