library(glmmTMB)
## Warning: package 'glmmTMB' was built under R version 4.4.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(lme4)
## Loading required package: Matrix
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(betareg)
## Warning: package 'betareg' was built under R version 4.4.2
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(MuMIn)
## Warning: package 'MuMIn' was built under R version 4.4.2
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(multcomp)
## Warning: package 'multcomp' was built under R version 4.4.2
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.4.2
## Loading required package: survival
## Loading required package: TH.data
## Warning: package 'TH.data' was built under R version 4.4.2
##
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
##
## geyser
BreastData_FullSVI <- read.csv("C:\\Users\\micha\\PIPCHE Dropbox\\Michael Cajigal\\U54-3 Biostats\\LOUIS\\Students\\Michael\\AACR 2025 Conference\\Incidence and Mortality\\Breast Cancer Project - IR by Village MBC\\DataSet Creation\\1998 to 2022 GCR breast data (final) with full SVI.csv", stringsAsFactors = TRUE)
filtered_BreastData_FullSVI <- BreastData_FullSVI[BreastData_FullSVI$Sex == 2 &
BreastData_FullSVI$DxCity_num_recode != 20 &
BreastData_FullSVI$BehaviorICDO3 == 3, ]
filtered_BreastData_valid_vill <- filtered_BreastData_FullSVI %>%
filter(DxCity_num_recode != 20)
#nrow(filtered_BreastData_valid_vill)
#Unknown_Vill_Count <- nrow(filtered_BreastData_FullSVI) - nrow(filtered_BreastData_valid_vill)
#Unknown_Vill_Count
# Filtering out Missing Values in Combined Stage
#filtered_BreastData_valid_vill_stage <- filtered_BreastData_valid_vill %>%
#filter(CombinedStage != " ")
#summary(filtered_BreastData_valid_vill_stage$CombinedStage)
filtered_BreastData_valid_vill$CombinedStage <- dplyr::recode(
filtered_BreastData_valid_vill$CombinedStage,
" " = "Missing",
"2" = "Early",
"3" = "Late",
"4" = "Unstaged"
)
filtered_BreastData_valid_vill$CombinedStage <- dplyr::recode(filtered_BreastData_valid_vill$CombinedStage,
" " = "Missing",
"2" = "Early",
"3" = "Late",
"4" = "Unstaged")
# print(filtered_BreastData_valid_vill_stage$CombinedStage)
filtered_BreastData_valid_vill$DxCity_num_recode <- dplyr::recode(filtered_BreastData_valid_vill$DxCity_num_recode,
`7` = "Hagatna",
`1` = "Agana Heights",
`2` = "Agat",
`3` = "Asan",
`4` = "Barrigada",
`6` = "Dededo",
`8` = "Inarajan",
`10` = "Merizo",
`9` = "Mangilao",
`11` = "Mongmong-Toto-Maite",
`5` = "Ordot / Chalan Pago",
`12` = "Piti",
`13` = "Santa Rita",
`14` = "Sinajana",
`15` = "Talofofo",
`16` = "Tamuning",
`17` = "Umatac",
`18` = "Yigo",
`19` = "Yona")
table(filtered_BreastData_valid_vill$DxCity_num_recode)
##
## Agana Heights Agat Asan Barrigada
## 30 27 14 82
## Dededo Hagatna Inarajan Mangilao
## 317 37 14 91
## Merizo Mongmong-Toto-Maite Ordot / Chalan Pago Piti
## 15 32 37 13
## Santa Rita Sinajana Talofofo Tamuning
## 47 22 20 123
## Umatac Yigo Yona
## 5 126 52
filtered_BreastData_valid_vill$EthnicGroups <- dplyr::recode(filtered_BreastData_valid_vill$EthnicGroups,
`1` = "Chamorro",
`2` = "Filipino",
`3` = "Micronesian",
`4` = "Asian",
`5` = "Caucasian",
`6` = "Pacific Islander",
`7` = "Other or Unknown")
table(filtered_BreastData_valid_vill$EthnicGroups)
##
## Asian Caucasian Chamorro Filipino
## 64 63 499 320
## Micronesian Other or Unknown Pacific Islander
## 57 78 23
filtered_BreastData_valid_vill$AgeGroups2 <- dplyr::recode(filtered_BreastData_valid_vill$AgeGroups2,
`1` = "0-24",
`2` = "25-34",
`3` = "35-44",
`4` = "45-54",
`5` = "55-64",
`6` = "65-74",
`7` = "75+")
table(filtered_BreastData_valid_vill$AgeGroups2)
##
## 0-24 25-34 35-44 45-54 55-64 65-74 75+
## 1 33 140 267 298 223 142
# Calculate late-stage diagnosis percentage by village
village_stage_distribution <- filtered_BreastData_valid_vill %>%
filter(CombinedStage %in% c("Unstaged", "Early", "Late")) %>%
group_by(DxCity_num_recode, CombinedStage) %>%
summarise(Count = n(), .groups = "drop") %>%
tidyr::pivot_wider(names_from = CombinedStage, values_from = Count, values_fill = 0) %>%
mutate(
Total = Unstaged + Early + Late,
Late_Percentage = (Late / Total) * 100
) %>%
arrange(desc(Late_Percentage))
# View results
print(village_stage_distribution)
## # A tibble: 19 × 6
## DxCity_num_recode Early Late Unstaged Total Late_Percentage
## <chr> <int> <int> <int> <int> <dbl>
## 1 Umatac 1 2 0 3 66.7
## 2 Asan 2 5 1 8 62.5
## 3 Merizo 7 6 1 14 42.9
## 4 Talofofo 7 6 1 14 42.9
## 5 Inarajan 6 5 1 12 41.7
## 6 Agat 11 9 2 22 40.9
## 7 Yigo 51 41 11 103 39.8
## 8 Mangilao 40 30 7 77 39.0
## 9 Barrigada 35 27 8 70 38.6
## 10 Yona 24 17 4 45 37.8
## 11 Tamuning 55 33 4 92 35.9
## 12 Dededo 123 86 38 247 34.8
## 13 Mongmong-Toto-Maite 13 7 1 21 33.3
## 14 Ordot / Chalan Pago 17 9 1 27 33.3
## 15 Agana Heights 8 7 8 23 30.4
## 16 Hagatna 20 10 3 33 30.3
## 17 Santa Rita 23 10 3 36 27.8
## 18 Piti 8 2 1 11 18.2
## 19 Sinajana 13 3 2 18 16.7
#write.csv(village_stage_distribution, "village_stage_distribution.csv", row.names = FALSE)
# Plotting the results (bar chart of Late-Stage Percentages by Village without missing)
library(ggplot2)
ggplot(village_stage_distribution, aes(x = reorder(DxCity_num_recode, -Late_Percentage), y = Late_Percentage)) +
geom_bar(stat = "identity") +
labs(title = "Percentage of Late-Stage Diagnoses by Village",
x = "Village",
y = "Late-Stage Percentage (%)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot densities on GIS map of Guam.
# Read in shape
sf_data <- st_read("C:\\Users\\micha\\Downloads\\guam_village_shape\\village.shp")
## Reading layer `village' from data source
## `C:\Users\micha\Downloads\guam_village_shape\village.shp' using driver `ESRI Shapefile'
## Simple feature collection with 19 features and 6 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 85741.2 ymin: 170580.6 xmax: 122375.7 ymax: 217087.4
## Projected CRS: 1993_Guam_Geodetic_Network
# Rename villages in the 'DxCity_num_recode' column
village_stage_distribution_vill_corr <- village_stage_distribution %>%
mutate(
DxCity_num_recode = case_when(
DxCity_num_recode == "Agana Heights (Hagåtña Heights)" ~ "Agana Heights",
DxCity_num_recode == "Agat (HÃ¥gat)" ~ "Agat",
DxCity_num_recode == "Asan-Maina" ~ "Asan",
DxCity_num_recode == "Chalan Pago-Ordot" ~ "Ordot / Chalan Pago",
DxCity_num_recode == "Hagåtña (Agana)" ~ "Hagatna",
DxCity_num_recode == "Inarajan (Inalåhan)" ~ "Inarajan",
DxCity_num_recode == "Tamuning-Tumon-Harmon" ~ "Tamuning",
DxCity_num_recode == "Merizo (Malesso’)" ~ "Merizo",
DxCity_num_recode == "Umatac (Humåtak)" ~ "Umatac",
DxCity_num_recode == "Talofofo (Talo’fo’fo)" ~ "Talofofo",
TRUE ~ DxCity_num_recode
)
)
data <- left_join(sf_data, village_stage_distribution_vill_corr, join_by("MUNICIPAL" == "DxCity_num_recode"))
g1 <- ggplot(data=data)+
geom_sf(aes(fill = Late_Percentage), color="white")+
theme_void() + scale_fill_gradient(low="#f2cbd6", high = "#990c58") +
geom_sf_text(aes(label = paste0(MUNICIPAL, "\n", Late)), size=1) +
labs(title = "Breast Cancer Cases by Village from 1989 to 2021",
caption = "Hue represents percentage of \n late-diagnises counts per village (denominator is total cases from that respective), \n while labels display number of cases.", fill = "Percentage of Late-Diagnosis Cases") +
theme(plot.caption = element_text(size = 11, hjust=1, face= "italic"))
g1

# Creating new dataset to include SVI variables only for each village.
summary_filtered_BreastData_valid_vill <- filtered_BreastData_valid_vill %>%
group_by(DxCity_num_recode) %>%
summarise(
Early = sum(CombinedStage == "Early", na.rm = TRUE),
Late = sum(CombinedStage == "Late", na.rm = TRUE),
Unstaged = sum(CombinedStage == "Unstaged", na.rm = TRUE),
Missing = sum(is.na(CombinedStage) | CombinedStage == "Missing"),
# Summarise all variables after CDP_NAME
across(CDP_NAME:ncol(filtered_BreastData_valid_vill), ~ mean(.x, na.rm = TRUE), .names = "{col}")
)
# Calculating stage totals for each village and late-stage diagnosis proportions (including missing).
summary_filtered_BreastData_valid_vill_wmissing <- summary_filtered_BreastData_valid_vill %>%
mutate(
Total = Early + Late + Unstaged + Missing, # Calculate the total count
late_prop = Late / Total # Calculate the proportion of late diagnoses
)
# Save the datset.
write.csv(summary_filtered_BreastData_valid_vill_wmissing, "summary_filtered_BreastData_valid_vill_wmissing_BetaR.csv", row.names = FALSE)
# Calculating stage totals for each village and late-stage diagnosis proportions (excluding missing).
summary_filtered_BreastData_valid_vill_womissing <- summary_filtered_BreastData_valid_vill %>%
mutate(
Total = Early + Late + Unstaged, # Calculate the total count
late_prop = Late / Total # Calculate the proportion of late diagnoses
)
# Save the datset.
write.csv(summary_filtered_BreastData_valid_vill_womissing, "summary_filtered_BreastData_valid_vill_womissing_BetaR.csv", row.names = FALSE)
# List variables to exclude
exclude_vars <- c("DxCity_num_recode", "Early", "Late", "Unstaged", "Missing", "CDP_NAME", "VILLAGE_POP", "CDP_POP", "CIV_POP", "POP_25OVER", "POP_5OVER", "HOUSE_UNITS", "HOUSEHOLDS", "CIV_NONINST", "V93", "V94", "Total", "late_prop")
# Get all column names except those to exclude and the dependent variable
predictor_vars <- setdiff(names(summary_filtered_BreastData_valid_vill_wmissing), c("late_prop", exclude_vars))
# Create the formula dynamically
formula <- as.formula(paste("late_prop ~", paste(predictor_vars, collapse = " + ")))
print(predictor_vars)
## [1] "A1_P" "A1_R" "A2_P" "A2_R" "A3_P"
## [6] "A3_R" "A4_P" "A4_R" "A5_SUM" "A5_R"
## [11] "B1_P" "B1_R" "B2_P" "B2_R" "B3_P"
## [16] "B3_R" "B4_P" "B4_R" "B5_SUM" "B5_R"
## [21] "C1_P" "C1_R" "C1_NEW_P" "C1_NEW_R" "C2_P"
## [26] "C2_R" "C3_SUM" "C3_R" "C3_NEW_SUM" "C3_NEW_R"
## [31] "D1_P" "D1_R" "D2_P" "D2_R" "D3_P"
## [36] "D3_R" "D4_P" "D4_R" "D5_SUM" "D5_P"
## [41] "D5_R" "D6_SUM" "D6_R" "SVI_CDC_SUM" "SVI_CDC_R"
## [46] "H1_P" "H1_R" "H2_P" "H2_R" "H3A_P"
## [51] "H3A_R" "H3B_P" "H3B_R" "H3C_P" "H3C_R"
## [56] "H4_P" "H4_R" "H5_P" "H5_R" "H6_P"
## [61] "H6_R" "HA_SUM" "HA_R" "H7_P" "H7_R"
## [66] "H8_P" "H8_R" "H9_P" "H9_R" "HB_SUM"
## [71] "HB_R" "P1_P" "P1_R" "P2_P" "P2_R"
## [76] "P3_P" "P3_R" "P4_P" "P4_R" "P5_SUM"
## [81] "P5_R" "SVI_ALL_SUM" "SVI_ALL_R"
# Fit the beta regression model
#b1 <- betareg(formula, data = summary_filtered_BreastData_valid_vill_wmissing, link = "logit")
(summary_filtered_BreastData_valid_vill_wmissing$DxCity_num_recode)
## [1] "Agana Heights" "Agat" "Asan"
## [4] "Barrigada" "Dededo" "Hagatna"
## [7] "Inarajan" "Mangilao" "Merizo"
## [10] "Mongmong-Toto-Maite" "Ordot / Chalan Pago" "Piti"
## [13] "Santa Rita" "Sinajana" "Talofofo"
## [16] "Tamuning" "Umatac" "Yigo"
## [19] "Yona"
# Define the regions for each village
north_villages <- c("Dededo", "Yigo", "Tamuning")
central_villages <- c("Mangilao", "Barrigada", "Agana Heights", "Asan", "Ordot / Chalan Pago", "Hagatna", "Mongmong-Toto-Maite", "Piti", "Sinajana")
south_villages <- c("Agat", "Merizo", "Umatac", "Inarajan", "Talofofo", "Santa Rita", "Yona")
# Add a region column based on village
summary_filtered_BreastData_valid_vill_wmissing <- summary_filtered_BreastData_valid_vill_wmissing %>%
mutate(
Region = case_when(
DxCity_num_recode %in% north_villages ~ "North",
DxCity_num_recode %in% central_villages ~ "Central",
DxCity_num_recode %in% south_villages ~ "South",
TRUE ~ "Unknown" # For villages not explicitly listed
)
)
# View the updated dataset
print(summary_filtered_BreastData_valid_vill_wmissing)
## # A tibble: 19 × 102
## DxCity_num_recode Early Late Unstaged Missing CDP_NAME VILLAGE_POP CDP_POP
## <chr> <dbl> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 Agana Heights 8 7 8 7 NaN 3808 NaN
## 2 Agat 11 9 2 5 NaN 4917 NaN
## 3 Asan 2 5 1 6 NaN 2137 NaN
## 4 Barrigada 35 27 8 12 NaN 8875 NaN
## 5 Dededo 123 86 38 70 NaN 44943 NaN
## 6 Hagatna 20 10 3 4 NaN 1051 NaN
## 7 Inarajan 6 5 1 2 NaN 2273 NaN
## 8 Mangilao 40 30 7 14 NaN 15191 NaN
## 9 Merizo 7 6 1 1 NaN 1850 NaN
## 10 Mongmong-Toto-Maite 13 7 1 11 NaN 6825 NaN
## 11 Ordot / Chalan Pago 17 9 1 10 NaN 6822 NaN
## 12 Piti 8 2 1 2 NaN 1454 NaN
## 13 Santa Rita 23 10 3 11 NaN 6084 NaN
## 14 Sinajana 13 3 2 4 NaN 2592 NaN
## 15 Talofofo 7 6 1 6 NaN 3050 NaN
## 16 Tamuning 55 33 4 31 NaN 19685 NaN
## 17 Umatac 1 2 0 2 NaN 782 NaN
## 18 Yigo 51 41 11 23 NaN 20539 NaN
## 19 Yona 24 17 4 7 NaN 6480 NaN
## # ℹ 94 more variables: CIV_POP <dbl>, POP_25OVER <dbl>, POP_5OVER <dbl>,
## # HOUSE_UNITS <dbl>, HOUSEHOLDS <dbl>, CIV_NONINST <dbl>, A1_P <dbl>,
## # A1_R <dbl>, A2_P <dbl>, A2_R <dbl>, A3_P <dbl>, A3_R <dbl>, A4_P <dbl>,
## # A4_R <dbl>, A5_SUM <dbl>, A5_R <dbl>, B1_P <dbl>, B1_R <dbl>, B2_P <dbl>,
## # B2_R <dbl>, B3_P <dbl>, B3_R <dbl>, B4_P <dbl>, B4_R <dbl>, B5_SUM <dbl>,
## # B5_R <dbl>, C1_P <dbl>, C1_R <dbl>, C1_NEW_P <dbl>, C1_NEW_R <dbl>,
## # C2_P <dbl>, C2_R <dbl>, C3_SUM <dbl>, C3_R <dbl>, C3_NEW_SUM <dbl>, …
summary_filtered_BreastData_valid_vill_wmissing$Region <- factor(summary_filtered_BreastData_valid_vill_wmissing$Region)
# Wated to see if we can find an ideal model utilizing our null and full models to converge into a "best model" based off AIC. Note: I couldn't do it)
# Fit the initial (full) model
full_model <- glmmTMB(late_prop ~ SVI_ALL_R + A1_P + A1_R + A2_P + A2_R + A3_P + A3_R + A4_P + A4_R + A5_SUM + A5_R + B1_P + B1_R + B2_P + B2_R + B3_P + B3_R + B4_P + B4_R + B5_SUM + B5_R + C1_P + C1_R + C1_NEW_P + C1_NEW_R + C2_P + C2_R + C3_SUM + C3_R + C3_NEW_SUM + C3_NEW_R + D1_P + D1_R + D2_P + D2_R + D3_P + D3_R + D4_P + D4_R + D5_SUM + D5_P + D5_R + D6_SUM + D6_R + SVI_CDC_SUM + SVI_CDC_R + H1_P + H1_R + H2_P + H2_R + H3A_P + H3A_R + H3B_P + H3B_R + H3C_P + H3C_R + H4_P + H4_R + H5_P + H5_R + H6_P + H6_R + HA_SUM + HA_R + H7_P + H7_R + H8_P + H8_R + H9_P + H9_R + HB_SUM + HB_R + P1_P + P1_R + P2_P + P2_R + P3_P + P3_R + P4_P + P4_R + P5_SUM + P5_R + SVI_ALL_SUM + SVI_ALL_R + (1 | DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing)
## dropping columns from rank-deficient conditional model: A5_SUM, B5_SUM, B5_R, C1_P, C1_R, C1_NEW_P, C1_NEW_R, C2_P, C2_R, C3_SUM, C3_R, C3_NEW_SUM, C3_NEW_R, D1_P, D1_R, D2_P, D2_R, D3_P, D3_R, D4_P, D4_R, D5_SUM, D5_P, D5_R, D6_SUM, D6_R, SVI_CDC_SUM, SVI_CDC_R, H1_P, H1_R, H2_P, H2_R, H3A_P, H3A_R, H3B_P, H3B_R, H3C_P, H3C_R, H4_P, H4_R, H5_P, H5_R, H6_P, H6_R, HA_SUM, HA_R, H7_P, H7_R, H8_P, H8_R, H9_P, H9_R, HB_SUM, HB_R, P1_P, P1_R, P2_P, P2_R, P3_P, P3_R, P4_P, P4_R, P5_SUM, P5_R, SVI_ALL_SUM
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in finalizeTMB(TMBStruc, obj, fit, h, data.tmb.old): Model convergence
## problem; non-positive-definite Hessian matrix. See vignette('troubleshooting')
## Warning in finalizeTMB(TMBStruc, obj, fit, h, data.tmb.old): Model convergence
## problem; function evaluation limit reached without convergence (9). See
## vignette('troubleshooting'), help('diagnose')
# Fit the null model (intercept-only model)
null_model <- glmmTMB(late_prop ~ 1,
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing)
summary(null_model)
## Family: beta ( logit )
## Formula: late_prop ~ 1
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -40.7 -38.8 22.4 -44.7 17
##
##
## Dispersion parameter for beta family (): 35.1
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.90871 0.08432 -10.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####################################### BETA REGRESSION ##################################
#Note: When choosing a model to use, if we are not considering random intercepts we use betarag and if we are, we need to use glmmTMB(beta_family(link = "logit")).
#null model (nested: simplified model)
b0<-betareg(late_prop ~ 1, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b0)
##
## Call:
## betareg(formula = late_prop ~ 1, data = summary_filtered_BreastData_valid_vill_wmissing,
## link = "logit")
##
## Quantile residuals:
## Min 1Q Median 3Q Max
## -2.2580 -0.6090 0.2173 0.6126 1.4353
##
## Coefficients (mean model with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.90871 0.08432 -10.78 <2e-16 ***
##
## Phi coefficients (precision model with identity link):
## Estimate Std. Error z value Pr(>|z|)
## (phi) 35.13 11.27 3.118 0.00182 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 22.36 on 2 Df
## Number of iterations: 92 (BFGS) + 2 (Fisher scoring)
# We first want to see if variables of SVI are even considered to be predictors before choosing specific SVI variables. So we use the variable SVI_ALL_SUM as our only predictor to see if theres significance in the sum of all rankings for CDC defined SVI variables as a factor towards late-diagnosis of breast cancer.
b1 <- betareg(late_prop ~ SVI_ALL_SUM, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b1)
##
## Call:
## betareg(formula = late_prop ~ SVI_ALL_SUM, data = summary_filtered_BreastData_valid_vill_wmissing,
## link = "logit")
##
## Quantile residuals:
## Min 1Q Median 3Q Max
## -2.2319 -0.7015 0.0323 0.7043 1.8380
##
## Coefficients (mean model with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.56180 0.30029 -5.201 1.98e-07 ***
## SVI_ALL_SUM 0.04329 0.01908 2.268 0.0233 *
##
## Phi coefficients (precision model with identity link):
## Estimate Std. Error z value Pr(>|z|)
## (phi) 45.08 14.50 3.11 0.00187 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 24.73 on 3 Df
## Pseudo R-squared: 0.2178
## Number of iterations: 47 (BFGS) + 2 (Fisher scoring)
# Now we include region to see if this significance effect of SVI_ALL_SUM is also significant between regions.
b2 <- betareg(late_prop ~ SVI_ALL_SUM + Region, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b2)
##
## Call:
## betareg(formula = late_prop ~ SVI_ALL_SUM + Region, data = summary_filtered_BreastData_valid_vill_wmissing,
## link = "logit")
##
## Quantile residuals:
## Min 1Q Median 3Q Max
## -2.0883 -0.7368 0.0492 0.5256 2.5315
##
## Coefficients (mean model with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.66749 0.26472 -6.299 3e-10 ***
## SVI_ALL_SUM 0.04027 0.01736 2.320 0.0203 *
## RegionNorth 0.07366 0.19730 0.373 0.7089
## RegionSouth 0.35913 0.14284 2.514 0.0119 *
##
## Phi coefficients (precision model with identity link):
## Estimate Std. Error z value Pr(>|z|)
## (phi) 61.00 19.66 3.103 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 27.58 on 5 Df
## Pseudo R-squared: 0.4001
## Number of iterations: 56 (BFGS) + 3 (Fisher scoring)
# Since we know that overall SVI variables have an effect, we can now investigate further with those specific SVI variables, by adding those as possible predictors seeing the effect of each one on the outcome variable.
b3 <- betareg(late_prop ~ Region + A1_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b3)
##
## Call:
## betareg(formula = late_prop ~ Region + A1_P, data = summary_filtered_BreastData_valid_vill_wmissing,
## link = "logit")
##
## Quantile residuals:
## Min 1Q Median 3Q Max
## -1.9934 -0.4806 -0.0210 0.7511 2.0295
##
## Coefficients (mean model with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.72874 0.34593 -4.997 5.81e-07 ***
## RegionNorth 0.16587 0.19630 0.845 0.39814
## RegionSouth 0.40446 0.14621 2.766 0.00567 **
## A1_P 0.02928 0.01537 1.904 0.05685 .
##
## Phi coefficients (precision model with identity link):
## Estimate Std. Error z value Pr(>|z|)
## (phi) 56.64 18.25 3.104 0.00191 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 26.88 on 5 Df
## Pseudo R-squared: 0.3597
## Number of iterations: 45 (BFGS) + 4 (Fisher scoring)
# As we add more predictors, the significance of each predictor starts to converge and become insignificant.
b4 <- betareg(late_prop ~ Region + A1_P + A2_P + A3_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b4)
##
## Call:
## betareg(formula = late_prop ~ Region + A1_P + A2_P + A3_P, data = summary_filtered_BreastData_valid_vill_wmissing,
## link = "logit")
##
## Quantile residuals:
## Min 1Q Median 3Q Max
## -2.1658 -0.5629 -0.0034 0.4303 2.3346
##
## Coefficients (mean model with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.383e-01 9.390e-01 -0.680 0.4966
## RegionNorth 8.593e-02 2.140e-01 0.401 0.6881
## RegionSouth 3.644e-01 2.027e-01 1.798 0.0722 .
## A1_P 1.721e-02 2.602e-02 0.661 0.5083
## A2_P -1.426e-02 5.557e-02 -0.257 0.7975
## A3_P -3.813e-05 3.127e-05 -1.219 0.2227
##
## Phi coefficients (precision model with identity link):
## Estimate Std. Error z value Pr(>|z|)
## (phi) 61.45 19.81 3.103 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 27.65 on 7 Df
## Pseudo R-squared: 0.4097
## Number of iterations: 30 (BFGS) + 2 (Fisher scoring)
b5 <- betareg(late_prop ~ Region + A1_P + A2_P + A3_P + A4_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b5)
##
## Call:
## betareg(formula = late_prop ~ Region + A1_P + A2_P + A3_P + A4_P, data = summary_filtered_BreastData_valid_vill_wmissing,
## link = "logit")
##
## Quantile residuals:
## Min 1Q Median 3Q Max
## -2.1418 -0.5453 -0.0220 0.4074 2.3491
##
## Coefficients (mean model with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.436e-01 1.438e+00 -0.308 0.7577
## RegionNorth 7.678e-02 2.198e-01 0.349 0.7269
## RegionSouth 3.658e-01 2.029e-01 1.802 0.0715 .
## A1_P 1.971e-02 2.975e-02 0.662 0.5077
## A2_P -1.754e-02 5.875e-02 -0.299 0.7653
## A3_P -4.354e-05 4.357e-05 -0.999 0.3176
## A4_P -6.176e-03 3.482e-02 -0.177 0.8592
##
## Phi coefficients (precision model with identity link):
## Estimate Std. Error z value Pr(>|z|)
## (phi) 61.55 19.84 3.103 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 27.67 on 8 Df
## Pseudo R-squared: 0.4112
## Number of iterations: 35 (BFGS) + 2 (Fisher scoring)
# Until no predictors are significant anymore...
b6 <- betareg(late_prop ~ Region + A1_P + A2_P + A3_P + A4_P + B1_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(b6)
##
## Call:
## betareg(formula = late_prop ~ Region + A1_P + A2_P + A3_P + A4_P + B1_P,
## data = summary_filtered_BreastData_valid_vill_wmissing, link = "logit")
##
## Quantile residuals:
## Min 1Q Median 3Q Max
## -1.9641 -0.5110 0.0652 0.3913 2.4291
##
## Coefficients (mean model with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.745e-01 1.417e+00 -0.335 0.738
## RegionNorth 8.787e-02 2.182e-01 0.403 0.687
## RegionSouth 3.238e-01 2.096e-01 1.545 0.122
## A1_P 1.278e-02 3.127e-02 0.409 0.683
## A2_P -5.000e-03 6.108e-02 -0.082 0.935
## A3_P -3.281e-05 4.626e-05 -0.709 0.478
## A4_P 4.977e-04 3.568e-02 0.014 0.989
## B1_P -3.516e-02 5.502e-02 -0.639 0.523
##
## Phi coefficients (precision model with identity link):
## Estimate Std. Error z value Pr(>|z|)
## (phi) 62.92 20.28 3.102 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 27.88 on 9 Df
## Pseudo R-squared: 0.4212
## Number of iterations: 29 (BFGS) + 3 (Fisher scoring)
####Comparing Models Without Random Effects (Betareg Models)###
# The lrtest() function in R is used to perform a Likelihood Ratio Test (LRT), which is a statistical test to compare two nested models. Nested models are models where one is a simpler version (i.e., a subset) of the other. This test evaluates whether the additional complexity of the more complicated model significantly improves the fit to the data.
#H0: The simpler model is sufficient to explain the data.
#HA: The more complex model provides a significantly better fit.
lrtest(b1)
## Likelihood ratio test
##
## Model 1: late_prop ~ SVI_ALL_SUM
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 3 24.726
## 2 2 22.365 -1 4.7231 0.02976 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b2)
## Likelihood ratio test
##
## Model 1: late_prop ~ SVI_ALL_SUM + Region
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 27.582
## 2 2 22.365 -3 10.435 0.01521 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b3)
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 26.884
## 2 2 22.365 -3 9.0378 0.02879 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b4)
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 7 27.653
## 2 2 22.365 -5 10.576 0.06045 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b5)
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 8 27.669
## 2 2 22.365 -6 10.608 0.1013
lrtest(b6)
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P + B1_P
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 9 27.876
## 2 2 22.365 -7 11.023 0.1376
lrtest(b0, b1) # Compare null model to model with SVI_ALL_SUM
## Likelihood ratio test
##
## Model 1: late_prop ~ 1
## Model 2: late_prop ~ SVI_ALL_SUM
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 2 22.365
## 2 3 24.726 1 4.7231 0.02976 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b1, b2) # Compare SVI_ALL_SUM to SVI_ALL_SUM + Region
## Likelihood ratio test
##
## Model 1: late_prop ~ SVI_ALL_SUM
## Model 2: late_prop ~ SVI_ALL_SUM + Region
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 3 24.726
## 2 5 27.582 2 5.7117 0.05751 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b2, b3) # Compare SVI_ALL_SUM + Region to Region + A1_P
## Likelihood ratio test
##
## Model 1: late_prop ~ SVI_ALL_SUM + Region
## Model 2: late_prop ~ Region + A1_P
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 27.582
## 2 5 26.884 0 1.3971 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b3, b4) # Compare Region to Region + A1_P to Region + A1_P + A2_P + A3_P
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P
## Model 2: late_prop ~ Region + A1_P + A2_P + A3_P
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 26.884
## 2 7 27.653 2 1.5388 0.4633
lrtest(b4, b5) # Compare Region + A1_P + A2_P + A3_P + Region to Region + A1_P + A2_P + A3_P + A4_P
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P
## Model 2: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 7 27.653
## 2 8 27.669 1 0.0319 0.8583
lrtest(b5, b6) # Compare Region + A1_P + A2_P + A3_P + A4_P + Region to Region + A1_P + A2_P + A3_P + A4_P + B1_P
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P
## Model 2: late_prop ~ Region + A1_P + A2_P + A3_P + A4_P + B1_P
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 8 27.669
## 2 9 27.876 1 0.4142 0.5199
# Comparing AIC
AIC(b0, b1, b2, b3, b4, b5, b6)
## df AIC
## b0 2 -40.72941
## b1 3 -43.45254
## b2 5 -45.16423
## b3 5 -43.76717
## b4 7 -41.30595
## b5 8 -39.33783
## b6 9 -37.75201
tab_model(b1, b2, b3, b4, b5, b6,
show.ci = FALSE, # remove CI
show.aic = TRUE, # display AIC
p.style = "numeric" # display p-values
)
|
Â
|
late_prop
|
late_prop
|
late_prop
|
late_prop
|
late_prop
|
late_prop
|
|
Predictors
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
|
(Intercept)
|
0.21
|
<0.001
|
0.19
|
<0.001
|
0.18
|
<0.001
|
0.53
|
0.497
|
0.64
|
0.758
|
0.62
|
0.738
|
|
SVI ALL SUM
|
1.04
|
0.023
|
1.04
|
0.020
|
|
|
|
|
|
|
|
|
|
Region [North]
|
|
|
1.08
|
0.709
|
1.18
|
0.398
|
1.09
|
0.688
|
1.08
|
0.727
|
1.09
|
0.687
|
|
Region [South]
|
|
|
1.43
|
0.012
|
1.50
|
0.006
|
1.44
|
0.072
|
1.44
|
0.072
|
1.38
|
0.122
|
|
A1 P
|
|
|
|
|
1.03
|
0.057
|
1.02
|
0.508
|
1.02
|
0.508
|
1.01
|
0.683
|
|
A2 P
|
|
|
|
|
|
|
0.99
|
0.797
|
0.98
|
0.765
|
1.00
|
0.935
|
|
A3 P
|
|
|
|
|
|
|
1.00
|
0.223
|
1.00
|
0.318
|
1.00
|
0.478
|
|
A4 P
|
|
|
|
|
|
|
|
|
0.99
|
0.859
|
1.00
|
0.989
|
|
B1 P
|
|
|
|
|
|
|
|
|
|
|
0.97
|
0.523
|
|
Observations
|
19
|
19
|
19
|
19
|
19
|
19
|
|
R2
|
0.218
|
0.400
|
0.360
|
0.410
|
0.411
|
0.421
|
|
AIC
|
-43.453
|
-45.164
|
-43.767
|
-41.306
|
-39.338
|
-37.752
|
###########################################################################################
# We will now run the same model (same predictors) but now with a random intercept for each village to account for clustering by allowing the intercept to vary by village and adjusting to account for dependencies within villages (e.g., individuals within the same village sharing similar characteristics).
b7 <- glmmTMB(late_prop ~ SVI_ALL_SUM + (1 | DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing)
summary(b7)
## Family: beta ( logit )
## Formula: late_prop ~ SVI_ALL_SUM + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -41.5 -37.7 24.7 -49.5 15
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## DxCity_num_recode (Intercept) 3.174e-09 5.634e-05
## Number of obs: 19, groups: DxCity_num_recode, 19
##
## Dispersion parameter for beta family (): 45.1
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.56180 0.29780 -5.245 1.57e-07 ***
## SVI_ALL_SUM 0.04329 0.01893 2.287 0.0222 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b8 <- glmmTMB(late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing)
summary(b8)
## Family: beta ( logit )
## Formula: late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -43.2 -37.5 27.6 -55.2 13
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## DxCity_num_recode (Intercept) 2.506e-10 1.583e-05
## Number of obs: 19, groups: DxCity_num_recode, 19
##
## Dispersion parameter for beta family (): 61
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.66749 0.26456 -6.303 2.92e-10 ***
## SVI_ALL_SUM 0.04027 0.01723 2.337 0.0195 *
## RegionNorth 0.07366 0.19769 0.373 0.7095
## RegionSouth 0.35913 0.14215 2.526 0.0115 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b9 <- glmmTMB(late_prop ~ Region + A1_P + (1 | DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing)
summary(b9)
## Family: beta ( logit )
## Formula: late_prop ~ Region + A1_P + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -41.8 -36.1 26.9 -53.8 13
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## DxCity_num_recode (Intercept) 5.309e-10 2.304e-05
## Number of obs: 19, groups: DxCity_num_recode, 19
##
## Dispersion parameter for beta family (): 56.6
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.72874 0.33770 -5.119 3.07e-07 ***
## RegionNorth 0.16587 0.19642 0.844 0.39842
## RegionSouth 0.40446 0.14607 2.769 0.00563 **
## A1_P 0.02928 0.01491 1.964 0.04952 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b10 <- glmmTMB(late_prop ~ Region + A1_P + A2_P + (1 | DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing)
summary(b10)
## Family: beta ( logit )
## Formula: late_prop ~ Region + A1_P + A2_P + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -39.8 -33.2 26.9 -53.8 12
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## DxCity_num_recode (Intercept) 5.815e-10 2.411e-05
## Number of obs: 19, groups: DxCity_num_recode, 19
##
## Dispersion parameter for beta family (): 56.9
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.72264 0.33723 -5.108 3.25e-07 ***
## RegionNorth 0.18568 0.20782 0.893 0.3716
## RegionSouth 0.36166 0.20956 1.726 0.0844 .
## A1_P 0.02333 0.02563 0.910 0.3626
## A2_P 0.01486 0.05213 0.285 0.7756
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Again it converges to insignificant...
b11 <- glmmTMB(late_prop ~ Region + A1_P + A2_P + A3_P + (1 | DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing)
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in finalizeTMB(TMBStruc, obj, fit, h, data.tmb.old): failed to invert
## Hessian from numDeriv::jacobian(), falling back to internal vcov estimate
summary(b11)
## Family: beta ( logit )
## Formula:
## late_prop ~ Region + A1_P + A2_P + A3_P + (1 | DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -39.3 -31.8 27.7 -55.3 11
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## DxCity_num_recode (Intercept) 3.687e-10 1.92e-05
## Number of obs: 19, groups: DxCity_num_recode, 19
##
## Dispersion parameter for beta family (): 61.4
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.383e-01 NaN NaN NaN
## RegionNorth 8.593e-02 NaN NaN NaN
## RegionSouth 3.644e-01 NaN NaN NaN
## A1_P 1.721e-02 NaN NaN NaN
## A2_P -1.426e-02 NaN NaN NaN
## A3_P -3.813e-05 NaN NaN NaN
###Comparing Models With Random Effects (glmmTMB Models)###
lrtest(b7)
## Likelihood ratio test
##
## Model 1: late_prop ~ SVI_ALL_SUM + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 4 24.726
## 2 2 22.365 -2 4.7231 0.09427 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b8)
## Likelihood ratio test
##
## Model 1: late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 6 27.582
## 2 2 22.365 -4 10.435 0.03371 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b9)
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 6 26.884
## 2 2 22.365 -4 9.0378 0.06016 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(b10)
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P + A2_P + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 7 26.924
## 2 2 22.365 -5 9.119 0.1044
lrtest(b11)
## Likelihood ratio test
##
## Model 1: late_prop ~ Region + A1_P + A2_P + A3_P + (1 | DxCity_num_recode)
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 8 27.653
## 2 2 22.365 -6 10.576 0.1024
AIC(b7, b8, b9, b10, b11)
## df AIC
## b7 4 -41.45254
## b8 6 -43.16423
## b9 6 -41.76717
## b10 7 -39.84838
## b11 8 -39.30595
tab_model(b7, b8, b9, b10, b11,
show.ci = FALSE, # remove CI
show.aic = TRUE, # display AIC
p.style = "numeric" # display p-values
)
|
Â
|
late_prop
|
late_prop
|
late_prop
|
late_prop
|
late_prop
|
|
Predictors
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
|
(Intercept)
|
0.21
|
<0.001
|
0.19
|
<0.001
|
0.18
|
<0.001
|
0.18
|
<0.001
|
0.53
|
NaN
|
|
SVI ALL SUM
|
1.04
|
0.022
|
1.04
|
0.019
|
|
|
|
|
|
|
|
Region [North]
|
|
|
1.08
|
0.709
|
1.18
|
0.398
|
1.20
|
0.372
|
1.09
|
NaN
|
|
Region [South]
|
|
|
1.43
|
0.012
|
1.50
|
0.006
|
1.44
|
0.084
|
1.44
|
NaN
|
|
A1 P
|
|
|
|
|
1.03
|
0.050
|
1.02
|
0.363
|
1.02
|
NaN
|
|
A2 P
|
|
|
|
|
|
|
1.01
|
0.776
|
0.99
|
NaN
|
|
A3 P
|
|
|
|
|
|
|
|
|
1.00
|
NaN
|
|
Random Effects
|
|
σ2
|
0.05
|
0.04
|
0.04
|
0.04
|
0.04
|
|
τ00
|
0.00 DxCity_num_recode
|
0.00 DxCity_num_recode
|
0.00 DxCity_num_recode
|
0.00 DxCity_num_recode
|
0.00 DxCity_num_recode
|
|
N
|
19 DxCity_num_recode
|
19 DxCity_num_recode
|
19 DxCity_num_recode
|
19 DxCity_num_recode
|
19 DxCity_num_recode
|
|
Observations
|
19
|
19
|
19
|
19
|
19
|
|
Marginal R2 / Conditional R2
|
0.372 / NA
|
0.613 / NA
|
0.563 / NA
|
0.565 / NA
|
0.616 / NA
|
|
AIC
|
-41.453
|
-43.164
|
-41.767
|
-39.848
|
-39.306
|
###########################################################################################
# We then move into regional-level by using a nesting model with random intercepts for village nested within region.
b12 <- glmmTMB(
late_prop ~ SVI_ALL_SUM + (1 | Region/DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing
)
summary(b12)
## Family: beta ( logit )
## Formula: late_prop ~ SVI_ALL_SUM + (1 | Region/DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -40.3 -35.6 25.1 -50.3 14
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## DxCity_num_recode:Region (Intercept) 5.277e-10 2.297e-05
## Region (Intercept) 1.405e-02 1.185e-01
## Number of obs: 19, groups: DxCity_num_recode:Region, 19; Region, 3
##
## Dispersion parameter for beta family (): 52.3
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.52943 0.29591 -5.169 2.36e-07 ***
## SVI_ALL_SUM 0.04112 0.01806 2.276 0.0228 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b13 <- glmmTMB(
late_prop ~ A1_P + (1 | Region/DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing
)
summary(b13)
## Family: beta ( logit )
## Formula: late_prop ~ A1_P + (1 | Region/DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -38.6 -33.9 24.3 -48.6 14
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## DxCity_num_recode:Region (Intercept) 1.560e-09 3.949e-05
## Region (Intercept) 1.876e-02 1.370e-01
## Number of obs: 19, groups: DxCity_num_recode:Region, 19; Region, 3
##
## Dispersion parameter for beta family (): 48.8
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.53997 0.36909 -4.172 3.01e-05 ***
## A1_P 0.02928 0.01605 1.825 0.0681 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
b14 <- glmmTMB(
late_prop ~ A1_P + A2_P + (1 | Region/DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing
)
summary(b14)
## Family: beta ( logit )
## Formula: late_prop ~ A1_P + A2_P + (1 | Region/DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -38.4 -32.8 25.2 -50.4 13
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## DxCity_num_recode:Region (Intercept) 1.061e-02 1.030e-01
## Region (Intercept) 8.443e-11 9.188e-06
## Number of obs: 19, groups: DxCity_num_recode:Region, 19; Region, 3
##
## Dispersion parameter for beta family (): 53.1
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.624220 0.364165 -4.460 8.19e-06 ***
## A1_P 0.005382 0.021355 0.252 0.8010
## A2_P 0.064839 0.035672 1.818 0.0691 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Converges insignificantly...
b15 <- glmmTMB(
late_prop ~ A1_P + A2_P + A3_P + (1 | Region/DxCity_num_recode),
family = beta_family(link = "logit"),
data = summary_filtered_BreastData_valid_vill_wmissing
)
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in (function (start, objective, gradient = NULL, hessian = NULL, :
## NA/NaN function evaluation
## Warning in finalizeTMB(TMBStruc, obj, fit, h, data.tmb.old): failed to invert
## Hessian from numDeriv::jacobian(), falling back to internal vcov estimate
summary(b15)
## Family: beta ( logit )
## Formula:
## late_prop ~ A1_P + A2_P + A3_P + (1 | Region/DxCity_num_recode)
## Data: summary_filtered_BreastData_valid_vill_wmissing
##
## AIC BIC logLik deviance df.resid
## -38.2 -31.6 26.1 -52.2 12
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev.
## DxCity_num_recode:Region (Intercept) 3.228e-03 5.681e-02
## Region (Intercept) 9.628e-11 9.812e-06
## Number of obs: 19, groups: DxCity_num_recode:Region, 19; Region, 3
##
## Dispersion parameter for beta family (): 54
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.161e-01 NaN NaN NaN
## A1_P -6.704e-03 NaN NaN NaN
## A2_P 4.280e-02 NaN NaN NaN
## A3_P -4.227e-05 NaN NaN NaN
###Comparing Models Across Model Types (Betareg vs. glmmTMB)###
lrtest(b12)
## Likelihood ratio test
##
## Model 1: late_prop ~ SVI_ALL_SUM + (1 | Region/DxCity_num_recode)
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 25.142
## 2 2 22.365 -3 5.5543 0.1354
lrtest(b13)
## Likelihood ratio test
##
## Model 1: late_prop ~ A1_P + (1 | Region/DxCity_num_recode)
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 24.309
## 2 2 22.365 -3 3.888 0.2738
lrtest(b14)
## Likelihood ratio test
##
## Model 1: late_prop ~ A1_P + A2_P + (1 | Region/DxCity_num_recode)
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 6 25.217
## 2 2 22.365 -4 5.7045 0.2223
lrtest(b15)
## Likelihood ratio test
##
## Model 1: late_prop ~ A1_P + A2_P + A3_P + (1 | Region/DxCity_num_recode)
## Model 2: late_prop ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 7 26.095
## 2 2 22.365 -5 7.4603 0.1886
AIC(b12, b13, b14, b15)
## df AIC
## b12 5 -40.28370
## b13 5 -38.61745
## b14 6 -38.43395
## b15 7 -38.18967
tab_model(b12, b13, b14, b15,
show.ci = FALSE, # remove CI
show.aic = TRUE, # display AIC
p.style = "numeric" # display p-values
)
|
Â
|
late_prop
|
late_prop
|
late_prop
|
late_prop
|
|
Predictors
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
|
(Intercept)
|
0.22
|
<0.001
|
0.21
|
<0.001
|
0.20
|
<0.001
|
0.66
|
NaN
|
|
SVI ALL SUM
|
1.04
|
0.023
|
|
|
|
|
|
|
|
A1 P
|
|
|
1.03
|
0.068
|
1.01
|
0.801
|
0.99
|
NaN
|
|
A2 P
|
|
|
|
|
1.07
|
0.069
|
1.04
|
NaN
|
|
A3 P
|
|
|
|
|
|
|
1.00
|
NaN
|
|
Random Effects
|
|
σ2
|
0.05
|
0.05
|
0.06
|
0.05
|
|
τ00
|
0.00 DxCity_num_recode:Region
|
0.00 DxCity_num_recode:Region
|
0.01 DxCity_num_recode:Region
|
0.00 DxCity_num_recode:Region
|
|
|
0.01 Region
|
0.02 Region
|
0.00 Region
|
0.00 Region
|
|
N
|
19 DxCity_num_recode
|
19 DxCity_num_recode
|
19 DxCity_num_recode
|
19 DxCity_num_recode
|
|
|
3 Region
|
3 Region
|
3 Region
|
3 Region
|
|
Observations
|
19
|
19
|
19
|
19
|
|
Marginal R2 / Conditional R2
|
0.384 / NA
|
0.261 / NA
|
0.396 / NA
|
0.502 / NA
|
|
AIC
|
-40.284
|
-38.617
|
-38.434
|
-38.190
|
###Comparison across different models###
tab_model(b2, b8, b12,
show.ci = FALSE, # remove CI
show.aic = TRUE, # display AIC
p.style = "numeric" # display p-values
)
|
Â
|
late_prop
|
late_prop
|
late_prop
|
|
Predictors
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
|
(Intercept)
|
0.19
|
<0.001
|
0.19
|
<0.001
|
0.22
|
<0.001
|
|
SVI ALL SUM
|
1.04
|
0.020
|
1.04
|
0.019
|
1.04
|
0.023
|
|
Region [North]
|
1.08
|
0.709
|
1.08
|
0.709
|
|
|
|
Region [South]
|
1.43
|
0.012
|
1.43
|
0.012
|
|
|
|
Random Effects
|
|
σ2
|
Â
|
0.04
|
0.05
|
|
τ00
|
Â
|
0.00 DxCity_num_recode
|
0.00 DxCity_num_recode:Region
|
|
|
Â
|
Â
|
0.01 Region
|
|
N
|
Â
|
19 DxCity_num_recode
|
19 DxCity_num_recode
|
|
|
Â
|
Â
|
3 Region
|
|
Observations
|
19
|
19
|
19
|
|
R2
|
0.400
|
0.613 / NA
|
0.384 / NA
|
|
AIC
|
-45.164
|
-43.164
|
-40.284
|
# Likelihood Ratio Test: Compare nested models to see if adding random effects significantly improves fit
lrtest(b2, b8) # Compare fixed effects vs. random intercepts for Village
## Warning in modelUpdate(objects[[i - 1]], objects[[i]]): original model was of
## class "betareg", updated model is of class "glmmTMB"
## Likelihood ratio test
##
## Model 1: late_prop ~ SVI_ALL_SUM + Region
## Model 2: late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 27.582
## 2 6 27.582 1 0 0.9999
lrtest(b8, b12) # Compare random intercepts for Village vs. nested Region/Village
## Likelihood ratio test
##
## Model 1: late_prop ~ SVI_ALL_SUM + Region + (1 | DxCity_num_recode)
## Model 2: late_prop ~ SVI_ALL_SUM + (1 | Region/DxCity_num_recode)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 6 27.582
## 2 5 25.142 -1 4.8805 0.02716 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-Hoc Analysis for Region: Conduct pairwise comparisons for Region to test specific differences (can't run)
# summary(glht(b2, linfct = mcp(Region = "Tukey")))
### Top 3 Models Overall ###
tab_model(b1, b2, b3,
show.ci = FALSE, # remove CI
show.aic = TRUE, # display AIC
p.style = "numeric" # display p-values
)
|
Â
|
late_prop
|
late_prop
|
late_prop
|
|
Predictors
|
Estimates
|
p
|
Estimates
|
p
|
Estimates
|
p
|
|
(Intercept)
|
0.21
|
<0.001
|
0.19
|
<0.001
|
0.18
|
<0.001
|
|
SVI ALL SUM
|
1.04
|
0.023
|
1.04
|
0.020
|
|
|
|
Region [North]
|
|
|
1.08
|
0.709
|
1.18
|
0.398
|
|
Region [South]
|
|
|
1.43
|
0.012
|
1.50
|
0.006
|
|
A1 P
|
|
|
|
|
1.03
|
0.057
|
|
Observations
|
19
|
19
|
19
|
|
R2
|
0.218
|
0.400
|
0.360
|
|
AIC
|
-43.453
|
-45.164
|
-43.767
|
###THIS MODEL?###
predictors_no_region <- betareg(late_prop ~ A1_P + A2_P + A3_P + A4_P + B1_P + B2_P + B3_P + B4_P + C1_NEW_P + C2_P + D1_P + D2_P + D3_P + D4_P + H1_P + H2_P, data=summary_filtered_BreastData_valid_vill_wmissing, link="logit")
summary(predictors_no_region)
##
## Call:
## betareg(formula = late_prop ~ A1_P + A2_P + A3_P + A4_P + B1_P + B2_P +
## B3_P + B4_P + C1_NEW_P + C2_P + D1_P + D2_P + D3_P + D4_P + H1_P +
## H2_P, data = summary_filtered_BreastData_valid_vill_wmissing, link = "logit")
##
## Quantile residuals:
## Min 1Q Median 3Q Max
## -1.5164 -1.0231 0.0097 0.7886 1.9687
##
## Coefficients (mean model with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.454e+00 3.971e+00 -1.877 0.0605 .
## A1_P 9.609e-02 6.587e-02 1.459 0.1447
## A2_P -1.476e-01 1.231e-01 -1.199 0.2304
## A3_P 8.162e-05 6.087e-05 1.341 0.1800
## A4_P -1.086e-01 6.110e-02 -1.777 0.0755 .
## B1_P -1.701e-01 2.188e-01 -0.777 0.4369
## B2_P 5.698e-02 5.228e-02 1.090 0.2758
## B3_P 3.253e-01 2.038e-01 1.596 0.1104
## B4_P -1.029e-01 1.148e-01 -0.896 0.3702
## C1_NEW_P 4.542e-02 2.453e-02 1.852 0.0641 .
## C2_P -7.444e-02 3.882e-01 -0.192 0.8479
## D1_P -3.586e-02 1.585e-02 -2.262 0.0237 *
## D2_P 2.775e-01 1.731e-01 1.603 0.1089
## D3_P 1.029e-01 4.587e-02 2.242 0.0249 *
## D4_P 1.357e-01 9.598e-02 1.414 0.1573
## H1_P -2.184e-01 2.654e-01 -0.823 0.4106
## H2_P 1.161e-02 7.896e-03 1.470 0.1416
##
## Phi coefficients (precision model with identity link):
## Estimate Std. Error z value Pr(>|z|)
## (phi) 169.66 54.91 3.09 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Type of estimator: ML (maximum likelihood)
## Log-likelihood: 37.3 on 18 Df
## Pseudo R-squared: 0.7894
## Number of iterations: 72 (BFGS) + 2 (Fisher scoring)
AIC(predictors_no_region)
## [1] -38.59955
# Chisquare Test for Independence
age_data <- matrix(c(10, 1, 51, 14, 63, 29, 94, 18, 53, 14, 36, 21),
ncol = 2, byrow = TRUE)
colnames(age_data) <- c("Late", "Unstaged")
rownames(age_data) <- c("25-34", "35-44", "45-54", "55-64", "65-74", "75+")
age_data <- as.table(age_data)
age_data
## Late Unstaged
## 25-34 10 1
## 35-44 51 14
## 45-54 63 29
## 55-64 94 18
## 65-74 53 14
## 75+ 36 21
chisq.test(age_data)$expected
## Warning in chisq.test(age_data): Chi-squared approximation may be incorrect
## Late Unstaged
## 25-34 8.358911 2.641089
## 35-44 49.393564 15.606436
## 45-54 69.910891 22.089109
## 55-64 85.108911 26.891089
## 65-74 50.913366 16.086634
## 75+ 43.314356 13.685644
chisq.test(age_data)
## Warning in chisq.test(age_data): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: age_data
## X-squared = 13.774, df = 5, p-value = 0.01711
# Because its statistically significant, there indicates a difference in the distribution between groups. We can further asses which groups are statistically significnat within age groups using a proportion test
# Compare proportions for 25-34 vs. 35-44
prop.test(x = c(10, 51), n = c(19, 106))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(10, 51) out of c(19, 106)
## X-squared = 0.012913, df = 1, p-value = 0.9095
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.2296782 0.3200456
## sample estimates:
## prop 1 prop 2
## 0.5263158 0.4811321
prop.test(x = c(51, 63), n = c(106, 198))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(51, 63) out of c(106, 198)
## X-squared = 7.1418, df = 1, p-value = 0.007531
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.04057287 0.28532765
## sample estimates:
## prop 1 prop 2
## 0.4811321 0.3181818
prop.test(x = c(63, 94), n = c(198, 237))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(63, 94) out of c(198, 237)
## X-squared = 2.5478, df = 1, p-value = 0.1104
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.17301047 0.01612516
## sample estimates:
## prop 1 prop 2
## 0.3181818 0.3966245
prop.test(x = c(94, 53), n = c(237, 179))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(94, 53) out of c(237, 179)
## X-squared = 4.0816, df = 1, p-value = 0.04335
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.004243908 0.196826266
## sample estimates:
## prop 1 prop 2
## 0.3966245 0.2960894
prop.test(x = c(53, 36), n = c(179, 116))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(53, 36) out of c(179, 116)
## X-squared = 0.017089, df = 1, p-value = 0.896
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.1288798 0.1003689
## sample estimates:
## prop 1 prop 2
## 0.2960894 0.3103448
ethnic_data <- matrix(c(146, 37, 85, 28, 23, 6, 22, 4, 14, 8),
ncol = 2, byrow = TRUE)
colnames(ethnic_data) <- c("Late", "Unstaged")
rownames(ethnic_data) <- c("CHamoru", "FIlipino", "Micronesian", "Asian", "White")
ethnic_data <- as.table(ethnic_data)
ethnic_data
## Late Unstaged
## CHamoru 146 37
## FIlipino 85 28
## Micronesian 23 6
## Asian 22 4
## White 14 8
# Perform chi-square test
# But first let's assess the assumptions for chisquare
chisq.test(ethnic_data)$expected
## Warning in chisq.test(ethnic_data): Chi-squared approximation may be incorrect
## Late Unstaged
## CHamoru 142.27882 40.721180
## FIlipino 87.85523 25.144772
## Micronesian 22.54692 6.453083
## Asian 20.21448 5.785523
## White 17.10456 4.895442
chisq.test(ethnic_data)
## Warning in chisq.test(ethnic_data): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: ethnic_data
## X-squared = 4.1364, df = 4, p-value = 0.3879
# Example logistic regression for Age predicting Late
data <- data.frame(
Stage = c(rep("Late", 6), rep("Unstaged", 6)),
Age = rep(c("25-34", "35-44", "45-54", "55-64", "65-74", "75+"), 2),
Count = c(10, 51, 63, 94, 53, 36, 1, 14, 29, 18, 14, 21)
)
data$Stage <- factor(data$Stage, levels = c("Unstaged", "Late"))
data$Age <- factor(data$Age, levels = unique(data$Age))
model <- glm(Stage ~ Age, family = binomial, weights = Count, data = data)
summary(model)
##
## Call:
## glm(formula = Stage ~ Age, family = binomial, data = data, weights = Count)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.3026 1.0488 2.195 0.0281 *
## Age35-44 -1.0098 1.0913 -0.925 0.3548
## Age45-54 -1.5267 1.0725 -1.423 0.1546
## Age55-64 -0.6497 1.0799 -0.602 0.5474
## Age65-74 -0.9714 1.0910 -0.890 0.3733
## Age75+ -1.7636 1.0842 -1.627 0.1038
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 445.36 on 11 degrees of freedom
## Residual deviance: 431.56 on 6 degrees of freedom
## AIC: 443.56
##
## Number of Fisher Scoring iterations: 5
library(ggplot2)
age_data_prop <- data.frame(
Age = c("25-34", "35-44", "45-54", "55-64", "65-74", "75+"),
Late = c(52.6, 48.1, 31.8, 39.7, 29.6, 31.0),
Unstaged = c(5.3, 13.2, 14.6, 7.6, 7.8, 18.1)
)
age_data_prop_long <- tidyr::pivot_longer(age_data_prop, cols = c(Late, Unstaged), names_to = "Stage", values_to = "Percentage")
ggplot(age_data_prop_long, aes(x = Age, y = Percentage, fill = Stage)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Late and Unstaged Cases by Age Group", y = "Percentage", x = "Age Group") +
theme_minimal()

library(ggplot2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
# Data for ethnicity
ethnic_data <- data.frame(
Ethnicity = c("CHamoru", "Filipino", "Micronesian", "Asian", "White"),
Late = c(36.4, 34.0, 46.9, 43.1, 31.1),
Unstaged = c(9.2, 11.2, 12.2, 7.8, 17.8)
)
# Convert data to long format
ethnic_data_long <- tidyr::pivot_longer(ethnic_data, cols = c(Late, Unstaged),
names_to = "Stage", values_to = "Percentage")
# Plot
ggplot(ethnic_data_long, aes(x = Ethnicity, y = Percentage, fill = Stage)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Late and Unstaged Cases by Ethnicity",
y = "Percentage", x = "Ethnicity") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
