Final Project Proposal

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#set working directory
setwd("/Users/samriddhi/Desktop/DrPH/HEPI_553_Muntasir Musum/project proposal/PRAMS Phase 8")
getwd()

## [1] "/Users/samriddhi/Desktop/DrPH/HEPI_553_Muntasir Musum/project proposal/PRAMS Phase 8"

data <- read_excel("@OH_SR.xlsx")
data <- read_excel("@OH_SR.xlsx", sheet = 1)

#save file
save(data, file = "OH_SR.rda")

#Inspect dataset
dim(data)

## [1] 1378   15

names(data)

##  [1] "Region"                              "Topic"                              
##  [3] "Title"                               "Group"                              
##  [5] "Characteristics"                     "Year"                               
##  [7] "Respondents"                         "Estimated number of people affected"
##  [9] "Percentage"                          "Confidence interval"                
## [11] "Footnotes1"                          "Footnotes2"                         
## [13] "Footnotes3"                          "Footnotes4"                         
## [15] "Footnotes5"

unique(data$Title)

## [1] "Percentage of women who reported having their teeth cleaned during pregnancy"                                           
## [2] "Percentage of women who talked with a dentist or other health care worker about care of teeth and gums during pregnancy"
## [3] "Percentage of women who reported a health care visit to have their teeth cleaned before pregnancy^"

summary(data)

##     Region             Topic              Title              Group          
##  Length:1378        Length:1378        Length:1378        Length:1378       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Characteristics         Year       Respondents    
##  Length:1378        Min.   :2015   Min.   :  11.0  
##  Class :character   1st Qu.:2017   1st Qu.: 185.0  
##  Mode  :character   Median :2019   Median : 419.0  
##                     Mean   :2019   Mean   : 518.7  
##                     3rd Qu.:2021   3rd Qu.: 757.0  
##                     Max.   :2023   Max.   :2372.0  
##                                    NA's   :53      
##  Estimated number of people affected  Percentage        Confidence interval
##  Length:1378                         Length:1378        Length:1378        
##  Class :character                    Class :character   Class :character   
##  Mode  :character                    Mode  :character   Mode  :character   
##                                                                            
##                                                                            
##                                                                            
##                                                                            
##  Footnotes1      Footnotes2         Footnotes3         Footnotes4       
##  Mode:logical   Length:1378        Length:1378        Length:1378       
##  NA's:1378      Class :character   Class :character   Class :character  
##                 Mode  :character   Mode  :character   Mode  :character  
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##   Footnotes5       
##  Length:1378       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

#Cleaning dataset-Step 1
#remove unnecessary column
prams_data <- data %>%
  select(
    -Topic, 
    -Percentage, 
    -`Confidence interval`, 
    -Footnotes1, -Footnotes2, -Footnotes3, -Footnotes4, -Footnotes5
  ) %>%  
filter(Title != "Percentage of women who talked with a dentist or other health care worker about care of teeth and gums during pregnancy") # remove unwanted Title

#Check the data
unique(prams_data$Title)

## [1] "Percentage of women who reported having their teeth cleaned during pregnancy"                      
## [2] "Percentage of women who reported a health care visit to have their teeth cleaned before pregnancy^"

#rename Estimated number of people affected
prams_data <- prams_data %>%
rename(Est_people = `Estimated number of people affected`)

#checking dataset, Step 2
dim(prams_data)

## [1] 954   7

#Check missing values BEFORE cleaning: Step 2 
missing_before <- prams_data %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Missing_Observations")

kable(missing_before, caption = "Missing Values BEFORE Cleaning")

Missing Values BEFORE Cleaning
Variable	Missing_Observations
Region	0
Title	0
Group	0
Characteristics	0
Year	0
Respondents	53
Est_people	53

#Clean numeric columns (removes commas, spaces): Step 3
# Replace empty strings or non-numeric placeholders with NA
prams_data$Est_people[prams_data$Est_people == "" | prams_data$Est_people == "N/A"] <- NA
prams_data$Respondents[prams_data$Respondents == "" | prams_data$Respondents == "N/A"] <- NA
prams_data$Year[prams_data$Year == "" | prams_data$Year == "N/A"] <- NA

# Now convert to numeric
prams_data$Est_people <- as.numeric(gsub(",", "", prams_data$Est_people))

## Warning: NAs introduced by coercion

prams_data$Respondents <- as.numeric(prams_data$Respondents)
prams_data$Year <- as.numeric(prams_data$Year)

#Remove NA: Step 4 
prams_clean <- prams_data %>%
  drop_na(`Region`, `Title`, `Group`, `Characteristics`, `Year`,`Respondents`, `Est_people`, )

#Check missing values after cleaning: Step 5
missing_after <- prams_clean %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Missing_Observations")

kable(missing_after, caption = "Missing Values AFTER Cleaning")

Missing Values AFTER Cleaning
Variable	Missing_Observations
Region	0
Title	0
Group	0
Characteristics	0
Year	0
Respondents	0
Est_people	0

# Number rows were dropped
cat("Rows dropped due to missing values:", nrow(prams_data) - nrow(prams_clean), "\n")

## Rows dropped due to missing values: 103

## check cleaned dataset dimensions
dim(prams_clean)

## [1] 851   7

#save RDS file
saveRDS(prams_clean, here::here(
  "/Users/samriddhi/Desktop/DrPH/HEPI_553_Muntasir Musum/project proposal'.rds"))

write.csv(prams_clean, "/Users/samriddhi/Desktop/DrPH/HEPI_553_Muntasir Musum/project proposal/prams_clean.csv",
          row.names = FALSE)

# Split dataset by Title
# Women who cleaned teeth before pregnancy
prams_before <- prams_clean %>%
  filter(grepl("before pregnancy", Title, ignore.case = TRUE))

# Women who cleaned teeth during/after pregnancy
prams_after <- prams_clean %>%
  filter(grepl("during pregnancy", Title, ignore.case = TRUE))

#Descriptive statistics

total_cleaning <- prams_clean %>%
  filter(grepl("before pregnancy|during pregnancy", Title, ignore.case = TRUE)) %>%
  group_by(Title) %>%
  summarise(
    Respondents_N = sum(Respondents, na.rm = TRUE),
    Estimated_N = sum(Est_people, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    Respondents_Percent = round(Respondents_N / sum(Respondents_N) * 100, 1),
    Estimated_Percent = round(Estimated_N / sum(Estimated_N) * 100, 1),
    
    Respondents_Result = paste0(Respondents_N, " (", Respondents_Percent, "%)"),
    Estimated_Result = paste0(Estimated_N, " (", Estimated_Percent, "%)")
  ) %>%
  select(Title, Respondents_Result, Estimated_Result)

kable(total_cleaning,
      col.names = c("Teeth Cleaning",
                    "Respondents n (%)",
                    "Estimated Population n (%)"),
      caption = "Total Number and Percentage of Women Who Cleaned Teeth Before and During Pregnancy")

Total Number and Percentage of Women Who Cleaned Teeth Before and During Pregnancy
Teeth Cleaning	Respondents n (%)	Estimated Population n (%)
Percentage of women who reported a health care visit to have their teeth cleaned before pregnancy^	219983 (47%)	9009382 (46.2%)
Percentage of women who reported having their teeth cleaned during pregnancy	248454 (53%)	10486548 (53.8%)

# Demographic characteristics 
# Before pregnancy

preg_before <- prams_clean %>%
  filter(grepl("before pregnancy", Title, ignore.case = TRUE)) %>%
  filter(Group %in% c("Maternal Age", "Education", "Race/Ethnicity",
                      "Medicaid Status", "Region")) %>%
  group_by(Group, Characteristics) %>%
  summarise(
    Total_Respondents = sum(Respondents, na.rm = TRUE),
    Total_Estimated = sum(Est_people, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(Group) %>%
  mutate(
    Percent_Respondents = round(Total_Respondents / sum(Total_Respondents) * 100, 1),
    Percent_Estimated = round(Total_Estimated / sum(Total_Estimated) * 100, 1)
  ) %>%
  ungroup() %>%
  mutate(
    `Respondents (n, %)` = paste0(Total_Respondents, " (", Percent_Respondents, "%)"),
    `Estimated Population (n, %)` = paste0(Total_Estimated, " (", Percent_Estimated, "%)")
  ) %>%
  select(Group, Characteristics, `Respondents (n, %)`, `Estimated Population (n, %)`)

kable(preg_before,
      caption = "Table 1. Women Who Cleaned Teeth Before Pregnancy (PRAMS Phase 8)")

Table 1. Women Who Cleaned Teeth Before Pregnancy (PRAMS Phase 8)
Group	Characteristics	Respondents (n, %)	Estimated Population (n, %)
Education	High school graduate	7414 (21.9%)	228390 (16.5%)
Education	Less than high school	4372 (12.9%)	103870 (7.5%)
Education	More than high school	22030 (65.1%)	1053974 (76%)
Maternal Age	20-24 years old	4300 (12.7%)	150171 (10.9%)
Maternal Age	25-34 years old	19248 (57%)	796846 (57.6%)
Maternal Age	35 years old or more	9744 (28.8%)	416830 (30.2%)
Maternal Age	Less than 20 years old	483 (1.4%)	18582 (1.3%)
Medicaid Status	Not on Medicaid	18010 (53%)	929069 (66.8%)
Medicaid Status	On Medicaid	15966 (47%)	461954 (33.2%)
Race/Ethnicity	Asian, non-Hispanic	4298 (12.9%)	100324 (7.3%)
Race/Ethnicity	Black, non-Hispanic	5560 (16.7%)	147859 (10.8%)
Race/Ethnicity	Hispanic	8338 (25%)	239618 (17.5%)
Race/Ethnicity	Other, non-Hispanic	591 (1.8%)	23409 (1.7%)
Race/Ethnicity	White, non-Hispanic	14554 (43.7%)	856837 (62.6%)
Region	NYC	10497 (61.7%)	289734 (41.6%)
Region	NYS excl NYC	6518 (38.3%)	406596 (58.4%)

#Demographic characteristics 
#During pregnancy
preg_during <- prams_clean %>%
  filter(grepl("during pregnancy", Title, ignore.case = TRUE)) %>%
  filter(Group %in% c("Maternal Age", "Education", "Race/Ethnicity",
                      "Medicaid Status", "Region")) %>%
  group_by(Group, Characteristics) %>%
  summarise(
    Total_Respondents = sum(Respondents, na.rm = TRUE),
    Total_Estimated = sum(Est_people, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(Group) %>%
  mutate(
    Percent_Respondents = round(Total_Respondents / sum(Total_Respondents) * 100, 1),
    Percent_Estimated = round(Total_Estimated / sum(Total_Estimated) * 100, 1)
  ) %>%
  ungroup() %>%
  mutate(
    `Respondents (n, %)` = paste0(Total_Respondents, " (", Percent_Respondents, "%)"),
    `Estimated Population (n, %)` = paste0(Total_Estimated, " (", Percent_Estimated, "%)")
  ) %>%
  select(Group, Characteristics, `Respondents (n, %)`, `Estimated Population (n, %)`)

kable(preg_during,
      caption = "Table 2. Women Who Cleaned Teeth During Pregnancy (PRAMS Phase 8)")

Table 2. Women Who Cleaned Teeth During Pregnancy (PRAMS Phase 8)
Group	Characteristics	Respondents (n, %)	Estimated Population (n, %)
Education	High school graduate	8284 (21.7%)	308474 (19.1%)
Education	Less than high school	4992 (13.1%)	189868 (11.8%)
Education	More than high school	24940 (65.3%)	1116175 (69.1%)
Maternal Age	20-24 years old	5008 (13.1%)	199627 (12.4%)
Maternal Age	25-34 years old	21794 (57.1%)	924531 (57.4%)
Maternal Age	35 years old or more	10758 (28.2%)	461272 (28.6%)
Maternal Age	Less than 20 years old	625 (1.6%)	24812 (1.5%)
Medicaid Status	Not on Medicaid	20272 (52.8%)	991282 (61.2%)
Medicaid Status	On Medicaid	18112 (47.2%)	627988 (38.8%)
Race/Ethnicity	Asian, non-Hispanic	4820 (12.8%)	126107 (7.9%)
Race/Ethnicity	Black, non-Hispanic	6170 (16.4%)	164650 (10.4%)
Race/Ethnicity	Hispanic	9362 (24.9%)	364877 (22.9%)
Race/Ethnicity	Other, non-Hispanic	572 (1.5%)	24252 (1.5%)
Race/Ethnicity	White, non-Hispanic	16650 (44.3%)	910105 (57.2%)
Region	NYC	11814 (61.5%)	360422 (44.5%)
Region	NYS excl NYC	7405 (38.5%)	450084 (55.5%)

# data visualization
# Before pregnancy: Respondents
prams_before_res <- ggplot(prams_before, aes(x = Respondents)) +
  geom_histogram(bins = 30, fill = "darkgreen", color = "white") +
  labs(
    title = "Respondents Who Cleaned Teeth Before Pregnancy",
    x = "Number of Respondents",
    y = "Frequency"
  )
# show plot
prams_before_res

# Save the plot
ggsave("respondents_before_pregnancy.png", plot = prams_before_res,
       width = 7, height = 5, dpi = 300)

# Before pregnancy: Estimated population
prams_before_est <- ggplot(prams_before, aes(x = Est_people)) +
  geom_histogram(bins = 30, fill = "blue", color = "white") +
  labs(
    title = "Estimated Population Who Cleaned Teeth Before Pregnancy",
    x = "Estimated Population",
    y = "Frequency"
  )
# show plot
prams_before_est

# Save the plot
ggsave("respondents_before_pregnancy.png", plot = prams_before_est,
       width = 7, height = 5, dpi = 300)

# data visualization
# During pregnancy: Respondents
prams_after_res <- ggplot(prams_after, aes(x = Respondents)) +
  geom_histogram(bins = 30, fill = "cyan", color = "white") +
  labs(
    title = "Respondents Who Cleaned Teeth During Pregnancy",
    x = "Number of Respondents",
    y = "Frequency"
  )
# show plot
prams_after_res

# Save the plot
ggsave("respondents_before_pregnancy.png", plot = prams_after_res,
       width = 7, height = 5, dpi = 300)

# During pregnancy: Estimated population
prams_after_est= ggplot(prams_after, aes(x = Est_people)) +
  geom_histogram(bins = 30, fill = "brown", color = "white") +
  labs(
    title = "Estimated Population Who Cleaned Teeth During Pregnancy",
    x = "Estimated Population",
    y = "Frequency"
  )
# show plot
prams_after_est

# Save the plot
ggsave("respondents_before_pregnancy.png", plot = prams_after_est,
       width = 7, height = 5, dpi = 300)

#Data visualization: Association between the outcome and primary exposure
# Before pregnancy for: Education
edu_before <- prams_before %>%
  filter(Group == "Education")

plot_edu_before <- ggplot(edu_before, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Teeth Cleaning by Education Before Pregnancy in Estimated Population",
    x = "Education Level",
    y = "Estimated Number of People"
)
  
# Show the plot
print(plot_edu_before)

# Save the plot
ggsave("Edu & Before Pregnancy in Est.png", plot = plot_edu_before,
       width = 7, height = 5, dpi = 300)


#Before pregnancy: Maternal age
age_before <- prams_before %>%
  filter(Group == "Maternal Age")

plot_age_before <- ggplot(age_before, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Teeth Cleaning by Maternal Age Before Pregnancy in Estimated Population",
    x = "Maternal Age",
    y = "Est_people"
  )
# Show the plot
print(plot_age_before)

# Save the plot
ggsave("Age & Before Pregnancy in Est.png", plot = plot_age_before,
       width = 7, height = 5, dpi = 300)

#Before pregnancy: Medicaid status
medi_before <- prams_before %>%
  filter(Group == "Medicaid Status")

plot_medi_before<- ggplot(medi_before, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Teeth Cleaning by Medicaid Status Before Pregnancy in Estimated Population",
    x = "Medicaid Status",
    y = "Est_people"
  )
# Show the plot
print(plot_medi_before)

# Save the plot
ggsave("Medi & Before Pregnancy in Est.png", plot = plot_medi_before,
       width = 7, height = 5, dpi = 300)

#Before pregnancy: Ethnicity
ethi_before <- prams_before %>%
  filter(Group == "Race/Ethnicity")

plot_ethi_before <- ggplot(ethi_before, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Teeth Cleaning by Ethnicity Before Pregnancy in Estimated Population",
    x = "Race/Ethnicity",
    y = "Est_people"
  )
# Show the plot
print(plot_ethi_before)

# Save the plot
ggsave("Ethi & Before Pregnancy in Est.png", plot = plot_ethi_before,
       width = 7, height = 5, dpi = 300)

#Before pregnancy: Region
reg_before <- prams_before %>%
  filter(Group == "Region")

plot_reg_before <- ggplot(reg_before, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Teeth Cleaning by Region Before Pregnancy in Estimated Population",
    x = "Region",
    y = "Est_people"
  )
# Show the plot
print(plot_reg_before)

# Save the plot
ggsave("Reg & Before Pregnancy in Est.png", plot = plot_reg_before,
       width = 7, height = 5, dpi = 300)

#Data visualization: Association between the outcome and primary exposure
# During pregnancy for: Education
edu_after <- prams_after %>%
  filter(Group == "Education")

plot_edu_after <- ggplot(edu_after, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Teeth Cleaning by Education During Pregnancy in Estimated Population",
   x = "Education Level",
    y = "Estimated Number of People"
)
    
# Show the plot
print(plot_edu_after)

# Save the plot
ggsave("Edu & after Pregnancy in Est.png", plot = plot_edu_after,
       width = 7, height = 5, dpi = 300)

#During pregnancy: Maternal age
age_after <- prams_after %>%
  filter(Group == "Maternal Age")

plot_age_after <- ggplot(age_before, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Teeth Cleaning by Maternal Age During Pregnancy in Estimated Population",
    x = "Maternal Age",
    y = "Est_people"
  )
# Show the plot
print(plot_age_after)

# Save the plot
ggsave("age & after Pregnancy in Est.png", plot = plot_age_after,
       width = 7, height = 5, dpi = 300)

#During pregnancy: Medicaid status
medi_after <- prams_after %>%
  filter(Group == "Medicaid Status")

plot_medi_after= ggplot(medi_after, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Teeth Cleaning by Medicaid Status During Pregnancy in Estimated Population",
    x = "Medicaid Status",
    y = "Est_people"
  )
# Show the plot
print(plot_medi_after)

# Save the plot
ggsave("medi & after Pregnancy in Est.png", plot = plot_medi_after,
       width = 7, height = 5, dpi = 300)

#During pregnancy: Ethnicity
ethi_after <- prams_after %>%
  filter(Group == "Race/Ethnicity")

plot_ethi_after= ggplot(ethi_after, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Teeth Cleaning by Ethnicity Status During Pregnancy in Estimated Population",
    x = "Race/Ethnicity",
    y = "Est_people"
  )
# Show the plot
print(plot_ethi_after)

# Save the plot
ggsave("ethi & after Pregnancy in Est.png", plot = plot_ethi_after,
       width = 7, height = 5, dpi = 300)


#During pregnancy: Region
reg_after <- prams_after %>%
  filter(Group == "Region")

plot_reg_after<- ggplot(reg_after, aes(x = Characteristics, y = Est_people, fill = Characteristics)) +
  geom_boxplot() +
  labs(
    title = "Association Between Region and Teeth Cleaning During Pregnancy",
    x = "Region",
    y = "Est_people"
  )
# Show the plot
print(plot_reg_after)

# Save the plot
ggsave("reg & after Pregnancy in Est.png", plot = plot_reg_after,
       width = 7, height = 5, dpi = 300)

prams_before$Timing <- "Before Pregnancy"
prams_after$Timing <- "During Pregnancy"

combined <- bind_rows(prams_before, prams_after)

plot_data <- combined %>%
  filter(Group == "Medicaid Status")

plot_medicaid <- ggplot(plot_data, aes(x = Characteristics, y = Est_people, fill = Timing)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(
    title = "Dental Cleaning Before and During Pregnancy by Medicaid Status",
    x = "Medicaid Status",
    y = "Estimated Population",
    fill = "Timing"
  ) +
  theme_minimal()

# Show the plot
print(plot_medicaid)

# Save the plot
ggsave("medicaid_before_after_pregnancy.png", plot = plot_medicaid,
       width = 7, height = 5, dpi = 300)

#regression dataset for before pregnancy
pregbefore_reg <- prams_clean %>%
  filter(grepl("before pregnancy", Title, ignore.case = TRUE)) %>%
  filter(Group %in% c("Maternal Age", "Education", "Race/Ethnicity",
                      "Medicaid Status", "Region")) %>%
  mutate(
    year = as.numeric(Year),         # convert Year to numeric
    group = factor(Group),
    Characteristics = factor(Characteristics)
  )

#regression model before pregnancy
# Unadjusted model
model1_before <- lm(Est_people ~ year, data = pregbefore_reg)

# Adjusted model
model2_before <- lm(Est_people ~ year + group + Characteristics, data = pregbefore_reg)

#regression dataset for during pregnancy
pregduring_reg <- prams_clean %>%
  filter(grepl("before pregnancy", Title, ignore.case = TRUE)) %>%
  filter(Group %in% c("Maternal Age", "Education", "Race/Ethnicity",
                      "Medicaid Status", "Region")) %>%
  mutate(
    year = as.numeric(Year),         
    group = factor(Group),
    Characteristics = factor(Characteristics)
  )

#regression model during pregnancy
# Unadjusted model
model1_during <- lm(Est_people ~ year, data = pregduring_reg)

# Adjusted model
model2_during <- lm(Est_people ~ year + group + Characteristics, data = pregduring_reg)

#Before Pregnancy
# Unadjusted table: before pregnancy
tbl_model1_before <- tbl_regression(
  model1_before,
  estimate_fun = ~style_sigfig(.x, digits = 3)
) %>%
  modify_caption("Model 1: Unadjusted - Before Pregnancy")

# Adjusted table: before pregnancy
tbl_model2_before <- tbl_regression(
  model2_before,
  estimate_fun = ~style_sigfig(.x, digits = 3)
) %>%
  modify_caption("Model 2: Adjusted - Before Pregnancy (Group + Characteristics)")

# Display tables: before pregnancy
tbl_model1_before

Model 1: Unadjusted - Before Pregnancy
Characteristic	Beta	95% CI	p-value
year	61.1	-780, 902	0.9
Abbreviation: CI = Confidence Interval

tbl_model2_before

Model 2: Adjusted - Before Pregnancy (Group + Characteristics)
Characteristic	Beta	95% CI	p-value
year	-53.4	-497, 390	0.8
group
Education	—	—
Maternal Age	-37,658	-42,978, -32,339	<0.001
Medicaid Status	-24,668	-29,987, -19,348	<0.001
Race/Ethnicity	-8,214	-13,533, -2,895	0.003
Region	6,909	-614, 14,432	0.072
Characteristics
20-24 years old	—	—
25-34 years old	26,945	21,625, 32,264	<0.001
35 years old or more	11,111	5,791, 16,430	<0.001
Asian, non-Hispanic	-31,521	-36,841, -26,202	<0.001
Black, non-Hispanic	-29,541	-34,860, -24,221	<0.001
High school graduate	-34,399	-39,719, -29,080	<0.001
Hispanic	-25,717	-31,037, -20,398	<0.001
Less than 20 years old	-4,609	-11,327, 2,109	0.2
Less than high school	-39,588	-44,907, -34,268	<0.001
More than high school
Not on Medicaid	19,463	14,144, 24,782	<0.001
NYC	-14,608	-23,821, -5,394	0.002
NYS excl NYC
On Medicaid
Other, non-Hispanic	-34,041	-40,239, -27,843	<0.001
White, non-Hispanic
Abbreviation: CI = Confidence Interval

Interpretation For unadjusted table: Before pregnancy For each one year increase, the number of women visiting a dentist for teeth cleaning increases by 61, on average. However, this result is not statistically significant (pvalue>0.05, CI: -780 to 902). The CI is so large because the aggregated data among different demographic cells has a large variance.

For adjusted table: Before pregnancy In the adjusted model examining estimated numbers of women receiving dental cleaning before pregnancy, year was not significantly associated with dental cleaning after accounting for demographic factors (β = -53.4; 95% CI: -497, 390; p = 0.8), suggesting that the overall trend over time was negligible once differences across groups were considered.

Several demographic characteristics were significantly associated with dental cleaning:

Maternal Age: Compared with women under 20 years, those aged 25–34 years had an estimated 26,945 more women receiving dental cleaning (95% CI: 21,625, 32,264; p < 0.001), and those 35 years or older had 11,111 more women (95% CI: 5,791, 16,430; p < 0.001). Education: Compared with women with more than high school education (reference), women with high school or less education had substantially fewer women receiving dental cleaning. For example, women with less than high school education had 39,588 fewer women (95% CI: -44,907, -34,268; p < 0.001). Race/Ethnicity: Relative to non-Hispanic White women, non-Hispanic Black and Asian women had significantly fewer women receiving dental cleaning (β = -29,541 and -31,521, respectively; p < 0.001 for both). Medicaid Status: Women not on Medicaid had significantly more women receiving dental cleaning than those on Medicaid (β = 19,463; 95% CI: 14,144, 24,782; p < 0.001). Region: Compared with women in New York State excluding NYC, women in NYC had fewer women receiving dental cleaning (β = -14,608; 95% CI: -23,821, -5,394; p = 0.002).

Reference categories used in the model included: women with more than high school education, non-Hispanic White women, women on Medicaid, under 20 years old, and residents of NYS excluding NYC. All other beta estimates represent differences relative to these reference groups.

Interpretation: These results indicate that demographic factors — particularly maternal age, education, race/ethnicity, Medicaid status, and region — strongly influence dental cleaning before pregnancy, whereas there is no significant temporal trend once these factors are accounted for. This highlights persistent disparities in dental care utilization among subgroups of women.

#During Pregnancy
# Unadjusted table: during pregnancy
tbl_model1_during <- tbl_regression(
  model1_during,
  estimate_fun = ~style_sigfig(.x, digits = 3)
) %>%
  modify_caption("Model 1: Unadjusted - During Pregnancy")

# Adjusted table: during pregnancy
tbl_model2_during <- tbl_regression(
  model2_during,
  estimate_fun = ~style_sigfig(.x, digits = 3)
) %>%
  modify_caption("Model 2: Adjusted - During Pregnancy (Group + Characteristics)")

# Display tables during pregnancy
tbl_model1_during

Model 1: Unadjusted - During Pregnancy
Characteristic	Beta	95% CI	p-value
year	61.1	-780, 902	0.9
Abbreviation: CI = Confidence Interval

tbl_model2_during

Model 2: Adjusted - During Pregnancy (Group + Characteristics)
Characteristic	Beta	95% CI	p-value
year	-53.4	-497, 390	0.8
group
Education	—	—
Maternal Age	-37,658	-42,978, -32,339	<0.001
Medicaid Status	-24,668	-29,987, -19,348	<0.001
Race/Ethnicity	-8,214	-13,533, -2,895	0.003
Region	6,909	-614, 14,432	0.072
Characteristics
20-24 years old	—	—
25-34 years old	26,945	21,625, 32,264	<0.001
35 years old or more	11,111	5,791, 16,430	<0.001
Asian, non-Hispanic	-31,521	-36,841, -26,202	<0.001
Black, non-Hispanic	-29,541	-34,860, -24,221	<0.001
High school graduate	-34,399	-39,719, -29,080	<0.001
Hispanic	-25,717	-31,037, -20,398	<0.001
Less than 20 years old	-4,609	-11,327, 2,109	0.2
Less than high school	-39,588	-44,907, -34,268	<0.001
More than high school
Not on Medicaid	19,463	14,144, 24,782	<0.001
NYC	-14,608	-23,821, -5,394	0.002
NYS excl NYC
On Medicaid
Other, non-Hispanic	-34,041	-40,239, -27,843	<0.001
White, non-Hispanic
Abbreviation: CI = Confidence Interval

Interpretation Unadjusted model – During Pregnancy

The coefficient for Year is -778 with a 95% confidence interval of -1,443 to -113 and a p-value of 0.022. This indicates a statistically significant negative trend in the estimated number of women receiving dental cleaning during pregnancy over time. On average, the estimated number decreased by 778 women per year between 2015 and 2022. The negative direction aligns with the coefficient, showing that rates of dental cleaning during pregnancy have declined over time in the unadjusted model.

Adjusted model-During Pregnancy

The coefficient for Year is -871 with a 95% confidence interval of -1,226 to -516 and a p-value < 0.001. This indicates a statistically significant negative trend in the estimated number of women receiving dental cleaning during pregnancy over time. On average, the estimated number decreased by 871 women per year between 2015 and 2022, after accounting for demographic factors.

Interpretation of covariates:

Maternal Age, Medicaid Status, Race/Ethnicity, and Education all show significant associations with the outcome. For example, older age groups (25–34, 35+) and women with higher education have different baseline estimates compared to their reference groups. Region is significant (NYC vs. NYS excluding NYC), showing geographic differences in dental cleaning rates. These adjustments account for differences in group composition over time, meaning the negative trend in Year reflects a true temporal decline rather than shifts in demographic distributions.

#Model diagnostic
# Diagnostic Plots: Before Pregnancy
autoplot(model2_before, 
         which = 1:4,       
         ncol = 2,          
         label.size = 3) +  
  ggtitle("Regression Diagnostics - Adjusted Model Before Pregnancy")

## Warning: `fortify(<lm>)` was deprecated in ggplot2 3.6.0.
## ℹ Please use `broom::augment(<lm>)` instead.
## ℹ The deprecated feature was likely used in the ggfortify package.
##   Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggfortify package.
##   Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggfortify package.
##   Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Diagnostic Plots: During Pregnancy
autoplot(model2_during, 
         which = 1:4,
         ncol = 2,
         label.size = 3) +
  ggtitle("Regression Diagnostics - Adjusted Model During Pregnancy")

Interpretation for before pregnancy 1.Residuals vs Fitted: Checking for Linearity The blue line is relatively flat and should stay at zero. In the above graph, most of the blue line is relatively flat suggesting the linearity. 2. Normal Q-Q: Checking for Normality The data points should follow the diagonal reference line. In the graph, most points lie close to the diagonal line, while a few points deviate above and below it. This suggests that the residuals are approximately normally distributed, with slight deviations indicating minor departures from normality. 3. Scale-Location Plot (Spread-Location) This plot assesses whether the variance of residuals is constant across fitted values; the smoother line should be approximately flat. In the graph above, the blue line shows some curvature rather than being perfectly flat, suggesting slight heteroscedasticity. The spread of residuals appears to increase, indicating that the variance is not entirely constant across the range of fitted values. 4. Residuals vs Leverage: Identifying Influential Outliers We look for points that lie beyond the dashed red lines, which represent Cook’s Distance. In this plot, three data points (318, 327, 329) appear to be farther from these lines, indicating few influential observations.

Interpretation for during pregnancy 1.Residuals vs Fitted: Checking for Linearity In the above graph, the line deviates from 0, this depicting non-linearity in the dataset. 2. Normal Q-Q: Checking for Normality In the graph, most points lie close to the diagonal line, while a few points deviate above and below it. This suggests that the residuals are approximately normally distributed, with slight deviations indicating minor departures from normality. 3. Scale-Location Plot (Spread-Location) In the graph above, the blue line shows some curvature rather than being perfectly flat, suggesting slight heteroscedasticity. The spread of residuals appears to increase, indicating that the variance is not entirely constant across the range of fitted values. 4. Residuals vs Leverage: Identifying Influential Outlier In this plot, three data points (31, 40, and 164) appear to be farther from these lines, indicating few influential observations.

#Data visualization
# Fitted values: before pregnancy
plot_data <- pregbefore_reg %>%
  mutate(fitted = fitted(model2_before))

# Plot
ggplot(plot_data, aes(x = year, y = Est_people)) +
  geom_point(alpha = 0.4) +
  geom_line(aes(y = fitted), linewidth = 1) +
  labs(
    title = "Adjusted Association Between Year and Dental Cleaning: Before Pregnancy",
    x = "Year",
    y = "Estimated Number of Women"
  ) +
  theme_minimal()

#Forest plot: before pregnancy
coef_data <- tidy(model2_before, conf.int = TRUE) %>%
  filter(term != "(Intercept)")

ggplot(coef_data, aes(x = estimate, y = reorder(term, estimate))) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  labs(
    title = "Adjusted Regression Coefficients: Before Pregnancy",
    x = "Beta Coefficient",
    y = "Predictors"
  ) +
  theme_minimal()

## Warning: `geom_errobarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `height` was translated to `width`.

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Data visualization
# Fitted values: during pregnancy
plot_data <- pregduring_reg %>%
  mutate(fitted = fitted(model2_during))

# Plot
ggplot(plot_data, aes(x = year, y = Est_people)) +
  geom_point(alpha = 0.4) +
  geom_line(aes(y = fitted), linewidth = 1) +
  labs(
    title = "Adjusted Association Between Year and Dental Cleaning: During Pregnancy",
    x = "Year",
    y = "Estimated Number of Women"
  ) +
  theme_minimal()

#Forest plot: before pregnancy
coef_data_du <- tidy(model2_during, conf.int = TRUE) %>%
  filter(term != "(Intercept)")

ggplot(coef_data_du, aes(x = estimate, y = reorder(term, estimate))) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  labs(
    title = "Adjusted Regression Coefficients: During Pregnancy",
    x = "Beta Coefficient",
    y = "Predictors"
  ) +
  theme_minimal()

## `height` was translated to `width`.

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

```

Final Project Proposal

Samriddhi Ranjan

R Markdown