Packages

Show the code
pacman::p_load(tidyverse, 
               here, 
               kableExtra, 
               viridis, 
               DescTools, 
               gtsummary)
Show the code
theme_set(theme_minimal())

Dataset

Show the code
# glimpse(df)

Prepare for abstract

Show the code
df <- df |> 
  mutate(Gender = recode(Gender, 
                         "Meitene" = "Female", 
                         "Zēns" = "Male")) |> 
  mutate(`Place of living` = recode(`Place of living`, 
                                    "Daugavpils, Jēkabpils, Jelgava, Jūrmala, Liepāja, Rēzekne, Valmiera, Ventspils" = "City", 
                                    " Lauki" = "Rural", 
                                    "Cita pilsēta" = "Other City"
                                    ))

Filter the 12 and 15 years old

Show the code
df <- df |> 
  filter(Age %in% (c("12", "15"))) 

Create a unique ID for each child

Show the code
df |> 
  mutate(`Date of birth` = as.Date(`Date of birth`, format = "%m/%d/%Y")) |> 
  # mutate(`Date of birth` = str_replace_all(`Date of birth`, "/", ""))
  unite(ChildID, `School code`, `Child code`, `Date of birth`, sep = "_") |> 
  select(ChildID) |> 
  group_by(ChildID) |> 
  count() |> 
  filter(n > 1)|> 
  kbl() |> 
  kable_styling()
ChildID n
25_a12_2007-01-27 2
26_17d_2010-12-03 2
38_9b10_2007-06-16 2
41_9c6_2007-10-11 2
63_9c20_2007-09-14 2
68_6b6_2010-06-23 2
73_8a9_2008-02-06 2

Task

Results: prevalencia de caries en niveles d1, d3 y d5 y DMFT, indicando diferencias por genero, region, place of living, lenguaje, para todos, 12 y 15 años

Table 1

Show the code
df |> 
  group_by(Gender)  |> 
  summarise(AgeMean = mean(Age), sdAge = sd(Age))  |> 
  mutate(across(where(is.numeric), round, 1)) |> 
  kbl() |> 
  kable_styling()
Gender AgeMean sdAge
Female 13.5 1.5
Male 13.5 1.5
Show the code
df |> 
  select(Gender, 
         Age, 
         Region, 
         `Place of living`, 
         D1MFT, 
         D3MFT, 
         D5MFT) |> 
  gtsummary::tbl_summary(by = "Gender")
Characteristic Female, N = 1,5061 Male, N = 1,6231
Age
    12 757 (50%) 800 (49%)
    15 749 (50%) 823 (51%)
Region
    Kurzeme 247 (16%) 304 (19%)
    Latgale 149 (9.9%) 163 (10%)
    Pierīga 205 (14%) 241 (15%)
    Rīga 436 (29%) 419 (26%)
    Vidzeme 259 (17%) 267 (16%)
    Zemgale 210 (14%) 229 (14%)
Place of living
    City 345 (23%) 369 (23%)
    Lauki 218 (14%) 237 (15%)
    Other City 388 (26%) 427 (26%)
    Pierīga 179 (12%) 213 (13%)
    Rīga 376 (25%) 377 (23%)
D1MFT 6.0 (3.0, 10.0) 6.0 (2.0, 10.0)
D3MFT 3.0 (1.0, 5.0) 2.0 (0.0, 5.0)
D5MFT 2.00 (0.00, 4.00) 1.00 (0.00, 4.00)
1 n (%); Median (IQR)
Show the code
df |> 
  select(Gender, 
         Age, 
         Region, 
         `Place of living`, 
         D1MFT, 
         D3MFT, 
         D5MFT) |> 
  gtsummary::tbl_summary(by = "Gender")
Characteristic Female, N = 1,5061 Male, N = 1,6231
Age
    12 757 (50%) 800 (49%)
    15 749 (50%) 823 (51%)
Region
    Kurzeme 247 (16%) 304 (19%)
    Latgale 149 (9.9%) 163 (10%)
    Pierīga 205 (14%) 241 (15%)
    Rīga 436 (29%) 419 (26%)
    Vidzeme 259 (17%) 267 (16%)
    Zemgale 210 (14%) 229 (14%)
Place of living
    City 345 (23%) 369 (23%)
    Lauki 218 (14%) 237 (15%)
    Other City 388 (26%) 427 (26%)
    Pierīga 179 (12%) 213 (13%)
    Rīga 376 (25%) 377 (23%)
D1MFT 6.0 (3.0, 10.0) 6.0 (2.0, 10.0)
D3MFT 3.0 (1.0, 5.0) 2.0 (0.0, 5.0)
D5MFT 2.00 (0.00, 4.00) 1.00 (0.00, 4.00)
1 n (%); Median (IQR)

Comparison D1, D3, D5, MFT

Show the code
df |>
  select(`Place of living`,
         D1MFT,
         D3MFT,
         D5MFT) |>
  pivot_longer(-`Place of living`,
               names_to = "Index",
               values_to = "value")  |>
  ggplot(aes(x =  `Place of living`,
             y = value,
             fill = Index)) +
  scale_fill_viridis_d(begin = 0.4) +  # Use viridis color palette
  geom_boxplot(outlier.color = 'grey50', outlier.alpha = 0.2) + 
  labs(title = "Caries severity by place of living", 
       fill = "Index", 
       y = "Index value", 
       x = "") + 
  theme(legend.position = "top")

Model D1MFT by Place of living

Show the code
df %>%
  mutate(`Place of living` = fct_relevel(`Place of living`, "Rīga")) %>%
  glm(D1MFT ~ `Place of living`, data = .) %>%
  tbl_regression() |> 
  modify_caption("**D1MFT** (N = {N})")
D1MFT (N = 3129)
Characteristic Beta 95% CI1 p-value
Place of living
    Rīga
    City 2.5 2.0, 3.0 <0.001
    Lauki 2.8 2.2, 3.4 <0.001
    Other City 1.6 1.1, 2.1 <0.001
    Pierīga 0.23 -0.41, 0.87 0.5
1 CI = Confidence Interval

Model D3MFT by Place of living

Show the code
df %>%
  mutate(`Place of living` = fct_relevel(`Place of living`, "Rīga")) %>%
  glm(D3MFT ~ `Place of living`, data = .) %>%
  tbl_regression() |> 
  modify_caption("**D3MFT** (N = {N})")
D3MFT (N = 3129)
Characteristic Beta 95% CI1 p-value
Place of living
    Rīga
    City 1.2 0.83, 1.5 <0.001
    Lauki 1.3 0.91, 1.7 <0.001
    Other City 0.50 0.16, 0.83 0.004
    Pierīga 0.05 -0.37, 0.46 0.8
1 CI = Confidence Interval

Model D5MFT by Place of living

Show the code
df %>%
  mutate(`Place of living` = fct_relevel(`Place of living`, "Rīga")) %>%
  glm(D5MFT ~ `Place of living`, data = .) %>%
  tbl_regression() |> 
  modify_caption("**D5MFT** (N = {N})")
D5MFT (N = 3129)
Characteristic Beta 95% CI1 p-value
Place of living
    Rīga
    City 0.66 0.37, 0.95 <0.001
    Lauki 0.71 0.38, 1.0 <0.001
    Other City 0.19 -0.09, 0.48 0.2
    Pierīga -0.09 -0.44, 0.27 0.6
1 CI = Confidence Interval
Show the code
df |> 
  select( `Place of living`, 
    D1MFT, 
         D3MFT, 
         D5MFT) |> 
  gtsummary::tbl_summary(by = `Place of living`) |> 
  add_ci() |> 
  add_p()
✖ `add_ci()` added mean CI for "D1MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D1MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D1MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D1MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D1MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D3MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D3MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D3MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D3MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D3MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D5MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D5MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D5MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D5MFT"; however, no mean is shown in the `tbl_summary()` table.
✖ `add_ci()` added mean CI for "D5MFT"; however, no mean is shown in the `tbl_summary()` table.
Characteristic City, N = 7141 95% CI2 Lauki, N = 4551 95% CI2 Other City, N = 8151 95% CI2 Pierīga, N = 3921 95% CI2 Rīga, N = 7531 95% CI2 p-value3
D1MFT 7.0 (4.0, 11.0) 7.4, 8.3 8.0 (4.0, 11.0) 7.7, 8.6 6.0 (3.0, 10.0) 6.6, 7.4 4.0 (2.0, 9.0) 5.1, 6.1 4.0 (2.0, 8.0) 5.0, 5.7 <0.001
D3MFT 3.0 (1.0, 6.0) 3.7, 4.2 3.0 (1.0, 6.0) 3.7, 4.4 3.0 (0.0, 5.0) 3.0, 3.5 2.0 (0.0, 4.0) 2.5, 3.1 2.0 (0.0, 4.0) 2.5, 3.0 <0.001
D5MFT 2.00 (0.00, 4.00) 2.6, 3.1 2.00 (0.00, 4.00) 2.6, 3.2 2.00 (0.00, 4.00) 2.2, 2.6 1.00 (0.00, 3.00) 1.8, 2.4 1.00 (0.00, 3.00) 2.0, 2.4 <0.001
1 Median (IQR)
2 CI = Confidence Interval
3 Kruskal-Wallis rank sum test
Show the code
df |> 
  select(
    D1MFT, 
         D3MFT, 
         D5MFT) |> 
  gtsummary::tbl_summary() |> 
  add_ci()
Characteristic N = 3,1291 95% CI2
D1MFT 6.0 (3.0, 10.0) 6.6, 7.0
D3MFT 3.0 (0.0, 5.0) 3.2, 3.5
D5MFT 2.00 (0.00, 4.00) 2.4, 2.6
1 Median (IQR)
2 CI = Confidence Interval

Visualizations

Prevalence

Prevalence by Age

Table prevalence by age

Show the code
df %>% 
  group_by(Age) |> 
  summarise(
    D1MFT = (1 - mean(D1MFT == 0, na.rm = TRUE)) *100,
    D3MFT = (1 - mean(D3MFT == 0, na.rm = TRUE))*100,
    D5MFT = (1 - mean(D5MFT == 0, na.rm = TRUE)) *100
  ) |> 
  mutate(across(everything(), ~ round(., 1)))
# A tibble: 2 × 4
    Age D1MFT D3MFT D5MFT
  <dbl> <dbl> <dbl> <dbl>
1    12  86.1  66.3  55.6
2    15  93.3  83.5  76.3

Figure prevalence by age

Show the code
df %>%
  group_by(Age) %>%
  summarise(
    "D1MFT" = (1 - mean(D1MFT == 0, na.rm = TRUE)) * 100,
    "D3MFT" = (1 - mean(D3MFT == 0, na.rm = TRUE)) * 100,
    "D5MFT" = (1 - mean(D5MFT == 0, na.rm = TRUE)) * 100
  ) %>%
  pivot_longer("D1MFT":"D5MFT",
               names_to = "index",
               values_to = "value") %>%
  mutate(se = sd(value) / sqrt(n())) %>%
  ggplot(aes(x = index,
             y = value,
             fill = as.factor(Age))) +
  geom_col(position = "dodge") +
  geom_errorbar(aes(ymax = pmin(value + 1.96 * se, 100), ymin = value - 1.96 * se),
                position = position_dodge(width = 0.9),
                width = 0.25, color = "grey30") +
  scale_fill_viridis_d(begin = .4,  end = 0.9, direction = -1) +  # Use viridis color palette
  labs(title = "Caries prevalence by Age",
       caption = "Bar shows 95% CI", 
       x = "Index",
       y = "%",
       fill = "Age") + 
  coord_cartesian(ylim = c(0, 100))

Prevalence by area

Show the code
df %>% 
  group_by(Region) |> 
  summarise(
    D1MFT = (1 - mean(D1MFT == 0, na.rm = TRUE)) *100,
    D3MFT = (1 - mean(D3MFT == 0, na.rm = TRUE))*100,
    D5MFT = (1 - mean(D5MFT == 0, na.rm = TRUE)) *100
  ) |> 
  mutate(across(where(is.numeric), ~ round(., 1)))
# A tibble: 6 × 4
  Region  D1MFT D3MFT D5MFT
  <chr>   <dbl> <dbl> <dbl>
1 Kurzeme  87.1  70.2  61.5
2 Latgale  94.6  85.3  67.9
3 Pierīga  87.7  73.1  67.5
4 Rīga     86.4  70.1  60.5
5 Vidzeme  92.4  77.4  72.6
6 Zemgale  94.8  82    71.5
Show the code
df %>%
  group_by(Region) %>%
  summarise(
    "D1MFT" = (1 - mean(D1MFT == 0, na.rm = TRUE)) * 100,
    "D3MFT" = (1 - mean(D3MFT == 0, na.rm = TRUE)) * 100,
    "D5MFT" = (1 - mean(D5MFT == 0, na.rm = TRUE)) * 100
  ) %>%
  pivot_longer("D1MFT":"D5MFT",
               names_to = "index",
               values_to = "value") %>%
  mutate(se = sd(value) / sqrt(n())) %>%
  ggplot(aes(x = index,
             y = value,
             fill = as.factor(Region))) +
  geom_col(position = "dodge") +
  geom_errorbar(aes(ymax = pmin(value + 1.96 * se, 100), ymin = value - 1.96 * se),
                position = position_dodge(width = 0.9),
                width = 0.25, color = "grey30") +
  scale_fill_viridis_d(begin = .1) +  # Use viridis color palette
  labs(title = "Caries prevalence by Region",
       caption = "Bar shows 95% CI", 
       x = "Index",
       y = "%",
       fill = "Region") + 
  coord_cartesian(ylim = c(0, 100)) 

Show the code
df %>%
  group_by(Region) %>%
  summarise(
    "D1MFT" = (1 - mean(D1MFT == 0, na.rm = TRUE)) * 100,
    "D3MFT" = (1 - mean(D3MFT == 0, na.rm = TRUE)) * 100,
    "D5MFT" = (1 - mean(D5MFT == 0, na.rm = TRUE)) * 100
  ) %>%
  pivot_longer("D1MFT":"D5MFT",
               names_to = "index",
               values_to = "value") %>%
  mutate(se = sd(value) / sqrt(n())) %>%
  ggplot(aes(x = Region,
             y = value,
             fill = index)) +
  geom_col(position = "dodge") +
  geom_errorbar(aes(ymax = pmin(value + 1.96 * se, 100), ymin = value - 1.96 * se),
                position = position_dodge(width = 0.9),
                width = 0.25, color = "grey30") +
  scale_fill_viridis_d(begin = .4) +  # Use viridis color palette
  labs(title = "Caries prevalence by Region",
       caption = "Bar shows 95% CI", 
       x = "Region",
       y = "%",
       fill = "Index") + 
  coord_cartesian(ylim = c(0, 100))

Severity

Severity table by age

Show the code
df |> 
  group_by(Age) |> 
  summarise(D1_mean = mean(D1MFT), D1_sd = sd(D1MFT), 
            D3_mean = mean(D3MFT), D3_sd = sd(D3MFT), 
            D5_mean = mean(D5MFT), D5_sd = sd(D5MFT)) |> 
  mutate(across(everything(), ~ round(., 1)))
# A tibble: 2 × 7
    Age D1_mean D1_sd D3_mean D3_sd D5_mean D5_sd
  <dbl>   <dbl> <dbl>   <dbl> <dbl>   <dbl> <dbl>
1    12     5.3   4.6     2.3   2.5     1.6   2  
2    15     8.2   5.7     4.4   3.9     3.4   3.3

Severity figure by age and gender

Show the code
# df |> 
#   select(Age, Gender, D1MFT, D3MFT, D5MFT) |> 
#   pivot_longer(-c(Age, Gender), 
#                names_to = "Index", 
#                values_to = "values") |> 
#   ggplot(aes(x = as.factor(Age), 
#              y = values, 
#              fill = Index)) + 
#   # geom_violin() + 
#   # scale_y_log10() + 
#   geom_boxplot(width = .5, outlier.color = "grey50", outlier.alpha = 0.1) + 
#   scale_fill_viridis_d(begin = .4) +
#   facet_grid(Gender ~ Index ) + 
#   labs(title = "Caries severity by Age and Gender",
#        # caption = "Note: Log10 scale for index values", 
#        y = "Index value", 
#        x = "Age"     )
With the values
Show the code
# Your data manipulation code
df_plot <- df %>%
  group_by(Age) %>%
  summarise(D1_mean = mean(D1MFT), D1_sd = sd(D1MFT),
            D3_mean = mean(D3MFT), D3_sd = sd(D3MFT),
            D5_mean = mean(D5MFT), D5_sd = sd(D5MFT)) %>%
  mutate(across(everything(), ~ round(., 1)))

# Your plot code
df_plot <- df %>%
  select(Age, Gender, D1MFT, D3MFT, D5MFT) %>%
  pivot_longer(-c(Age, Gender),
               names_to = "Index",
               values_to = "values")

# Create the ggplot with median values
df_plot |>  
  ggplot(aes(x = as.factor(Age), y = values, fill = Index)) +
  geom_boxplot(
    width = 0.5,
    outlier.color = "grey30",
    outlier.alpha = 0.1
  ) +
  stat_summary(
    fun = "median",
    geom = "text",
    aes(label = sprintf("%.1f", ..y..)),
    position = position_dodge(width = 0.75),
    vjust = -0.5,
    size = 3,
    color = "black"
  ) +
  scale_fill_viridis_d(begin = 0.5) +
  facet_grid(Gender ~ Index) +
  labs(title = "Caries severity by Age and Gender",
       y = "Index value",
       x = "Age")
Warning: The dot-dot notation (`..y..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(y)` instead.

Show the code
rm(df_plot)

Severity by region

Severity by region table

Show the code
df |> 
  group_by(Region) |> 
  summarise(D1_mean = mean(D1MFT), D1_sd = sd(D1MFT), 
            D3_mean = mean(D3MFT), D3_sd = sd(D3MFT), 
            D5_mean = mean(D5MFT), D5_sd = sd(D5MFT)) |> 
    mutate(across(where(is.numeric), ~ round(., 1)))
# A tibble: 6 × 7
  Region  D1_mean D1_sd D3_mean D3_sd D5_mean D5_sd
  <chr>     <dbl> <dbl>   <dbl> <dbl>   <dbl> <dbl>
1 Kurzeme     5.5   4.4     3     3.3     2.3   2.8
2 Latgale     9.8   6       4.4   3.5     2.8   3  
3 Pierīga     6     5.2     3.2   3.4     2.5   3  
4 Rīga        5.4   4.7     2.7   3.1     2.2   2.8
5 Vidzeme     7.4   5.2     3.6   3.5     2.7   2.9
6 Zemgale     9.2   5.9     4.1   3.7     2.8   3  
Show the code
# Your data manipulation code
df_plot <-  df %>%
  group_by(Region) %>%
  summarise(D1_mean = mean(D1MFT), D1_sd = sd(D1MFT),
            D3_mean = mean(D3MFT), D3_sd = sd(D3MFT),
            D5_mean = mean(D5MFT), D5_sd = sd(D5MFT)) %>%
  mutate(across(where(is.numeric), ~ round(., 1)))

# Your plot code
df_plot <- df %>%
  select(Region, D1MFT, D3MFT, D5MFT) %>%
  pivot_longer(-c(Region),
               names_to = "Index",
               values_to = "values")

# Create the ggplot with median values
df_plot |>  
  ggplot(aes(x = as.factor(Region), y = values, fill = Index)) +
  geom_boxplot(
    width = 0.5,
    outlier.color = "grey30",
    outlier.alpha = 0.1
  ) +
  stat_summary(
    fun = "median",
    geom = "text",
    aes(label = sprintf("%.1f", ..y..)),
    position = position_dodge(width = 0.75),
    vjust = -0.5,
    size = 3,
    color = "black"
  ) +
  scale_fill_viridis_d(begin = 0.5) +
  facet_grid(. ~ Index) +
  labs(title = "Caries severity by Region ",
       y = "Index value",
       x = "Region") + 
  theme(legend.position = "top")

Show the code
rm(df_plot)

Differences in severity by region

Show the code
# set Riga as the comparison
df <- df |> 
  mutate(Region = fct_relevel(Region, c("Rīga")))  
Show the code
# Step 1: Fit the regression model
modelD1 <- glm(D1MFT ~ Region, data = df)

# Step 2: Summarize the regression results
modelD1 <- gtsummary::tbl_regression(modelD1) 
Show the code
modelD3 <- glm(D3MFT ~ Region, data = df)

# Step 2: Summarize the regression results
modelD3 <- gtsummary::tbl_regression(modelD3) 
Show the code
modelD5 <- glm(D5MFT ~ Region, data = df)

# Step 2: Summarize the regression results
modelD5 <- gtsummary::tbl_regression(modelD5)
Show the code
tbl_merge(
    tbls = list(modelD1, modelD3, modelD5),
    tab_spanner = c("**D1MFT**", "**D3MFT**", "**D5MFT**")
  )
Characteristic D1MFT D3MFT D5MFT
Beta 95% CI1 p-value Beta 95% CI1 p-value Beta 95% CI1 p-value
Region
    Rīga
    Kurzeme 0.11 -0.44, 0.66 0.7 0.29 -0.07, 0.65 0.12 0.12 -0.19, 0.43 0.4
    Latgale 4.4 3.8, 5.1 <0.001 1.6 1.2, 2.1 <0.001 0.66 0.28, 1.0 <0.001
    Pierīga 0.62 0.04, 1.2 0.037 0.48 0.09, 0.87 0.016 0.33 0.00, 0.66 0.047
    Vidzeme 2.0 1.4, 2.6 <0.001 0.83 0.47, 1.2 <0.001 0.59 0.27, 0.90 <0.001
    Zemgale 3.8 3.3, 4.4 <0.001 1.3 0.95, 1.7 <0.001 0.64 0.30, 0.97 <0.001
1 CI = Confidence Interval
Show the code
rm(modelD1, modelD3, modelD5)