---
title: "03_EDA_datasets_itu"
author: "Sergio Uribe"
date-modified: last-modified
format:
html:
toc: true
toc-expand: 3
code-fold: true
code-tools: true
editor: visual
execute:
echo: false
cache: false
warning: false
message: false
---
# Packages
```{r}
# Load required libraries with pacman; installs them if not already installed
pacman::p_load(tidyverse, # tools for data science
visdat, #NAs
janitor, # for data cleaning and tables
here, # for reproducible research
gtsummary, # for tables
maps,
patchwork,
viridis,
scales,
countrycode # to normalize country data
)
```
```{r}
theme_set(theme_minimal())
```
# Datasets
```{r}
df <- read.csv(here("data", "df.csv"))
```
```{r}
df_long <- read.csv(here("data", "df_long.csv"))
```
# Data cleaning
```{r}
# Only dataset with image number
# df |>
# filter(Is.the.Number.of.images.in.the.dataset.reported. == "Yes")
```
## How many datasets?
```{r}
dim(df)
```
## From which databases?
```{r}
df |>
tabyl(Database) |>
adorn_pct_formatting() |>
arrange(desc(n)) |>
knitr::kable()
```
```{r}
df |>
tabyl(Database) |>
ggplot(aes(x = fct_reorder(Database, n ) ,
y = n)) +
geom_col() +
coord_flip() +
labs(title = "Datasets by Database",
x = "")
```
## Year of dataset publication
```{r}
df |>
tabyl(Year.of.dataset.publication) |>
adorn_pct_formatting() |>
knitr::kable()
```
```{r}
df |>
ggplot(aes(x = Year.of.dataset.publication)) +
geom_bar() +
labs(title = "Year of dataset publication",
x = "Year",
y = "n")
```
## Associated with publication?
```{r}
df |>
tabyl(Paper.associated) |>
adorn_pct_formatting() |>
knitr::kable()
```
## Areas of research
```{r}
df |>
pivot_longer(cols = starts_with("Areas"),
names_to = "Area",
values_to = "Value") |>
separate_rows(Value, sep = ",") |>
filter(!is.na(Value)) |>
select(Value) |>
tabyl(Value) |>
arrange(desc(n)) |>
knitr::kable()
```
```{r}
df |>
pivot_longer(cols = starts_with("Areas"),
names_to = "Area",
values_to = "Value") |>
separate_rows(Value, sep = ",") |>
filter(!is.na(Value)) |>
select(Value) |>
tabyl(Value) |>
ggplot(aes(x = fct_reorder(Value, n),
y = n)) +
geom_col() +
theme_minimal() +
labs(title = "Distribution of Research Areas",
x = "Research Area",
y = "Count",
caption = "Each dataset can be in more than one area") +
coord_flip() +
scale_y_continuous(breaks= pretty_breaks())
```
## Imaging modality
```{r}
df |>
pivot_longer(cols = starts_with("Imaging"),
names_to = "Imaging",
values_to = "Value") |>
separate_rows(Value, sep = ",") |>
filter(!is.na(Value)) |>
select(Value) |>
mutate(Value = str_trim(Value, side = c( "both")) ) |>
tabyl(Value) |>
adorn_pct_formatting() |>
arrange(desc(n)) |>
knitr::kable()
```
```{r}
df |>
pivot_longer(cols = starts_with("Imaging"),
names_to = "Imaging",
values_to = "Value") |>
separate_rows(Value, sep = ",") |>
filter(!is.na(Value)) |>
select(Value) |>
mutate(Value = str_trim(Value, side = c( "both")) ) |>
tabyl(Value) |>
ggplot(aes(x = fct_reorder(Value, n),
y = n)) +
geom_col() +
theme_minimal() +
labs(title = "Distribution of Dataset by Imaging Modalities",
x = "Imaging Modality",
y = "Count",
caption = "Each dataset can be in more than one area") +
coord_flip() +
scale_y_continuous(breaks= pretty_breaks())
```
## Images amount analysis
```{r}
df |>
select(Response.ID,
Images...Panoramic:Images...Other) |>
pivot_longer(-Response.ID) |>
group_by(name) |>
filter(!is.na(value)) |>
summarise(sum = sum(value), median = median(value), average = mean(value), sd = sd(value)) |>
mutate(across(where(is.numeric), round, 1)) |>
knitr::kable()
```
```{r}
df |>
select(Response.ID,
Images...Panoramic:Images...Other) |>
pivot_longer(-Response.ID) |>
mutate(name = str_replace_all(name, "Images...", "")) |>
ggplot(aes(x = name,
y = value)) +
geom_col()
```
## Patients per dataset and country
```{r}
df |>
separate_rows(Imaging.modality..multiple.choices., sep = ", ") |>
group_by(Imaging.modality..multiple.choices.) |>
filter(!is.na(Number.of.patients.in.the.dataset)) |>
summarise(n = n(),
mean_patients = mean(Number.of.patients.in.the.dataset, na.rm = T),
sd_patients = sd(Number.of.patients.in.the.dataset, na.rm = T))|>
knitr::kable()
```
## MAP
### Distribution by country
How many countries?
```{r}
df |>
select(Response.ID, Number.of.images.in.the.dataset, country_1, country_2, country_3) |>
pivot_longer(-c(Response.ID, Number.of.images.in.the.dataset)) |>
filter(!is.na(value)) |>
tabyl(value) |>
arrange(desc(n)) |>
adorn_pct_formatting()|>
knitr::kable()
```
```{r}
df |>
pivot_longer(cols = starts_with("country")) |>
filter(!is.na(value)) |>
tabyl(value) |>
adorn_pct_formatting() |>
arrange(desc(n)) |>
rename(Country = value) |>
mutate(Country = countrycode(Country, "iso3c", "country.name")) |>
knitr::kable()
```
```{r}
countries <- df |>
pivot_longer(cols = starts_with("country")) |>
filter(!is.na(value)) |>
mutate(value = countrycode(value, "iso3c", "country.name")) |>
tabyl(value) |>
adorn_pct_formatting() |>
arrange(desc(n)) |>
rename(country = value)
```
```{r}
# Load world map data
world_map <- map_data("world")
world_map <- world_map |>
mutate(region = countrycode(region, "country.name", "country.name"))
```
```{r}
merged_data <- world_map |>
left_join(countries, by = c("region" = "country"))
```
```{r}
merged_data |>
ggplot() +
geom_polygon(aes(
x = long,
y = lat,
group = group,
fill = n
), color = "Grey 80") +
scale_fill_viridis_c(option = "plasma",
na.value = "Grey 97",
direction = -1) +
# coord_sf(crs= "+proj=cea +lon_0=0 +x_0=0 +y_0=0 +lat_ts=45 +ellps=WGS84 +datum=WGS84 +units=m +no_defs") +
theme_minimal() +
labs(fill = "N", title = "A. Dataset Distribution by Country",
caption = "Some datasets are associated with multiple countries")
```
```{r}
map_by_dataset <- merged_data |>
ggplot() +
geom_polygon(aes(
x = long,
y = lat,
group = group,
fill = n
), color = "Grey 80") +
scale_fill_viridis_c(option = "plasma",
na.value = "Grey 97",
direction = -1) +
theme_minimal() +
labs(fill = "N", title = "A. Dataset Distribution by Country",
caption = "Some datasets are associated with multiple countries")
```
### By numbers of images
```{r}
df |>
select(starts_with("country"), Number.of.images.in.the.dataset) |>
pivot_longer(-Number.of.images.in.the.dataset) |>
filter(!is.na(value)) |>
rename(n = Number.of.images.in.the.dataset) |>
select(-name) |>
mutate(country = countrycode(value, "iso3c", "country.name")) |>
select(-value) |>
group_by(country) |>
summarise(n = sum(n)) |>
arrange(desc(n))|>
knitr::kable()
```
```{r}
countries <- df |>
select(starts_with("country"), Number.of.images.in.the.dataset) |>
pivot_longer(-Number.of.images.in.the.dataset) |>
filter(!is.na(value)) |>
rename(n = Number.of.images.in.the.dataset) |>
select(-name) |>
mutate(country = countrycode(value, "iso3c", "country.name")) |>
select(-value) |>
group_by(country) |>
summarise(n = sum(n)) |>
arrange(desc(n))
```
```{r}
merged_data <- world_map |>
left_join(countries, by = c("region" = "country"))
```
```{r}
merged_data |>
ggplot() +
geom_polygon(aes(
x = long,
y = lat,
group = group,
fill = n
), color = "Grey 80") +
scale_fill_viridis_c(option = "plasma",
na.value = "Grey 97",
direction = -1) +
theme_minimal() +
labs(fill = "N", title = "B. Image Count per Country",
caption = "Some datasets are associated with multiple countries")
```
```{r}
map_by_images <- merged_data |>
ggplot() +
geom_polygon(aes(
x = long,
y = lat,
group = group,
fill = n
), color = "Grey 80") +
scale_fill_viridis_c(option = "plasma",
na.value = "Grey 97",
direction = -1) +
theme_minimal() +
labs(fill = "N", title = "B. Image Count per Country",
caption = "Some datasets are associated with multiple countries")
```
```{r}
map_by_dataset / map_by_images
```
```{r}
ggsave(here("figures", "Fig2_map.pdf"), dpi = 300, height = 30, width = 25, units = c("cm"))
```
```{r}
rm(merged_data, world_map, countries, map_by_images, map_by_dataset)
```
## Number of images per repository source
```{r}
df |>
group_by(Database) |>
summarise(n = n(), images = sum(Number.of.images.in.the.dataset), sd_images = sd(Number.of.images.in.the.dataset)) |>
arrange(desc(n)) |>
mutate("img_per_dataset" = images / n) |>
mutate(across(where(is.numeric), round, 1)) |>
knitr::kable()
```
## Table of images, datasets per imaging modality
```{r}
df |>
select(Response.ID, contains("Images...")) |>
pivot_longer(-Response.ID) |>
filter(!is.na(value)) |>
group_by(name) |>
summarise(n = n(), sum_images = sum(value), sd_images = sd(value)) |>
mutate(across(where(is.numeric), round, 1)) |>
knitr::kable()
```
## Metadata analysis
```{r}
df |>
# select relevant yes no columns
select(Response.ID,
Paper.associated,
reporting...Ethical.approval.for.dataset.publication.:reporting...Image.acquisition.device..e.g..Sirona..Germany..,
reporting...Image.processing., reporting...Gender.ratio..males.females..,
reporting...Ethnicity. , Does.the.dataset.include.annotations.,
Is.the.calibration.of.training.of.the.annotators.described. : Is.the.Number.of.patients.in.the.dataset.reported.) |>
# remove unwanted columns
select(-Annotation.Software,
How.was.the.ground.truth...gold.standard.established.in.the.study.) |>
# relevel if the ground truth was annotated
mutate(How.was.the.ground.truth...gold.standard.established.in.the.study. = if_else(
How.was.the.ground.truth...gold.standard.established.in.the.study. == "Not described",
"No",
"Yes")) |>
pivot_longer(-Response.ID) |>
filter(!is.na(value)) |>
mutate(value = fct_collapse(value, "No" = c("Not specified", "Not sure"))) |>
mutate(name = recode(name,
"reporting...Ground.truth.or.gold.standard.method.described." = "Ground truth method explanation",
"Does.the.dataset.include.annotations." = "Contain annotations",
"How.was.the.ground.truth...gold.standard.established.in.the.study." = "Ground truth definition",
"Is.the.Number.of.patients.in.the.dataset.reported." = "Patients number",
"Is.the.calibration.of.training.of.the.annotators.described." = "Annotators calibration",
"Paper.associated" = "Associated to paper",
"annotators..Is.any.metric.related.to.the.calibration.of.annotators.reported..kappa..ICC..etc..." = "Calibration metric reported",
"annotators..Is.described.the.calibration.or.training.of.the.annotators.." = "Annotators training",
"annotators..Is.the.age.of.annotators.reported.." = "Annotator age reporting",
"annotators..Is.the.experience.or.qualifications.of.the.annotators.described.." = "Annotators experience reported",
"annotators..Is.the.reporting.of.mechanisms.strategies.to.deal.with.disagreements.included.in.the.study.." = "Annotator dispute handling",
"annotators..Is.the.software.used.for.annotations.described.in.the.study.." = "Annotation tool reported",
"reporting...Anonymisation.strategy." = "Anonymization strategy",
"reporting...Ethical.approval.for.dataset.publication." = "Ethical approval",
"reporting...Ethnicity." = "Patient ethnicity",
"reporting...Gender.ratio..males.females.." = "Patient sex distribution",
"reporting...Image.acquisition.device..e.g..Sirona..Germany.." = "Equipment used",
"reporting...Image.processing." = "Image processing",
"reporting...Image.processing.or.adjustment." = "Type of image processing",
"reporting...Inclusion.or.exclusion.criteria.stated." = "Patient inclusion/exclusion criteria",
"reporting...Lesion.feature.or.image.size.annotations." = "Lesion segmentation",
"reporting...Participant.consent." = "Patient consent",
"reporting...Segmentations." = "Anatomic segmentation"
)) |>
# calculate the % yes
group_by(name) |>
summarize(
`Percentage Yes` = mean(value == "Yes") * 100,
.groups = 'drop'
) |>
mutate(across(where(is.numeric), round, 1)) |>
arrange(desc(`Percentage Yes`)) |>
knitr::kable()
```
Colors for yes/no
```{r}
# Define colors with distinct luminance values
# colors <- c("No" = "#F8766D", "Yes" = "#00BFC4") # Base
colors <- c("No" = "#F8766D", "Yes" = "#009498")
```
```{r}
df |>
# select relevant yes no columns
select(Response.ID,
Paper.associated,
reporting...Ethical.approval.for.dataset.publication.:reporting...Image.acquisition.device..e.g..Sirona..Germany..,
reporting...Image.processing., reporting...Gender.ratio..males.females..,
reporting...Ethnicity. , Does.the.dataset.include.annotations.,
Is.the.calibration.of.training.of.the.annotators.described. : Is.the.Number.of.patients.in.the.dataset.reported.) |>
# remove unwanted columns
select(-Annotation.Software,
How.was.the.ground.truth...gold.standard.established.in.the.study.) |>
# relevel if the ground truth was annotated
mutate(How.was.the.ground.truth...gold.standard.established.in.the.study. = if_else(
How.was.the.ground.truth...gold.standard.established.in.the.study. == "Not described",
"No",
"Yes")) |>
pivot_longer(-Response.ID) |>
filter(!is.na(value)) |>
mutate(value = fct_collapse(value, "No" = c("Not specified", "Not sure"))) |>
mutate(name = recode(name,
"reporting...Ground.truth.or.gold.standard.method.described." = "Ground truth method explanation",
"Does.the.dataset.include.annotations." = "Contain annotations",
"How.was.the.ground.truth...gold.standard.established.in.the.study." = "Ground truth definition",
"Is.the.Number.of.patients.in.the.dataset.reported." = "Patients number",
"Is.the.calibration.of.training.of.the.annotators.described." = "Annotators calibration",
"Paper.associated" = "Associated to paper",
"annotators..Is.any.metric.related.to.the.calibration.of.annotators.reported..kappa..ICC..etc..." = "Calibration metric reported",
"annotators..Is.described.the.calibration.or.training.of.the.annotators.." = "Annotators training",
"annotators..Is.the.age.of.annotators.reported.." = "Annotator age reporting",
"annotators..Is.the.experience.or.qualifications.of.the.annotators.described.." = "Annotators experience reported",
"annotators..Is.the.reporting.of.mechanisms.strategies.to.deal.with.disagreements.included.in.the.study.." = "Annotator dispute handling",
"annotators..Is.the.software.used.for.annotations.described.in.the.study.." = "Annotation tool reported",
"reporting...Anonymisation.strategy." = "Anonymization strategy",
"reporting...Ethical.approval.for.dataset.publication." = "Ethical approval",
"reporting...Ethnicity." = "Patient ethnicity",
"reporting...Gender.ratio..males.females.." = "Patient sex distribution",
"reporting...Image.acquisition.device..e.g..Sirona..Germany.." = "Equipment used",
"reporting...Image.processing." = "Image processing",
"reporting...Image.processing.or.adjustment." = "Type of image processing",
"reporting...Inclusion.or.exclusion.criteria.stated." = "Patient inclusion/exclusion criteria",
"reporting...Lesion.feature.or.image.size.annotations." = "Lesion segmentation",
"reporting...Participant.consent." = "Patient consent",
"reporting...Segmentations." = "Anatomic segmentation"
)) |>
ggplot(aes(
x = fct_reorder(name, value, .fun = function(x) mean(x == "Yes") ),
fill = value
)) +
geom_bar(position = "fill", aes(y = ..prop.., group = value)) +
scale_y_continuous(labels = scales::percent_format()) +
labs(x = "Question",
y = "Percentage",
fill = "Answer",
title = "Metadata Completeness of Dental Imaging Datasets for AI") +
theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_hline(yintercept = 0.25, linetype = "dashed", color = "lightgrey") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "lightgrey") +
geom_hline(yintercept = 0.75, linetype = "dashed", color = "lightgrey") +
coord_flip() +
scale_fill_manual(values = colors)
# scale_fill_viridis_d(option = "viridis", direction = -1)
# scale_fill_grey(start = 0.8, end = 0.2)
```
```{r}
ggsave(here("figures", "Fig3_metadata.pdf"), dpi = 300, width = 18, height = 15, units = c("cm"))
```
## FAIR ANALYSIS
```{r}
df |>
select(Response.ID,
FAIRness:Reusable.max.10) |>
mutate("Findable" = Findable.max.7 / 7 * 100,
"Accesible" = Accessible.max.3 / 3 * 100,
"Interoperable" = Interoperable.max.4 / 4 * 100,
"Reusable" = Reusable.max.10 / 10 * 100) |>
select(Response.ID, FAIRness.level, Findable:Reusable) |>
pivot_longer(-c(Response.ID, FAIRness.level)) |>
mutate(name = fct_relevel(name,
"Findable",
"Accesible",
"Interoperable",
"Reusable")) |>
group_by(name) |>
summarise(n = n(), mean = mean(value), sd = sd(value)) |>
mutate(across(where(is.numeric), round, 1)) |>
knitr::kable()
```
```{r}
df |>
select(Response.ID,
FAIRness:Reusable.max.10) |>
mutate("Findable" = Findable.max.7 / 7 * 100,
"Accesible" = Accessible.max.3 / 3 * 100,
"Interoperable" = Interoperable.max.4 / 4 * 100,
"Reusable" = Reusable.max.10 / 10 * 100) |>
select(Response.ID, FAIRness.level, Findable:Reusable) |>
pivot_longer(-c(Response.ID, FAIRness.level)) |>
mutate(name = fct_relevel(name,
"Findable",
"Accesible",
"Interoperable",
"Reusable")) |>
group_by(name) |>
ggplot(aes(x = name,
y = value)) +
geom_boxplot(width = .3, alpha = .9) +
labs(title = "FAIRness of the datasets",
x = "",
y = "Percentage")
```
# Final Table
```{r}
df_short <- df |>
select(Imaging = Imaging.modality..multiple.choices.,
Images = Number.of.images.in.the.dataset,
Patients = Number.of.patients.in.the.dataset,
country_1,
country_2,
country_3,
Response.ID,
FAIRness,
FAIRness.level,
Findable.max.7,
Accessible.max.3,
Interoperable.max.4,
Reusable.max.10) |>
# separate the imaging modality
separate_rows(Imaging, sep = ", ") |>
mutate(Imaging = fct_collapse(Imaging,
"Intraoral 3D Scans or images" = c("Intra-oral 3D scans", "Intraoral photograph"))) |>
# now the country
pivot_longer(
cols = starts_with("country"),
names_to = "country_number",
values_to = "country"
) |>
# filter the full cells of country
filter(!is.na(country)) |>
select(-country_number)
```
```{r}
# df_short |>
# group_by(Response.ID) |>
# summarise(
# Total_Images = sum(Images, na.rm = TRUE),
# Total_Patients = sum(Patients, na.rm = TRUE),
# Mean_FAIRness = mean(FAIRness, na.rm = TRUE),
# Mean_FAIRness_Level = mean(FAIRness.level, na.rm = TRUE),
# Mean_Findable = mean(Findable.max.7, na.rm = TRUE),
# Mean_Accessible = mean(Accessible.max.3, na.rm = TRUE),
# Mean_Interoperable = mean(Interoperable.max.4, na.rm = TRUE),
# Mean_Reusable = mean(Reusable.max.10, na.rm = TRUE)
# )
```
```{r}
df_short |>
# group_by(Imaging, Response.ID) |>
group_by(Imaging) |>
summarise(n = n(),
images = sum(Images)) |>
knitr::kable()
```
```{r}
df_short |>
# group_by(Imaging, Response.ID) |>
group_by(Imaging) |>
summarise(n = n()) |>
knitr::kable()
```
```{r}
df |>
select(Images...Panoramic:Images...CBCT, Response.ID) |>
pivot_longer(-Response.ID) |>
filter(!is.na(value)) |>
group_by(name) |>
summarise(n = n(),
sum = sum(value),
mean = mean(value),
sd = sd(value)) |>
mutate(across(where(is.numeric), round, 1)) |>
knitr::kable()
```
```{r}
df_short |>
tabyl(Imaging, country) |>
knitr::kable()
```
```{r}
df_short |>
select(Imaging, FAIRness, Findable.max.7:Reusable.max.10) |>
pivot_longer(-Imaging) |>
mutate(Imaging = fct_collapse(Imaging,
"Other" = c("Cephalometric radiographs", "Intraoral 3D Scans or images" ))) |>
group_by(name, Imaging) |>
summarise(# n = n(),
# sum = sum(value),
mean = mean(value) ) |>
# sd = sd(value)) |>
mutate(name = fct_relevel(name,
"Findable.max.7",
"Accessible.max.3",
"Interoperable.max.4",
"Reusable.max.10")) |>
pivot_wider(names_from = name,
values_from = mean) |>
mutate(across(where(is.numeric), round, 1)) |>
knitr::kable()
```