# Load Libraries
library(DBI)
library(RMariaDB)
library(dplyr)
library(tidyverse)
library(ggplot2)
FDA-Approved A.I.-based algorithms
Original Dataset
Source: The Medical Futurist website
Content: This dataset contains information on medical devices and algorithms approved by the FDA from 1995 to 2021.
Selection Reason: This dataset was chosen as an illustrative example of an untidy dataset due to the presence of the following data quality issues:
Duplicate variables: The dataset contained redundant variables named “Medical specialty” and “Secondary medical specialty” with identical purposes.
Ambiguous variable names: The dataset included variable names that were unclear or lacked proper definition.
Missing or incomplete data: Some data points were either missing entirely or incomplete.
Inconsistent missing value representation: Missing data was represented inconsistently.
Tidy Dataset Features
| Feature Name | Description | Data Type |
|---|---|---|
| Device or Algo | Name of the approved device or algorithm | chr |
| Company | Name of the parent company who created the product | chr |
| Description | Short description of the device/algorithm | chr |
| FDA Approval Number | Identifier assigned by the FDA upon approval | chr |
| Type of Approval | Categorization of the FDA approval process (e.g., De Novo, 510(k), PMA) | chr |
| Mention of AI | Keywords used to refer to AI in the product announcement, if AI was mentioned | chr |
| Date | Date of FDA Approval | Date |
| Specialty | Medical field where the product is intended for use | chr |
These are the libraries used in the project:
Data Connection and Import
Data was scraped from the website and saved to the MySQL database. The following code connects to the database and fetches the data.
# Connect to the database.
# `database`, `host`, `port`, `user` and `password` are not defined in this
# section and must already exist when this chunk runs.
connection <- DBI::dbConnect(
  drv = RMariaDB::MariaDB(),
  dbname = database,
  host = host,
  port = port,
  user = user,
  password = password
)
# Fetch the results into a local tibble.
# `finally` closes the connection whether or not the fetch succeeds, so a
# failed query cannot leave the connection open.
fda_approved_ai <- tryCatch(
  tbl(connection, "fda approved ai - sheet1") |>
    collect(),
  finally = DBI::dbDisconnect(connection)
)
# Preview the data
str(fda_approved_ai)
tibble [79 × 10] (S3: tbl_df/tbl/data.frame)
$ Name of device or algorithm : chr [1:79] "Arterys Cardio DL" "EnsoSleep" "Arterys Oncology DL" "Idx" ...
$ Name of parent company : chr [1:79] "Arterys Inc" "EnsoData, Inc" "Arterys Inc" "IDx LLC" ...
$ Short description : chr [1:79] "software analyzing cardiovascular images from MR" "diagnosis of sleep disorders" "medical diagnostic application" "detection of diabetic retinopathy" ...
$ FDA approval number : chr [1:79] "K163253" "K162627" "K173542" "DEN180001" ...
$ Type of FDA approval : chr [1:79] "510(k) premarket notification" "510(k) premarket notification" "510(k) premarket notification" "de novo pathway" ...
$ Mention of A.I. in announcement : chr [1:79] "deep learning" "automated algorithm" "deep learning" "A.I." ...
$ If no mention of A.I. in FDA announcement: chr [1:79] "" "" "" "" ...
$ Date : chr [1:79] "2016 11" "2017 03" "2017 11" "2018 01" ...
$ Medical specialty : chr [1:79] "Radiology" "Neurology" "Radiology" "Ophthalmology" ...
$ Secondary medical specialty : chr [1:79] "Cardiology" "" "Oncology" "" ...
head(fda_approved_ai)
# A tibble: 6 × 10
`Name of device or algorithm` `Name of parent company` `Short description`
<chr> <chr> <chr>
1 Arterys Cardio DL Arterys Inc software analyzing car…
2 EnsoSleep EnsoData, Inc diagnosis of sleep dis…
3 Arterys Oncology DL Arterys Inc medical diagnostic app…
4 Idx IDx LLC detection of diabetic …
5 Koios DS for Breast Koios Medical, Inc diagnostic software fo…
6 ContaCT Viz.AI stroke detection on CT
# ℹ 7 more variables: `FDA approval number` <chr>,
# `Type of FDA approval` <chr>, `Mention of A.I. in announcement` <chr>,
# `If no mention of A.I. in FDA announcement` <chr>, Date <chr>,
# `Medical specialty` <chr>, `Secondary medical specialty` <chr>
Data Cleaning and Tidying
The names of the columns were made to be more descriptive and the date column was formatted to be more readable. The “Medical specialty” and “Secondary medical specialty” columns were combined into a single column and the “Mention of AI” column was cleaned to have consistent missing value representation.
# Build the tidy table from the raw import:
#   1. drop the "If no mention of A.I. in FDA announcement" column,
#   2. give the remaining descriptive columns shorter names,
#   3. turn the "YYYY MM" text in `Date` into a Date on the 1st of that month.
fda_approved_ai_tidy <- fda_approved_ai |>
  select(-"If no mention of A.I. in FDA announcement") |>
  rename(
    `Device or Algo` = `Name of device or algorithm`,
    `Company` = `Name of parent company`,
    `Description` = `Short description`,
    `FDA Approval Number` = `FDA approval number`,
    `Type of Approval` = `Type of FDA approval`,
    `Mention of AI` = `Mention of A.I. in announcement`
  ) |>
  mutate(
    # "2016 11" -> "2016-11" -> "2016-11-01" -> Date
    Date = as.Date(
      paste(gsub(" ", "-", Date), "01", sep = "-"),
      format = "%Y-%m-%d"
    )
  )
str(fda_approved_ai_tidy)
tibble [79 × 9] (S3: tbl_df/tbl/data.frame)
$ Device or Algo : chr [1:79] "Arterys Cardio DL" "EnsoSleep" "Arterys Oncology DL" "Idx" ...
$ Company : chr [1:79] "Arterys Inc" "EnsoData, Inc" "Arterys Inc" "IDx LLC" ...
$ Description : chr [1:79] "software analyzing cardiovascular images from MR" "diagnosis of sleep disorders" "medical diagnostic application" "detection of diabetic retinopathy" ...
$ FDA Approval Number : chr [1:79] "K163253" "K162627" "K173542" "DEN180001" ...
$ Type of Approval : chr [1:79] "510(k) premarket notification" "510(k) premarket notification" "510(k) premarket notification" "de novo pathway" ...
$ Mention of AI : chr [1:79] "deep learning" "automated algorithm" "deep learning" "A.I." ...
$ Date : Date[1:79], format: "2016-11-01" "2017-03-01" ...
$ Medical specialty : chr [1:79] "Radiology" "Neurology" "Radiology" "Ophthalmology" ...
$ Secondary medical specialty: chr [1:79] "Cardiology" "" "Oncology" "" ...
head(fda_approved_ai_tidy)
# A tibble: 6 × 9
`Device or Algo` Company Description `FDA Approval Number` `Type of Approval`
<chr> <chr> <chr> <chr> <chr>
1 Arterys Cardio DL Artery… software a… K163253 510(k) premarket …
2 EnsoSleep EnsoDa… diagnosis … K162627 510(k) premarket …
3 Arterys Oncology… Artery… medical di… K173542 510(k) premarket …
4 Idx IDx LLC detection … DEN180001 de novo pathway
5 Koios DS for Bre… Koios … diagnostic… K190442 510(k) premarket …
6 ContaCT Viz.AI stroke det… DEN170073 de novo pathway
# ℹ 4 more variables: `Mention of AI` <chr>, Date <date>,
# `Medical specialty` <chr>, `Secondary medical specialty` <chr>
# Set `Medical specialty` to "Radiology" on every row whose `Device or Algo`
# is "Koios DS for Breast"; all other rows keep their current value.
fda_approved_ai_tidy <- mutate(
  fda_approved_ai_tidy,
  `Medical specialty` = ifelse(
    `Device or Algo` != "Koios DS for Breast",
    `Medical specialty`,
    "Radiology"
  )
)
str(fda_approved_ai_tidy)
tibble [79 × 9] (S3: tbl_df/tbl/data.frame)
$ Device or Algo : chr [1:79] "Arterys Cardio DL" "EnsoSleep" "Arterys Oncology DL" "Idx" ...
$ Company : chr [1:79] "Arterys Inc" "EnsoData, Inc" "Arterys Inc" "IDx LLC" ...
$ Description : chr [1:79] "software analyzing cardiovascular images from MR" "diagnosis of sleep disorders" "medical diagnostic application" "detection of diabetic retinopathy" ...
$ FDA Approval Number : chr [1:79] "K163253" "K162627" "K173542" "DEN180001" ...
$ Type of Approval : chr [1:79] "510(k) premarket notification" "510(k) premarket notification" "510(k) premarket notification" "de novo pathway" ...
$ Mention of AI : chr [1:79] "deep learning" "automated algorithm" "deep learning" "A.I." ...
$ Date : Date[1:79], format: "2016-11-01" "2017-03-01" ...
$ Medical specialty : chr [1:79] "Radiology" "Neurology" "Radiology" "Ophthalmology" ...
$ Secondary medical specialty: chr [1:79] "Cardiology" "" "Oncology" "" ...
head(fda_approved_ai_tidy)
# A tibble: 6 × 9
`Device or Algo` Company Description `FDA Approval Number` `Type of Approval`
<chr> <chr> <chr> <chr> <chr>
1 Arterys Cardio DL Artery… software a… K163253 510(k) premarket …
2 EnsoSleep EnsoDa… diagnosis … K162627 510(k) premarket …
3 Arterys Oncology… Artery… medical di… K173542 510(k) premarket …
4 Idx IDx LLC detection … DEN180001 de novo pathway
5 Koios DS for Bre… Koios … diagnostic… K190442 510(k) premarket …
6 ContaCT Viz.AI stroke det… DEN170073 de novo pathway
# ℹ 4 more variables: `Mention of AI` <chr>, Date <date>,
# `Medical specialty` <chr>, `Secondary medical specialty` <chr>
Combining the two specialty columns into one was appropriate for the analysis chosen here. Because a device with two specialties now occupies two rows, counting distinct devices (for example with `distinct()` or `n_distinct()`) keeps numerical analysis accurate when needed.
# Stack the two specialty columns into a single `Specialty` column (one row per
# device/specialty pair), discard rows whose specialty is the empty string,
# drop the helper "Specialty Type" column, and recode the exact value
# "Not available" in `Mention of AI` to "Not Mentioned".
fda_approved_ai_tidy <- fda_approved_ai_tidy |>
  pivot_longer(
    cols = c(`Medical specialty`, `Secondary medical specialty`),
    names_to = "Specialty Type",
    values_to = "Specialty"
  ) |>
  filter(Specialty != "") |>
  select(-"Specialty Type") |>
  mutate(
    `Mention of AI` = ifelse(
      `Mention of AI` != "Not available",
      `Mention of AI`,
      "Not Mentioned"
    )
  )
str(fda_approved_ai_tidy)
tibble [106 × 8] (S3: tbl_df/tbl/data.frame)
$ Device or Algo : chr [1:106] "Arterys Cardio DL" "Arterys Cardio DL" "EnsoSleep" "Arterys Oncology DL" ...
$ Company : chr [1:106] "Arterys Inc" "Arterys Inc" "EnsoData, Inc" "Arterys Inc" ...
$ Description : chr [1:106] "software analyzing cardiovascular images from MR" "software analyzing cardiovascular images from MR" "diagnosis of sleep disorders" "medical diagnostic application" ...
$ FDA Approval Number: chr [1:106] "K163253" "K163253" "K162627" "K173542" ...
$ Type of Approval : chr [1:106] "510(k) premarket notification" "510(k) premarket notification" "510(k) premarket notification" "510(k) premarket notification" ...
$ Mention of AI : chr [1:106] "deep learning" "deep learning" "automated algorithm" "deep learning" ...
$ Date : Date[1:106], format: "2016-11-01" "2016-11-01" ...
$ Specialty : chr [1:106] "Radiology" "Cardiology" "Neurology" "Radiology" ...
head(fda_approved_ai_tidy)
# A tibble: 6 × 8
`Device or Algo` Company Description `FDA Approval Number` `Type of Approval`
<chr> <chr> <chr> <chr> <chr>
1 Arterys Cardio DL Artery… software a… K163253 510(k) premarket …
2 Arterys Cardio DL Artery… software a… K163253 510(k) premarket …
3 EnsoSleep EnsoDa… diagnosis … K162627 510(k) premarket …
4 Arterys Oncology… Artery… medical di… K173542 510(k) premarket …
5 Arterys Oncology… Artery… medical di… K173542 510(k) premarket …
6 Idx IDx LLC detection … DEN180001 de novo pathway
# ℹ 3 more variables: `Mention of AI` <chr>, Date <date>, Specialty <chr>
Analysis
Between 1995 and 2021, which are the Top 10 in Approvals by Specialty?
Radiology (41), Cardiology (19), and Neurology (10) are the top 3 specialties with the most FDA approvals for AI-based algorithms.
Of the top 10 FDA Approval Specialties, Radiology and Cardiology account for 64% of the total number of approvals.
# Plot 1: lollipop chart of the 10 specialties with the most rows in the tidy
# table (one row per device/specialty pair), ordered from most to fewest.
fda_approved_ai_tidy |>
  group_by(Specialty) |>
  summarize("Number of Specialty Approvals" = n()) |>
  arrange(desc(`Number of Specialty Approvals`)) |>
  head(10) |>
  ggplot(aes(
    x = reorder(Specialty, -`Number of Specialty Approvals`),
    y = `Number of Specialty Approvals`
  )) +
  geom_point(size = 6, color = "red") +
  # Stem of each lollipop: a vertical segment from 0 up to the count.
  geom_segment(
    aes(
      x = Specialty,
      xend = Specialty,
      y = 0,
      yend = `Number of Specialty Approvals`
    ),
    color = "red"
  ) +
  geom_text(aes(label = `Number of Specialty Approvals`), color = "black") +
  labs(
    title = "Top 10 Specialty Approvals",
    x = "Specialty",
    y = "Number of Approvals"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.5))

# Plot 2: single stacked bar showing each top-10 specialty's share of
# approvals. Counts use distinct FDA approval numbers, and the percentage is
# relative to the top-10 total (not to every approval in the data set).
fda_approved_ai_tidy |>
  group_by(Specialty) |>
  summarize(
    "Number of Specialty Approvals" = n_distinct(`FDA Approval Number`)
  ) |>
  arrange(desc(`Number of Specialty Approvals`)) |>
  head(10) |>
  arrange(Specialty) |>
  mutate("Percentage of Top 10 Approvals" = round(
    `Number of Specialty Approvals` / sum(`Number of Specialty Approvals`) * 100
  )) |>
  ggplot(aes(x = "", y = `Percentage of Top 10 Approvals`, fill = Specialty)) +
  geom_bar(width = 1, stat = "identity") +
  geom_text(
    aes(label = paste0(`Percentage of Top 10 Approvals`, "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  theme_void()
Conclusion
Accounting for 44% of approvals among the top 10 specialties, Radiology is the most common specialty for FDA approvals of AI-based algorithms.
This is likely due to the increasing use of AI in medical imaging and the potential for AI to improve diagnostic accuracy and efficiency. According to HealthExec.com, “Artificial intelligence is really good at discerning patterns within the data. There has been a lot of work in the medical imaging space, where AI can really help improve diagnostic capabilities with image recognition.” [1] It also seems that in the field of Radiology there is ample data for device developers to draw on from imaging and electrocardiograms. [2]
Dataset was successfully tidied and transformed to allow for this analysis.