library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Build the BioSample search URL for one record. The ID is appended with
# paste0() so the same pattern can be reused when looping over many IDs.
namePage <- paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", 3128077)
# Download and parse the page; testPage holds the full HTML document
# (the same markup a browser shows under "Inspect Element").
testPage <- read_html(namePage)
# The data of interest sits in a table: select the first <table> node and
# parse it into a data frame (a tibble).
tableText <- html_table(html_node(testPage, "table"))
# Label the two columns: attribute name and its value.
names(tableText) <- c("Question", "Response")
# Preview the first rows.
head(tableText)
## # A tibble: 6 × 2
## Question Response
## <chr> <chr>
## 1 study name The Breast and Prostate Cancer Cohort Consortium (BPC3) GWAS o…
## 2 study design Case-Control
## 3 gap subject id 1078196
## 4 sex female
## 5 tissue Blood
## 6 analyte type DNA
# Define the range of BioSample IDs to scrape.
ids <- 31280770:31280777

# Scrape the attribute table of a single BioSample page.
#
# id: BioSample ID appended to the search URL.
# Returns the page's first table with columns `Question` and `Response`,
# or NULL (with a warning) when the page cannot be downloaded, holds no
# <table>, or the table does not have exactly two columns. Failing softly
# means one bad ID no longer aborts the whole run.
scrape_biosample_table <- function(id) {
  url <- paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", id)
  tryCatch(
    {
      node <- html_node(read_html(url), "table")
      # html_node() yields a missing-node placeholder when nothing matches.
      if (inherits(node, "xml_missing")) {
        stop("no <table> element on the page")
      }
      sample_table <- html_table(node)
      # Only rename when the shape is what the rest of the script expects.
      if (ncol(sample_table) != 2L) {
        stop("expected 2 columns, found ", ncol(sample_table))
      }
      names(sample_table) <- c("Question", "Response")
      sample_table
    },
    error = function(e) {
      warning(
        "Skipping sample ", id, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}

# Scrape every ID, one table per list element.
survey_list <- lapply(ids, scrape_biosample_table)
# Name the elements first so each label stays attached to its own ID ...
names(survey_list) <- paste0("Sample_", ids)
# ... then drop the IDs that failed, so later steps only see real tables.
survey_list <- Filter(Negate(is.null), survey_list)
# --- Summarize single questions across all surveys ---

# Collect every answer to one question from a list of survey tables.
#
# surveys:  list of tables with columns `Question` and `Response`.
# question: label to look up in `Question` (exact, case-sensitive match).
# Returns the matching `Response` values of all tables as a single vector
# and warns when no table contains the question, because the frequency
# table (and any chart built from it) would otherwise be silently empty.
collect_responses <- function(surveys, question) {
  answers <- unlist(lapply(surveys, function(survey) {
    survey %>%
      filter(Question == question) %>%
      pull(Response)
  }))
  if (length(answers) == 0) {
    warning(
      "No responses found for question \"", question, "\"",
      call. = FALSE
    )
  }
  answers
}

# Example: "host" (change to whatever question is present in all surveys).
host_responses <- collect_responses(survey_list, "host")
# Frequency table of the answers.
host_summary <- as.data.frame(table(host_responses))

# --- Summarize responses for "sex" ---
responses <- collect_responses(survey_list, "sex")
# Tabulate. The variable must stay named `responses`: table() uses that
# name for the resulting column, which the pie chart maps to `fill`.
summary_df <- as.data.frame(table(responses))
# --- Pie chart for "sex" ---
# A single stacked bar (x is the constant "") whose height is the answer
# count; wrapping the y axis around a circle turns the bar into a pie.
ggplot(
  data = summary_df,
  mapping = aes(x = "", y = Freq, fill = responses)
) +
  geom_col(color = "white", width = 1) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(fill = "Sex", title = "Distribution of Sex Responses") +
  # Centre and embolden the title; applied after theme_void() so it sticks.
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
