library(rvest)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Build the BioSample query URL. The ID is appended with paste0 so the same
# pattern can be reused in a loop over many IDs.
namePage <- paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", 3128077)

# Download and parse the page; testPage holds the parsed HTML document
# (the same markup a browser shows under "Inspect Element").
testPage <- read_html(namePage)

# The data of interest sits in a table, so select the first <table> node
# and convert it to a data frame.
tableText <- html_table(html_node(testPage, "table"))

# Label the two columns.
tableText <- setNames(tableText, c("Question", "Response"))

# Preview the first rows.
head(tableText)
## # A tibble: 6 × 2
##   Question       Response                                                       
##   <chr>          <chr>                                                          
## 1 study name     The Breast and Prostate Cancer Cohort Consortium (BPC3) GWAS o…
## 2 study design   Case-Control                                                   
## 3 gap subject id 1078196                                                        
## 4 sex            female                                                         
## 5 tissue         Blood                                                          
## 6 analyte type   DNA
# Define the range of BioSample IDs to scrape
ids <- 31280770:31280777

# Loop through each ID and scrape its attribute table.
#
# Every element of survey_list is a two-column table (Question, Response).
# If a page cannot be fetched, contains no <table>, or its table does not have
# exactly two columns, a warning is raised for that ID and a zero-row table
# with the same two columns is stored instead, so one bad ID does not abort
# the whole run and later code can still filter on Question.
survey_list <- lapply(ids, function(id) {
  # Construct URL
  url <- paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", id)

  tryCatch(
    {
      # Read page and locate the first table
      table_node <- html_node(read_html(url), "table")

      # html_node() yields an "xml_missing" object when nothing matches;
      # fail with a clear message instead of letting html_table() choke on it.
      if (inherits(table_node, "xml_missing")) {
        stop("no <table> element found on the page", call. = FALSE)
      }

      # Extract table
      sample_table <- html_table(table_node)

      # Renaming below needs exactly two columns.
      if (ncol(sample_table) != 2L) {
        stop(
          "expected 2 columns but found ", ncol(sample_table),
          call. = FALSE
        )
      }

      # Give consistent column names
      names(sample_table) <- c("Question", "Response")
      sample_table
    },
    error = function(e) {
      warning(
        "Skipping BioSample ", id, ": ", conditionMessage(e),
        call. = FALSE
      )
      # Zero-row placeholder with the same columns as a scraped table.
      tibble(Question = character(), Response = character())
    }
  )
})

# Give names to list elements for clarity
names(survey_list) <- paste0("Sample_", ids)

# Collect the responses to one question across a list of survey tables.
#
# surveys:        list of data frames, each with Question and Response columns.
# question_label: the exact (case-sensitive) Question value to look up.
# Returns the matching Response values from all tables combined into a single
# vector by unlist(); tables without the question contribute nothing.
collect_responses <- function(surveys, question_label) {
  per_sample <- lapply(surveys, function(df) {
    df %>%
      filter(Question == question_label) %>%
      pull(Response)
  })
  unlist(per_sample)
}

# --- Summarize a single question ---
# Example: pick the question "host" (change to whatever is in all surveys)
host_responses <- collect_responses(survey_list, "host")

# Create a frequency table (columns: host_responses, Freq)
host_summary <- as.data.frame(table(host_responses))

# --- Summarize responses for "sex" ---
responses <- collect_responses(survey_list, "sex")

# Tabulate (columns: responses, Freq)
summary_df <- as.data.frame(table(responses))

# --- Pie chart for "sex" ---
# A pie chart here is one stacked bar (x fixed to "") drawn in polar
# coordinates, with the bar heights (Freq) mapped to the angle.
ggplot(
  data = summary_df,
  mapping = aes(x = "", y = Freq, fill = responses)
) +
  geom_bar(stat = "identity", width = 1, colour = "white") +
  coord_polar(theta = "y") +
  labs(fill = "Sex", title = "Distribution of Sex Responses") +
  theme_void() +
  # Applied after theme_void() so the title styling is not reset by it.
  theme(plot.title = element_text(face = "bold", hjust = 0.5))