Load libraries
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Create empty list to hold data from multiple pages
# Empty container; one scraped table per sample ID is added to it below
surveyData <- list()
Read data from a set of 8 webpages
# Scrape the first table of eight consecutive BioSample pages and store each
# one in surveyData, keyed by its sample ID.
for (i in seq_len(8)) {
  # Sample ID for this iteration (31280170 through 31280177)
  j <- i + 31280169
  namePage <- paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", j)

  # testPage holds the parsed HTML of the whole webpage
  testPage <- read_html(namePage)

  # First <table> element on the page, converted to a data frame
  tableText <- html_table(html_node(testPage, "table"))
  names(tableText) <- c("Question", "Response")

  surveyData[[as.character(j)]] <- tableText
}
Use sapply to pull data from list
# Keep only the second column (the responses) of every scraped table
responses <- sapply(surveyData, `[`, 2)
# Take the 9th response of each sample; it is treated as the host age below.
# NOTE(review): this lookup is positional - confirm that row 9 holds the same
# question in every sample table.
ages <- sapply(responses, `[`, 9)
Make a pie chart of the age responses
# Strip the " years" suffix, then convert the remaining text to numbers
ages2 <- as.numeric(gsub(pattern = " years", replacement = "", x = ages))
# Frequency of each distinct age
table(ages2)
## ages2
## 50 56 58 59 84 86 88
## 1 1 1 1 2 1 1
# One slice per distinct age, sized by how many samples report it
pie(x = table(ages2), main = "Pie Chart of Host Age")

Make a pie chart of the response for gender
# Take the 10th response of each sample; it is charted as host gender.
# NOTE(review): this lookup is positional - confirm that row 10 holds the same
# question in every sample table.
genders <- sapply(responses, `[`, 10)
# No numeric conversion is needed here (the age conversion was done above);
# the gender values are tabulated as text.
pie(x = table(genders), main = "Pie Chart of Host Gender")

Personal code
library(purrr)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
1. Test assigned ID
Function to scrape one BioSample
# Scrape the attribute table of a single NCBI BioSample page.
#
# Arguments:
#   id  A single BioSample identifier (number or string). It is appended to
#       the BioSample search URL as the value of `term`.
#
# Returns:
#   The first <table> on the page, as converted by html_table(), with its two
#   columns renamed to "Question" and "Response".
#
# Errors:
#   Stops with an explanatory message if `id` is not a single non-missing
#   value, if the page contains no <table>, or if the table does not have
#   exactly two columns.
get_biosample <- function(id) {
  if (length(id) != 1L || is.na(id)) {
    stop("`id` must be a single, non-missing BioSample ID.", call. = FALSE)
  }

  # paste0() formats doubles via as.character(), which switches to scientific
  # notation for round values (31000000 becomes "3.1e+07") and would corrupt
  # the URL. Format numeric IDs explicitly in fixed notation instead.
  id_text <- if (is.numeric(id)) {
    format(id, scientific = FALSE, trim = TRUE, digits = 15)
  } else {
    as.character(id)
  }

  # Build URL for NCBI BioSample page
  url <- paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", id_text)

  # Download the webpage and parse HTML
  page <- read_html(url)

  # Find the first table element; html_node() signals "no match" by returning
  # an object of class "xml_missing"
  table_node <- html_node(page, "table")
  if (inherits(table_node, "xml_missing")) {
    stop("No table found on BioSample page: ", url, call. = FALSE)
  }

  # Convert HTML table to data frame
  table_data <- html_table(table_node)
  if (!is.data.frame(table_data) || ncol(table_data) != 2L) {
    stop("Expected a two-column table on BioSample page: ", url,
         call. = FALSE)
  }

  # Assign column names
  names(table_data) <- c("Question", "Response")
  table_data
}
Test with assigned ID
# Scrape the assigned sample and preview the first six rows of its table
my_data <- get_biosample(31280772)
head(my_data, n = 6L)
## # A tibble: 6 × 2
## Question Response
## <chr> <chr>
## 1 dominant hand I am left handed
## 2 environmental medium feces
## 3 environmental package human-gut
## 4 host body habitat UBERON:feces
## 5 host body mass index 25.7
## 6 host body product UBERON:feces
2. Download all 8 samples
Use map() instead of a for loop
# The eight consecutive BioSample IDs to download
sample_ids <- 31280770:31280777
# Scrape every ID with purrr's map() (replacing the for loop), then label each
# result with its sample ID
survey_data <- map(sample_ids, get_biosample)
names(survey_data) <- as.character(sample_ids)
# Report how many samples were collected
cat("Successfully scraped", length(survey_data), "samples\n")
## Successfully scraped 8 samples
# Top-level overview only: one line per sample showing its table dimensions
str(survey_data, max.level = 1)
## List of 8
## $ 31280770: tibble [230 × 2] (S3: tbl_df/tbl/data.frame)
## $ 31280771: tibble [295 × 2] (S3: tbl_df/tbl/data.frame)
## $ 31280772: tibble [293 × 2] (S3: tbl_df/tbl/data.frame)
## $ 31280773: tibble [295 × 2] (S3: tbl_df/tbl/data.frame)
## $ 31280774: tibble [230 × 2] (S3: tbl_df/tbl/data.frame)
## $ 31280775: tibble [293 × 2] (S3: tbl_df/tbl/data.frame)
## $ 31280776: tibble [292 × 2] (S3: tbl_df/tbl/data.frame)
## $ 31280777: tibble [293 × 2] (S3: tbl_df/tbl/data.frame)
3. Create a pie chart of age categories
Create pie chart showing distribution of age categories
# Tally the samples per age category.
# NOTE(review): in the code shown here `ages` is built from `surveyData`, not
# from `survey_data` scraped in step 2 - confirm that `ages` holds the
# age-category responses of the intended samples before relying on this chart.
age_counts <- table(ages)

# Draw the pie chart with one rainbow colour per category
pie(
  x = age_counts,
  col = rainbow(length(age_counts)),
  main = "Age Categories - American Gut Project Samples"
)

# Print the underlying counts for reference
print(age_counts)
## ages
## 20s 30s 40s 50s
## 3 3 1 1