Example code adapted from https://rpubs.com/profbiot/gutscrape (everything up to the "Personal code" section)

Load libraries

library(rvest)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Create empty list to hold data from multiple pages

# Accumulator for the scraped tables; starts out as a zero-length list
surveyData = vector("list", length = 0)

Read data from a set of 8 webpages

# Scrape eight consecutive BioSample pages; each parsed table is stored in
# surveyData under its sample ID
for (i in seq_len(8)) {
  # Sample IDs run from 31280170 to 31280177
  j = 31280169 + i
  namePage = paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", j)
  # testPage holds the parsed HTML of the whole page
  testPage = read_html(namePage)
  # Convert the first <table> on the page into a data frame
  tableText = html_table(html_node(testPage, "table"))
  names(tableText) = c("Question", "Response")
  surveyData[[as.character(j)]] = tableText
}

Use sapply to pull data from list

# Keep only the second column (the responses) of every scraped table
responses = sapply(surveyData, function(tbl) tbl[2])
# Take the 9th response from each sample
ages = sapply(responses, function(answers) answers[9])

Make a pie chart of the host age responses

# Strip the " years" suffix, then convert the ages from chr to num
ages2 = ages %>%
  gsub(" years", "", .) %>%
  as.numeric()
# Tabulate how many samples report each age
table(ages2)

## ages2
## 50 56 58 59 84 86 88 
##  1  1  1  1  2  1  1

# Plot the share of samples at each host age
pie(x = table(ages2), main = "Pie Chart of Host Age")

Make a pie chart of the response for gender

# Take the 10th response from each sample
genders = sapply(responses, function(answers) answers[10])
# Plot the share of samples in each gender category
pie(x = table(genders), main = "Pie Chart of Host Gender")

Personal code

library(purrr)
library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

1. Test assigned ID

Function to scrape one BioSample

# Scrape the attribute table of a single NCBI BioSample record.
#
# id: one BioSample ID (number or string); it becomes the search term of the
#     BioSample URL.
# Returns the first HTML table on the page as a data frame with the columns
# "Question" and "Response". Stops with an error if `id` is not a single
# non-missing value, if the page has no table, or if the table does not have
# exactly two columns.
get_biosample = function(id) {
  if (length(id) != 1 || is.na(id)) {
    stop("`id` must be a single, non-missing BioSample ID", call. = FALSE)
  }
  # Numeric IDs are written out in full: paste0() alone would turn a double
  # such as 31000000 into "3.1e+07" and request the wrong page
  term = if (is.numeric(id)) {
    format(id, scientific = FALSE, trim = TRUE)
  } else {
    as.character(id)
  }
  url = paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", term)
  # Download the webpage and parse its HTML
  page = read_html(url)
  # html_element() yields a missing node when the page has no <table>
  table_node = html_element(page, "table")
  if (inherits(table_node, "xml_missing")) {
    stop("No <table> element found at ", url, call. = FALSE)
  }
  # Convert the HTML table to a data frame
  table_data = html_table(table_node)
  if (ncol(table_data) != 2) {
    stop("Expected a 2-column table at ", url, " but found ",
         ncol(table_data), " columns", call. = FALSE)
  }
  # Assign column names
  names(table_data) = c("Question", "Response")
  table_data
}

Test with assigned ID

# Scrape the assigned sample and preview the first rows of its table
my_data = get_biosample(id = 31280772)
head(my_data, n = 6)

## # A tibble: 6 × 2
##   Question              Response        
##   <chr>                 <chr>           
## 1 dominant hand         I am left handed
## 2 environmental medium  feces           
## 3 environmental package human-gut       
## 4 host body habitat     UBERON:feces    
## 5 host body mass index  25.7            
## 6 host body product     UBERON:feces

2. Download all 8 samples

Use map() instead of for loop

# IDs of the eight consecutive BioSample records to scrape
sample_ids = 31280770:31280777
# Scrape every ID with get_biosample(), then label each table with its ID
survey_data = set_names(map(sample_ids, get_biosample),
                        as.character(sample_ids))
# Report how many samples were collected
cat("Successfully scraped", length(survey_data), "samples\n")

## Successfully scraped 8 samples

# Show only the top level of the list, without expanding each table
str(object = survey_data, max.level = 1)

## List of 8
##  $ 31280770: tibble [230 × 2] (S3: tbl_df/tbl/data.frame)
##  $ 31280771: tibble [295 × 2] (S3: tbl_df/tbl/data.frame)
##  $ 31280772: tibble [293 × 2] (S3: tbl_df/tbl/data.frame)
##  $ 31280773: tibble [295 × 2] (S3: tbl_df/tbl/data.frame)
##  $ 31280774: tibble [230 × 2] (S3: tbl_df/tbl/data.frame)
##  $ 31280775: tibble [293 × 2] (S3: tbl_df/tbl/data.frame)
##  $ 31280776: tibble [292 × 2] (S3: tbl_df/tbl/data.frame)
##  $ 31280777: tibble [293 × 2] (S3: tbl_df/tbl/data.frame)

3. Create a pie chart of age categories

Extract age category responses from all samples

# Pull the age-category answer out of each sample's table
# (map_chr() instead of sapply(), so the result is always a character vector)
ages = map_chr(survey_data, function(sample_tbl) {
  # Rows whose question label contains "age_cat"
  age_row = which(str_detect(sample_tbl$Question, "age_cat"))
  # map_chr() needs exactly one string per sample: use the first matching row,
  # or a typed missing value when the question is absent
  if (length(age_row) > 0) {
    as.character(sample_tbl$Response[[age_row[1]]])
  } else {
    NA_character_
  }
})
# Drop samples without an answer; plain subsetting, unlike na.omit(), does not
# attach "na.action"/class attributes that clutter the printed vector
ages = ages[!is.na(ages)]
print(ages)

## 31280770 31280771 31280772 31280773 31280774 31280775 31280776 31280777 
##    "30s"    "20s"    "30s"    "50s"    "20s"    "30s"    "40s"    "20s"

Create pie chart showing distribution of age categories

# Tally the samples in each age category
age_counts = table(ages)
# Draw the pie chart, giving every category its own rainbow colour
pie(x = age_counts,
    col = rainbow(n = length(age_counts)),
    main = "Age Categories - American Gut Project Samples")

# Print the tallies for reference
print(age_counts)

## ages
## 20s 30s 40s 50s 
##   3   3   1   1

Scrape Surveys

Collin McNeil

2025-09-27

Example code adapted from https://rpubs.com/profbiot/gutscrape (everything up to the "Personal code" section)

Load libraries

Create empty list to hold data from multiple pages

Read data from a set of 8 webpages

Use sapply to pull data from list

Make a pie chart of the host age responses

Make a pie chart of the response for gender

Personal code

1. Test assigned ID

Function to scrape one BioSample

Test with assigned ID

2. Download all 8 samples

Use map() instead of for loop

3. Create a pie chart of age categories

Extract age category responses from all samples

Create pie chart showing distribution of age categories