Part I: Repeat code above for your assigned Biosample ID (31280771)

library(xml2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(rvest)

## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding

# Search URL on NCBI BioSample for the assigned sample ID.
namePage <- paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", 31280771)

# Download and parse the HTML of the search result page.
testPage <- read_html(namePage)

# Convert the first <table> node found on the page into a tibble, then
# label its two columns so each row reads as a question/answer pair.
tableText <- html_table(html_node(testPage, "table"))
tableText <- setNames(tableText, c("Question", "Response"))

# Preview the first rows of the survey.
head(tableText)

## # A tibble: 6 × 2
##   Question              Response         
##   <chr>                 <chr>            
## 1 dominant hand         I am right handed
## 2 environmental medium  feces            
## 3 environmental package human-gut        
## 4 host body habitat     UBERON:feces     
## 5 host body mass index  26.0             
## 6 host body product     UBERON:feces

Part II & III: Modify the script to download ALL the survey results for samples 31280770-31280775 into a single list consisting of 5 data frames. Columns in the imported data frames are given meaningful column names.

# BioSample IDs whose survey tables are downloaded.
# NOTE(review): 31280774 is not in this vector although the task text gives
# the range 31280770-31280775 -- confirm the omission is intentional.
sample_ids <- c(31280770, 31280771, 31280772, 31280773, 31280775)

# Download the survey table for a single BioSample ID.
#
# id: BioSample ID as a character string (it is pasted verbatim into the
#     search URL).
# Returns the first <table> on the result page as a tibble whose two columns
# are named "Question" and "Response". Any download or parsing error is
# propagated to the caller.
read_survey <- function(id) {
  page_url <- paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", id)

  # `fill = TRUE` is no longer passed: the argument is deprecated in
  # rvest >= 1.0.0, where missing cells are always filled automatically.
  survey <- read_html(page_url) %>%
    html_node("table") %>%
    html_table()

  names(survey) <- c("Question", "Response")
  survey
}

# Render the IDs as plain digit strings once, so the same text is used for
# the URL and for the list names. format(scientific = FALSE) avoids the
# scientific notation that implicit conversion would produce for round
# values (e.g. 30000000 would otherwise become "3e+07").
sample_keys <- format(sample_ids, scientific = FALSE, trim = TRUE)

# One data frame per sample, named by its ID.
survey_list <- lapply(sample_keys, read_survey)
names(survey_list) <- sample_keys

head(survey_list)

## $`31280770`
## # A tibble: 230 × 2
##    Question              Response         
##    <chr>                 <chr>            
##  1 dominant hand         I am right handed
##  2 environmental medium  feces            
##  3 environmental package human-gut        
##  4 host body habitat     UBERON:feces     
##  5 host body mass index  21.0             
##  6 host body product     UBERON:feces     
##  7 host tissue sampled   UBERON:feces     
##  8 host height           162.0            
##  9 life stage            Adult            
## 10 race                  Hispanic         
## # ℹ 220 more rows
## 
## $`31280771`
## # A tibble: 295 × 2
##    Question              Response         
##    <chr>                 <chr>            
##  1 dominant hand         I am right handed
##  2 environmental medium  feces            
##  3 environmental package human-gut        
##  4 host body habitat     UBERON:feces     
##  5 host body mass index  26.0             
##  6 host body product     UBERON:feces     
##  7 host tissue sampled   UBERON:feces     
##  8 host height           184.0            
##  9 life stage            Adult            
## 10 race                  Hispanic         
## # ℹ 285 more rows
## 
## $`31280772`
## # A tibble: 293 × 2
##    Question              Response        
##    <chr>                 <chr>           
##  1 dominant hand         I am left handed
##  2 environmental medium  feces           
##  3 environmental package human-gut       
##  4 host body habitat     UBERON:feces    
##  5 host body mass index  25.7            
##  6 host body product     UBERON:feces    
##  7 host tissue sampled   UBERON:feces    
##  8 host height           159.0           
##  9 life stage            Adult           
## 10 race                  Hispanic        
## # ℹ 283 more rows
## 
## $`31280773`
## # A tibble: 295 × 2
##    Question              Response        
##    <chr>                 <chr>           
##  1 dominant hand         I am left handed
##  2 environmental medium  feces           
##  3 environmental package human-gut       
##  4 host body habitat     UBERON:feces    
##  5 host body mass index  25.4            
##  6 host body product     UBERON:feces    
##  7 host tissue sampled   UBERON:feces    
##  8 host height           166.0           
##  9 life stage            Adult           
## 10 race                  Hispanic        
## # ℹ 285 more rows
## 
## $`31280775`
## # A tibble: 293 × 2
##    Question              Response         
##    <chr>                 <chr>            
##  1 dominant hand         I am right handed
##  2 environmental medium  feces            
##  3 environmental package human-gut        
##  4 host body habitat     UBERON:feces     
##  5 host body mass index  33.3             
##  6 host body product     UBERON:feces     
##  7 host tissue sampled   UBERON:feces     
##  8 host height           154.0            
##  9 life stage            Adult            
## 10 race                  Hispanic         
## # ℹ 283 more rows

Part IV: Summarize the responses from a single question that is in all 5 surveys with a pie chart.

library(ggplot2)
library(dplyr)
# Stack the per-sample survey tables into one data frame; the list names
# (the sample IDs) are kept in the `From` column.
dfs <- bind_rows(survey_list, .id = "From")

# Keep only the "life stage" question and count the rows for each distinct
# answer. The result has the columns `Response` and `count`.
life_stage <- dfs %>%
  filter(Question == "life stage") %>%
  count(Response, name = "count")

# A pie chart in ggplot2 is a single stacked bar drawn in polar coordinates:
# the bar heights (`count`) become the slice angles.
pie <- ggplot(life_stage, aes(x = "", y = count, fill = Response)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  # Label each slice with its share of all responses, centred in the slice.
  geom_text(aes(label = paste0(round(count / sum(count) * 100), "%")),
            position = position_stack(vjust = 0.5)) +
  scale_fill_manual(
    values = c("#55DDE0", "#33658A", "#2F4858", "#F6AE2D", "#F26419")
  ) +
  labs(x = NULL, y = NULL, fill = NULL, title = "Survey Responses: Life Stage") +
  theme_classic() +
  # Remove the axis decorations, which are meaningless on a pie chart.
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        plot.title = element_text(hjust = 0.5, color = "#666666"))
print(pie)

Scrape Surveys

Jourdan Hourican

2024-09-23

Part I: Repeat code above for your assigned Biosample ID (31280771)

Part II & III: Modify the script to download ALL the survey results for samples 31280770-31280775 into a single list consisting of 5 data frames. Columns in the imported data frames are given meaningful column names.

Part IV: Summarize the responses from a single question that is in all 5 surveys with a pie chart.