Goal: Create a script that downloads survey data from the American Gut Project when given the ID number for an NCBI BioSample.
My ID number: 31280773
loading libraries
library("rvest")
library("ggplot2")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("reshape")
##
## Attaching package: 'reshape'
##
## The following object is masked from 'package:lubridate':
##
## stamp
##
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
##
## The following object is masked from 'package:dplyr':
##
## rename
library("reticulate")
Defining the URL and ID number I want R to use before I recreate the code for more IDs
# Build the BioSample search URL for the single sample of interest.
website <- sprintf(
  "https://www.ncbi.nlm.nih.gov/biosample/?term=%d",
  31280773L
)
This object will store the parsed HTML document read from that URL
# Download the page at `website` and parse it into an HTML document.
Webpage <- website %>% read_html()
Extracting the first table on the page as a data frame, changing the column names, and checking the format
# Pull the first <table> element out of the parsed page and convert it to a
# data frame.
WebsiteTable <- html_table(html_element(Webpage, "table"))
# The table has two columns: the survey question and the given answer.
colnames(WebsiteTable) <- c("Question", "Response")
# Preview the first rows to confirm the layout.
head(WebsiteTable)
## # A tibble: 6 × 2
## Question Response
## <chr> <chr>
## 1 dominant hand I am left handed
## 2 environmental medium feces
## 3 environmental package human-gut
## 4 host body habitat UBERON:feces
## 5 host body mass index 25.4
## 6 host body product UBERON:feces
Applying code to multiple IDs at once:
Creating a function that will scrape data from given IDs, extract the data as a table, and rename the columns
# Scrape the attribute table of one NCBI BioSample record.
#
# Args:
#   SampleID: A single BioSample ID (numeric or character). It is appended to
#     the BioSample search URL.
#
# Returns:
#   The first HTML table on the resulting page, converted with html_table()
#   and with its two columns renamed to "Question" and "Response".
#
# Errors:
#   Stops if SampleID is not length one, or if the page contains no table.
Scrapedata <- function(SampleID) {
  # paste0() would otherwise build several URLs, which read_html() cannot take.
  stopifnot(length(SampleID) == 1L)

  url <- paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=", SampleID)
  webpage <- read_html(url)

  # html_element() yields a missing node when nothing matches; fail with a
  # readable message instead of an obscure error further down.
  table_node <- html_element(webpage, "table")
  if (inherits(table_node, "xml_missing")) {
    stop(
      "No table found on the BioSample page for ID ", SampleID,
      call. = FALSE
    )
  }

  Website_Tables <- html_table(table_node)
  names(Website_Tables) <- c("Question", "Response")
  Website_Tables
}
Loading in sample IDs and applying the above function to extract data and rename columns
# IDs of the BioSample records to download.
SampleID <- 31280770:31280775
# Scrape every ID and key each resulting table by its ID (as a string).
# lapply() is used instead of `for (SampleID in SampleID)`, which reused the
# vector's name as the loop variable and left SampleID holding only the last
# ID once the loop finished.
Results <- lapply(SampleID, Scrapedata)
names(Results) <- as.character(SampleID)
Checking the beginning of each survey
# Show the first six rows of every scraped survey table.
lapply(Results, function(survey) head(survey, n = 6L))
## $`31280770`
## # A tibble: 6 × 2
## Question Response
## <chr> <chr>
## 1 dominant hand I am right handed
## 2 environmental medium feces
## 3 environmental package human-gut
## 4 host body habitat UBERON:feces
## 5 host body mass index 21.0
## 6 host body product UBERON:feces
##
## $`31280771`
## # A tibble: 6 × 2
## Question Response
## <chr> <chr>
## 1 dominant hand I am right handed
## 2 environmental medium feces
## 3 environmental package human-gut
## 4 host body habitat UBERON:feces
## 5 host body mass index 26.0
## 6 host body product UBERON:feces
##
## $`31280772`
## # A tibble: 6 × 2
## Question Response
## <chr> <chr>
## 1 dominant hand I am left handed
## 2 environmental medium feces
## 3 environmental package human-gut
## 4 host body habitat UBERON:feces
## 5 host body mass index 25.7
## 6 host body product UBERON:feces
##
## $`31280773`
## # A tibble: 6 × 2
## Question Response
## <chr> <chr>
## 1 dominant hand I am left handed
## 2 environmental medium feces
## 3 environmental package human-gut
## 4 host body habitat UBERON:feces
## 5 host body mass index 25.4
## 6 host body product UBERON:feces
##
## $`31280774`
## # A tibble: 6 × 2
## Question Response
## <chr> <chr>
## 1 dominant hand I am right handed
## 2 environmental medium feces
## 3 environmental package human-gut
## 4 host body habitat UBERON:feces
## 5 host body mass index 20.4
## 6 host body product UBERON:feces
##
## $`31280775`
## # A tibble: 6 × 2
## Question Response
## <chr> <chr>
## 1 dominant hand I am right handed
## 2 environmental medium feces
## 3 environmental package human-gut
## 4 host body habitat UBERON:feces
## 5 host body mass index 33.3
## 6 host body product UBERON:feces
Melting Results so that the tables stored in the Results list are combined into one data frame, then filtering that data frame by the “Question” column to keep only the ‘milk_cheese_frequency’ responses
# Collapse the list of per-sample tables into one long data frame. The
# namespace is spelled out because several attached packages mask each other.
myDF <- reshape::melt(data = Results)
## Using Question, Response as id variables
## Using Question, Response as id variables
## Using Question, Response as id variables
## Using Question, Response as id variables
## Using Question, Response as id variables
## Using Question, Response as id variables
# Keep only the rows answering the milk/cheese frequency question.
CheeseHabit <- myDF %>%
  dplyr::filter(Question == "milk_cheese_frequency")
# Label the third column, which identifies the sample each answer came from.
colnames(CheeseHabit) <- c("Question", "Response", "Survey Responder ID")
making a pie chart of the responses about cheese consumption
# Pie chart of the milk/cheese frequency answers.
# Each slice is sized by the number of respondents who gave that answer:
# geom_bar() counts the rows per Response, instead of mapping the constant
# character column Question to the bar height. A single stacked bar bent
# around the y axis by coord_polar() becomes the pie.
ggplot(data = CheeseHabit, aes(x = "", fill = Response)) +
  geom_bar() +
  coord_polar(theta = "y") +
  # The legend title is set once here rather than as NULL and then overridden
  # by a separate guides() call.
  scale_fill_brewer(
    name = "Frequency of Milk Cheese Consumption",
    palette = "Pastel2"
  ) +
  theme_void(base_size = 10)