Introduction

The goal of this project is to scrape survey data from NCBI BioSample pages using rvest and to summarize one common survey question across 8 samples.

Define a function to Scrape a Single Biosample ID

This function takes a numeric NCBI BioSample ID, downloads the corresponding page, extracts the first HTML table, and renames its columns to “Questions” and “Response.” It also adds a SampleID column so each row can be traced back to its sample.

#' Scrape the attribute table from one NCBI BioSample page
#'
#' Downloads the BioSample page for `biosample_id`, reads the first HTML
#' table found on it, names its two columns "Questions" and "Response", and
#' adds a `SampleID` column holding the ID that was requested.
#'
#' @param biosample_id A single BioSample ID (numeric or character).
#' @return The scraped table with columns `Questions`, `Response` and
#'   `SampleID`.
get_gut_survey <- function(biosample_id) {
  stopifnot(length(biosample_id) == 1L, !is.na(biosample_id))

  # paste0() converts numbers with as.character(), which switches to
  # scientific notation for round values (30000000 becomes "3e+07") and would
  # produce a broken URL. Format numeric IDs as plain digits instead.
  id_text <- if (is.numeric(biosample_id)) {
    format(biosample_id, scientific = FALSE, trim = TRUE)
  } else {
    as.character(biosample_id)
  }
  url <- paste0("https://www.ncbi.nlm.nih.gov/biosample/", id_text)

  page <- read_html(url)

  # html_element() yields a "missing" node when nothing matches; fail with a
  # message that names the page rather than an obscure dispatch error.
  table_node <- html_element(page, "table")
  if (inherits(table_node, "xml_missing")) {
    stop("No HTML table found at ", url, call. = FALSE)
  }
  tbl <- html_table(table_node, fill = TRUE)

  # The renaming below is only meaningful for a two-column table.
  if (ncol(tbl) != 2L) {
    stop(
      "Expected a 2-column table at ", url, " but found ", ncol(tbl),
      " column(s)",
      call. = FALSE
    )
  }
  names(tbl) <- c("Questions", "Response")

  # Keep the ID exactly as supplied so rows can be traced after combining.
  tbl$SampleID <- biosample_id

  tbl
}

Test the Function with the Required Sample ID

Scrape the survey for BioSample ID 31280776 and look at the first few rows.

# Fetch the survey for the required BioSample and keep it for inspection.
gut_31280776 <- get_gut_survey(biosample_id = 31280776)

# Preview the first six rows of the scraped table.
head(gut_31280776, n = 6)
## # A tibble: 6 × 3
##   Questions             Response          SampleID
##   <chr>                 <chr>                <dbl>
## 1 dominant hand         I am right handed 31280776
## 2 environmental medium  feces             31280776
## 3 environmental package human-gut         31280776
## 4 host body habitat     UBERON:feces      31280776
## 5 host body mass index  28.9              31280776
## 6 host body product     UBERON:feces      31280776

Download surveys for all 8 sample IDs

The sample IDs run from 31280770 through 31280777. lapply runs the scraping function on each ID, and each element of the resulting list is named by its ID for easy reference.

# The eight consecutive BioSample IDs to download.
ids <- seq.int(from = 31280770L, to = 31280777L)

# Scrape every ID and label each list element with the ID it came from.
gut_list <- setNames(lapply(ids, get_gut_survey), ids)

Identify a common question across all surveys

Summarizing a single question requires one that is present in every data frame, so I take the intersection of the question lists across all data frames. I then view the common questions and pick one.

# Extract the Questions column from every survey, then fold the resulting
# vectors together with intersect() so only questions present in all of
# them remain.
common_questions <- Reduce(
  f = intersect,
  x = lapply(gut_list, `[[`, "Questions")
)

print(common_questions)
##   [1] "dominant hand"                                                                              
##   [2] "environmental medium"                                                                       
##   [3] "environmental package"                                                                      
##   [4] "host body habitat"                                                                          
##   [5] "host body mass index"                                                                       
##   [6] "host body product"                                                                          
##   [7] "host tissue sampled"                                                                        
##   [8] "host height"                                                                                
##   [9] "life stage"                                                                                 
##  [10] "race"                                                                                       
##  [11] "sample name"                                                                                
##  [12] "sample type"                                                                                
##  [13] "sex"                                                                                        
##  [14] "ENA-CHECKLIST"                                                                              
##  [15] "ENA-FIRST-PUBLIC"                                                                           
##  [16] "ENA-LAST-UPDATE"                                                                            
##  [17] "External Id"                                                                                
##  [18] "INSDC center name"                                                                          
##  [19] "INSDC first public"                                                                         
##  [20] "INSDC last update"                                                                          
##  [21] "INSDC status"                                                                               
##  [22] "Submitter Id"                                                                               
##  [23] "acid_reflux"                                                                                
##  [24] "acne_medication"                                                                            
##  [25] "acne_medication_otc"                                                                        
##  [26] "add_adhd"                                                                                   
##  [27] "age_cat"                                                                                    
##  [28] "alcohol_consumption"                                                                        
##  [29] "alcohol_frequency"                                                                          
##  [30] "alcohol_types_beercider"                                                                    
##  [31] "alcohol_types_red_wine"                                                                     
##  [32] "alcohol_types_sour_beers"                                                                   
##  [33] "alcohol_types_spiritshard_alcohol"                                                          
##  [34] "alcohol_types_unspecified"                                                                  
##  [35] "alcohol_types_white_wine"                                                                   
##  [36] "allergic_to_i_have_no_food_allergies_that_i_know_of"                                        
##  [37] "allergic_to_peanuts"                                                                        
##  [38] "allergic_to_shellfish"                                                                      
##  [39] "allergic_to_tree_nuts"                                                                      
##  [40] "allergic_to_unspecified"                                                                    
##  [41] "alzheimers"                                                                                 
##  [42] "antibiotic_history"                                                                         
##  [43] "appendix_removed"                                                                           
##  [44] "artificial_sweeteners"                                                                      
##  [45] "asd"                                                                                        
##  [46] "autoimmune"                                                                                 
##  [47] "beet_frequency"                                                                             
##  [48] "birth_year"                                                                                 
##  [49] "bmi_cat"                                                                                    
##  [50] "bowel_movement_frequency"                                                                   
##  [51] "bowel_movement_quality"                                                                     
##  [52] "breastmilk_formula_ensure"                                                                  
##  [53] "cancer"                                                                                     
##  [54] "cancer_treatment"                                                                           
##  [55] "cardiovascular_disease"                                                                     
##  [56] "cat"                                                                                        
##  [57] "cdiff"                                                                                      
##  [58] "chickenpox"                                                                                 
##  [59] "clinical_condition"                                                                         
##  [60] "consume_animal_products_abx"                                                                
##  [61] "contraceptive"                                                                              
##  [62] "cosmetics_frequency"                                                                        
##  [63] "country_of_birth"                                                                           
##  [64] "country_residence"                                                                          
##  [65] "covid_anxious"                                                                              
##  [66] "covid_chronic_conditions_any_autoimmune_disease"                                            
##  [67] "covid_chronic_conditions_arthritis"                                                         
##  [68] "covid_chronic_conditions_asthma_or_other_lung_problems"                                     
##  [69] "covid_chronic_conditions_atrial_fibrillation_or_atrial_flutter"                             
##  [70] "covid_chronic_conditions_cancer"                                                            
##  [71] "covid_chronic_conditions_diabetes_or_high_blood_sugar"                                      
##  [72] "covid_chronic_conditions_frequent_or_very_bad_headaches"                                    
##  [73] "covid_chronic_conditions_heart_disease__myocardial_infarction"                              
##  [74] "covid_chronic_conditions_heart_problems"                                                    
##  [75] "covid_chronic_conditions_hiv"                                                               
##  [76] "covid_chronic_conditions_hypertension"                                                      
##  [77] "covid_chronic_conditions_immune_disorder"                                                   
##  [78] "covid_chronic_conditions_kidney_problems"                                                   
##  [79] "covid_chronic_conditions_seasonal_allergies"                                                
##  [80] "covid_chronic_conditions_serious_acne_or_skin_problems"                                     
##  [81] "covid_chronic_conditions_serious_stomach_or_bowel_problems"                                 
##  [82] "covid_chronic_conditions_unspecified"                                                       
##  [83] "covid_depressed"                                                                            
##  [84] "covid_family_members_fallen_ill_physically"                                                 
##  [85] "covid_family_members_hospitalized"                                                          
##  [86] "covid_family_members_lost_job"                                                              
##  [87] "covid_family_members_none_of_the_above"                                                     
##  [88] "covid_family_members_passed_away"                                                           
##  [89] "covid_family_members_put_into_self_quarantine_with_symptoms"                                
##  [90] "covid_family_members_put_into_self_quarantine_without_symptoms_eg_due_to_possible_exposure" 
##  [91] "covid_family_members_reduced_ability_to_earn_money"                                         
##  [92] "covid_family_members_unspecified"                                                           
##  [93] "covid_happened_to_you_fallen_ill_physically"                                                
##  [94] "covid_happened_to_you_hospitalized"                                                         
##  [95] "covid_happened_to_you_lost_job"                                                             
##  [96] "covid_happened_to_you_none_of_the_above"                                                    
##  [97] "covid_happened_to_you_put_into_self_quarantine_with_symptoms"                               
##  [98] "covid_happened_to_you_put_into_self_quarantine_without_symptoms_eg_due_to_possible_exposure"
##  [99] "covid_happened_to_you_reduced_ability_to_earn_money"                                        
## [100] "covid_happened_to_you_unspecified"                                                          
## [101] "covid_interest_pleasure"                                                                    
## [102] "covid_left_home"                                                                            
## [103] "covid_level_of_wellbeing"                                                                   
## [104] "covid_likely_exposure_no"                                                                   
## [105] "covid_likely_exposure_unspecified"                                                          
## [106] "covid_likely_exposure_yes_someone_with_medical_diagnosis_but_no_test"                       
## [107] "covid_likely_exposure_yes_someone_with_positive_test"                                       
## [108] "covid_likely_exposure_yes_someone_with_possible_symptoms_but_no_diagnosis_by_doctor"        
## [109] "covid_patient_care"                                                                         
## [110] "covid_patient_care_seven_days"                                                              
## [111] "covid_patient_care_seven_days_confirmed"                                                    
## [112] "covid_quality_of_life"                                                                      
## [113] "covid_rideshare"                                                                            
## [114] "covid_sleep_interference"                                                                   
## [115] "covid_sleep_pattern"                                                                        
## [116] "covid_suspected_positive"                                                                   
## [117] "covid_symptoms_chest_pain_or_tightness_in_chest"                                            
## [118] "covid_symptoms_cough"                                                                       
## [119] "covid_symptoms_diarrhea"                                                                    
## [120] "covid_symptoms_fatigue"                                                                     
## [121] "covid_symptoms_fever"                                                                       
## [122] "covid_symptoms_lack_of_appetitie"                                                           
## [123] "covid_symptoms_loss_of_taste_or_smell"                                                      
## [124] "covid_symptoms_nausea"                                                                      
## [125] "covid_symptoms_shortness_of_breath"                                                         
## [126] "covid_symptoms_sore_throat"                                                                 
## [127] "covid_symptoms_stay_home"                                                                   
## [128] "covid_symptoms_unspecified"                                                                 
## [129] "covid_worried_sleep"                                                                        
## [130] "covid_worrying"                                                                             
## [131] "csection"                                                                                   
## [132] "deodorant_use"                                                                              
## [133] "diabetes"                                                                                   
## [134] "diabetes_type"                                                                              
## [135] "diet_type"                                                                                  
## [136] "dog"                                                                                        
## [137] "drinking_water_source"                                                                      
## [138] "drinks_per_session"                                                                         
## [139] "empo_3"                                                                                     
## [140] "epilepsy_or_seizure_disorder"                                                               
## [141] "exercise_frequency"                                                                         
## [142] "exercise_location"                                                                          
## [143] "fed_as_infant"                                                                              
## [144] "fermented_plant_frequency"                                                                  
## [145] "flossing_frequency"                                                                         
## [146] "flu_vaccine_date"                                                                           
## [147] "frozen_dessert_frequency"                                                                   
## [148] "fruit_frequency"                                                                            
## [149] "fungal_overgrowth"                                                                          
## [150] "gluten"                                                                                     
## [151] "high_fat_red_meat_frequency"                                                                
## [152] "homecooked_meals_frequency"                                                                 
## [153] "host_weight"                                                                                
## [154] "ibd"                                                                                        
## [155] "ibd_diagnosis_refined"                                                                      
## [156] "ibs"                                                                                        
## [157] "kidney_disease"                                                                             
## [158] "lactose"                                                                                    
## [159] "last_move"                                                                                  
## [160] "last_travel"                                                                                
## [161] "level_of_education"                                                                         
## [162] "liver_disease"                                                                              
## [163] "livingwith"                                                                                 
## [164] "lung_disease"                                                                               
## [165] "meat_eggs_frequency"                                                                        
## [166] "mental_illness"                                                                             
## [167] "mental_illness_type_bipolar_disorder"                                                       
## [168] "mental_illness_type_bulimia_nervosa"                                                        
## [169] "mental_illness_type_depression"                                                             
## [170] "mental_illness_type_unspecified"                                                            
## [171] "migraine"                                                                                   
## [172] "milk_cheese_frequency"                                                                      
## [173] "milk_substitute_frequency"                                                                  
## [174] "multivitamin"                                                                               
## [175] "nail_biter"                                                                                 
## [176] "non_food_allergies_beestings"                                                               
## [177] "non_food_allergies_drug_eg_penicillin"                                                      
## [178] "non_food_allergies_pet_dander"                                                              
## [179] "non_food_allergies_poison_ivyoak"                                                           
## [180] "non_food_allergies_sun"                                                                     
## [181] "non_food_allergies_unspecified"                                                             
## [182] "olive_oil"                                                                                  
## [183] "one_liter_of_water_a_day_frequency"                                                         
## [184] "other_supplement_frequency"                                                                 
## [185] "pets_other"                                                                                 
## [186] "pku"                                                                                        
## [187] "plant_protein_frequency"                                                                    
## [188] "pool_frequency"                                                                             
## [189] "poultry_frequency"                                                                          
## [190] "pregnant"                                                                                   
## [191] "prepared_meals_frequency"                                                                   
## [192] "probiotic_frequency"                                                                        
## [193] "ready_to_eat_meals_frequency"                                                               
## [194] "red_meat_frequency"                                                                         
## [195] "roommates_in_study"                                                                         
## [196] "salted_snacks_frequency"                                                                    
## [197] "seafood_frequency"                                                                          
## [198] "seasonal_allergies"                                                                         
## [199] "sibo"                                                                                       
## [200] "skin_condition"                                                                             
## [201] "sleep_duration"                                                                             
## [202] "smoking_frequency"                                                                          
## [203] "softener"                                                                                   
## [204] "specialized_diet_exclude_dairy"                                                             
## [205] "specialized_diet_exclude_refined_sugars"                                                    
## [206] "specialized_diet_fodmap"                                                                    
## [207] "specialized_diet_i_do_not_eat_a_specialized_diet"                                           
## [208] "specialized_diet_kosher"                                                                    
## [209] "specialized_diet_modified_paleo_diet"                                                       
## [210] "specialized_diet_other_restrictions_not_described_here"                                     
## [211] "specialized_diet_paleo_diet_or_primal_diet"                                                 
## [212] "specialized_diet_unspecified"                                                               
## [213] "specialized_diet_weston_price_or_other_low_grain_low_processed_food_diet"                   
## [214] "sugar_sweetened_drink_frequency"                                                            
## [215] "sugary_sweets_frequency"                                                                    
## [216] "teethbrushing_frequency"                                                                    
## [217] "thyroid"                                                                                    
## [218] "tonsils_removed"                                                                            
## [219] "types_of_plants"                                                                            
## [220] "vegetable_frequency"                                                                        
## [221] "vitamin_b_supplement_frequency"                                                             
## [222] "vitamin_d_supplement_frequency"                                                             
## [223] "vivid_dreams"                                                                               
## [224] "weight_change"                                                                              
## [225] "whole_eggs"                                                                                 
## [226] "whole_grain_frequency"
# Survey question to summarise, picked from the entries of common_questions.
chosen_question <- "sex"

Combine responses for the chosen question

Filter each data frame for the chosen question and merge the results into one table.

# From each sample's survey keep only the row for the chosen question,
# reduced to the sample identifier and its answer, then stack the rows.
combined_responses <- bind_rows(
  lapply(gut_list, function(survey) {
    matched <- dplyr::filter(survey, Questions == chosen_question)
    select(matched, SampleID, Response)
  })
)

# Tally how many samples gave each distinct answer.
summary_counts <- count(combined_responses, Response)

print(summary_counts)
## # A tibble: 2 × 2
##   Response     n
##   <chr>    <int>
## 1 female       7
## 2 male         1

Create a Pie Chart of summarized responses

Convert the summary counts into a pie chart using ggplot2.

# Draw a single stacked bar of the counts and wrap it around polar
# coordinates so each response becomes a slice of a pie.
ggplot(
  data = summary_counts,
  mapping = aes(x = "", y = n, fill = Response)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(
    fill = "Response",
    title = paste("Responses to", chosen_question, "Questions")
  )