The goal of this project is to scrape survey data from NCBI BioSample pages using rvest and to summarize one common survey question across 8 samples.
This function takes a numeric NCBI BioSample ID, downloads the corresponding page, extracts the first HTML table, and renames its columns to “Questions” and “Response.” It also adds a SampleID column so each row can be traced back to its sample.
# Download one NCBI BioSample page and return its first HTML table.
#
# Args:
#   biosample_id: A single BioSample ID (numeric or character).
#
# Returns:
#   The first table on the page with its two columns renamed to
#   "Questions" and "Response", plus a "SampleID" column holding
#   `biosample_id` unchanged.
#
# Errors:
#   Stops if `biosample_id` is not a single non-missing value, if the page
#   contains no <table> element, or if the table does not have exactly two
#   columns.
get_gut_survey <- function(biosample_id) {
  stopifnot(length(biosample_id) == 1L, !is.na(biosample_id))

  # paste0() converts doubles with as.character(), which can switch to
  # scientific notation (30000000 -> "3e+07") and yield a wrong URL.
  id_text <- format(biosample_id, scientific = FALSE, trim = TRUE)
  url <- paste0("https://www.ncbi.nlm.nih.gov/biosample/", id_text)

  page <- read_html(url)
  table_node <- html_element(page, "table")
  if (inherits(table_node, "xml_missing")) {
    stop("No <table> element found at ", url, call. = FALSE)
  }

  tbl <- html_table(table_node)
  if (ncol(tbl) != 2L) {
    stop(
      "Expected a 2-column table at ", url, " but found ", ncol(tbl),
      " column(s).",
      call. = FALSE
    )
  }

  names(tbl) <- c("Questions", "Response")
  tbl$SampleID <- biosample_id
  tbl
}
Scrape the survey for BioSample ID 31280776 and look at the first few rows.
# Fetch the survey table for a single sample and preview its first six rows.
gut_31280776 <- get_gut_survey(biosample_id = 31280776)
head(gut_31280776, n = 6L)
## # A tibble: 6 × 3
## Questions Response SampleID
## <chr> <chr> <dbl>
## 1 dominant hand I am right handed 31280776
## 2 environmental medium feces 31280776
## 3 environmental package human-gut 31280776
## 4 host body habitat UBERON:feces 31280776
## 5 host body mass index 28.9 31280776
## 6 host body product UBERON:feces 31280776
Download the surveys for IDs 31280770 through 31280777, using lapply to run the scraper on each one. Each list element is named after its ID for easy reference.
# Consecutive BioSample IDs to scrape.
ids <- seq(from = 31280770, to = 31280777)
# lapply() carries the names of its input over to the result, so naming the
# IDs up front yields a list keyed by sample ID.
gut_list <- lapply(setNames(ids, ids), get_gut_survey)
To summarize a single question, it must be present in every data frame, so I take the intersection of the question names across all data frames. I then print the common questions and pick one.
# Keep only the questions that occur in every sample's table by folding
# intersect() over the per-sample "Questions" columns.
common_questions <- Reduce(
  f = intersect,
  x = lapply(gut_list, `[[`, "Questions")
)
print(common_questions)
## [1] "dominant hand"
## [2] "environmental medium"
## [3] "environmental package"
## [4] "host body habitat"
## [5] "host body mass index"
## [6] "host body product"
## [7] "host tissue sampled"
## [8] "host height"
## [9] "life stage"
## [10] "race"
## [11] "sample name"
## [12] "sample type"
## [13] "sex"
## [14] "ENA-CHECKLIST"
## [15] "ENA-FIRST-PUBLIC"
## [16] "ENA-LAST-UPDATE"
## [17] "External Id"
## [18] "INSDC center name"
## [19] "INSDC first public"
## [20] "INSDC last update"
## [21] "INSDC status"
## [22] "Submitter Id"
## [23] "acid_reflux"
## [24] "acne_medication"
## [25] "acne_medication_otc"
## [26] "add_adhd"
## [27] "age_cat"
## [28] "alcohol_consumption"
## [29] "alcohol_frequency"
## [30] "alcohol_types_beercider"
## [31] "alcohol_types_red_wine"
## [32] "alcohol_types_sour_beers"
## [33] "alcohol_types_spiritshard_alcohol"
## [34] "alcohol_types_unspecified"
## [35] "alcohol_types_white_wine"
## [36] "allergic_to_i_have_no_food_allergies_that_i_know_of"
## [37] "allergic_to_peanuts"
## [38] "allergic_to_shellfish"
## [39] "allergic_to_tree_nuts"
## [40] "allergic_to_unspecified"
## [41] "alzheimers"
## [42] "antibiotic_history"
## [43] "appendix_removed"
## [44] "artificial_sweeteners"
## [45] "asd"
## [46] "autoimmune"
## [47] "beet_frequency"
## [48] "birth_year"
## [49] "bmi_cat"
## [50] "bowel_movement_frequency"
## [51] "bowel_movement_quality"
## [52] "breastmilk_formula_ensure"
## [53] "cancer"
## [54] "cancer_treatment"
## [55] "cardiovascular_disease"
## [56] "cat"
## [57] "cdiff"
## [58] "chickenpox"
## [59] "clinical_condition"
## [60] "consume_animal_products_abx"
## [61] "contraceptive"
## [62] "cosmetics_frequency"
## [63] "country_of_birth"
## [64] "country_residence"
## [65] "covid_anxious"
## [66] "covid_chronic_conditions_any_autoimmune_disease"
## [67] "covid_chronic_conditions_arthritis"
## [68] "covid_chronic_conditions_asthma_or_other_lung_problems"
## [69] "covid_chronic_conditions_atrial_fibrillation_or_atrial_flutter"
## [70] "covid_chronic_conditions_cancer"
## [71] "covid_chronic_conditions_diabetes_or_high_blood_sugar"
## [72] "covid_chronic_conditions_frequent_or_very_bad_headaches"
## [73] "covid_chronic_conditions_heart_disease__myocardial_infarction"
## [74] "covid_chronic_conditions_heart_problems"
## [75] "covid_chronic_conditions_hiv"
## [76] "covid_chronic_conditions_hypertension"
## [77] "covid_chronic_conditions_immune_disorder"
## [78] "covid_chronic_conditions_kidney_problems"
## [79] "covid_chronic_conditions_seasonal_allergies"
## [80] "covid_chronic_conditions_serious_acne_or_skin_problems"
## [81] "covid_chronic_conditions_serious_stomach_or_bowel_problems"
## [82] "covid_chronic_conditions_unspecified"
## [83] "covid_depressed"
## [84] "covid_family_members_fallen_ill_physically"
## [85] "covid_family_members_hospitalized"
## [86] "covid_family_members_lost_job"
## [87] "covid_family_members_none_of_the_above"
## [88] "covid_family_members_passed_away"
## [89] "covid_family_members_put_into_self_quarantine_with_symptoms"
## [90] "covid_family_members_put_into_self_quarantine_without_symptoms_eg_due_to_possible_exposure"
## [91] "covid_family_members_reduced_ability_to_earn_money"
## [92] "covid_family_members_unspecified"
## [93] "covid_happened_to_you_fallen_ill_physically"
## [94] "covid_happened_to_you_hospitalized"
## [95] "covid_happened_to_you_lost_job"
## [96] "covid_happened_to_you_none_of_the_above"
## [97] "covid_happened_to_you_put_into_self_quarantine_with_symptoms"
## [98] "covid_happened_to_you_put_into_self_quarantine_without_symptoms_eg_due_to_possible_exposure"
## [99] "covid_happened_to_you_reduced_ability_to_earn_money"
## [100] "covid_happened_to_you_unspecified"
## [101] "covid_interest_pleasure"
## [102] "covid_left_home"
## [103] "covid_level_of_wellbeing"
## [104] "covid_likely_exposure_no"
## [105] "covid_likely_exposure_unspecified"
## [106] "covid_likely_exposure_yes_someone_with_medical_diagnosis_but_no_test"
## [107] "covid_likely_exposure_yes_someone_with_positive_test"
## [108] "covid_likely_exposure_yes_someone_with_possible_symptoms_but_no_diagnosis_by_doctor"
## [109] "covid_patient_care"
## [110] "covid_patient_care_seven_days"
## [111] "covid_patient_care_seven_days_confirmed"
## [112] "covid_quality_of_life"
## [113] "covid_rideshare"
## [114] "covid_sleep_interference"
## [115] "covid_sleep_pattern"
## [116] "covid_suspected_positive"
## [117] "covid_symptoms_chest_pain_or_tightness_in_chest"
## [118] "covid_symptoms_cough"
## [119] "covid_symptoms_diarrhea"
## [120] "covid_symptoms_fatigue"
## [121] "covid_symptoms_fever"
## [122] "covid_symptoms_lack_of_appetitie"
## [123] "covid_symptoms_loss_of_taste_or_smell"
## [124] "covid_symptoms_nausea"
## [125] "covid_symptoms_shortness_of_breath"
## [126] "covid_symptoms_sore_throat"
## [127] "covid_symptoms_stay_home"
## [128] "covid_symptoms_unspecified"
## [129] "covid_worried_sleep"
## [130] "covid_worrying"
## [131] "csection"
## [132] "deodorant_use"
## [133] "diabetes"
## [134] "diabetes_type"
## [135] "diet_type"
## [136] "dog"
## [137] "drinking_water_source"
## [138] "drinks_per_session"
## [139] "empo_3"
## [140] "epilepsy_or_seizure_disorder"
## [141] "exercise_frequency"
## [142] "exercise_location"
## [143] "fed_as_infant"
## [144] "fermented_plant_frequency"
## [145] "flossing_frequency"
## [146] "flu_vaccine_date"
## [147] "frozen_dessert_frequency"
## [148] "fruit_frequency"
## [149] "fungal_overgrowth"
## [150] "gluten"
## [151] "high_fat_red_meat_frequency"
## [152] "homecooked_meals_frequency"
## [153] "host_weight"
## [154] "ibd"
## [155] "ibd_diagnosis_refined"
## [156] "ibs"
## [157] "kidney_disease"
## [158] "lactose"
## [159] "last_move"
## [160] "last_travel"
## [161] "level_of_education"
## [162] "liver_disease"
## [163] "livingwith"
## [164] "lung_disease"
## [165] "meat_eggs_frequency"
## [166] "mental_illness"
## [167] "mental_illness_type_bipolar_disorder"
## [168] "mental_illness_type_bulimia_nervosa"
## [169] "mental_illness_type_depression"
## [170] "mental_illness_type_unspecified"
## [171] "migraine"
## [172] "milk_cheese_frequency"
## [173] "milk_substitute_frequency"
## [174] "multivitamin"
## [175] "nail_biter"
## [176] "non_food_allergies_beestings"
## [177] "non_food_allergies_drug_eg_penicillin"
## [178] "non_food_allergies_pet_dander"
## [179] "non_food_allergies_poison_ivyoak"
## [180] "non_food_allergies_sun"
## [181] "non_food_allergies_unspecified"
## [182] "olive_oil"
## [183] "one_liter_of_water_a_day_frequency"
## [184] "other_supplement_frequency"
## [185] "pets_other"
## [186] "pku"
## [187] "plant_protein_frequency"
## [188] "pool_frequency"
## [189] "poultry_frequency"
## [190] "pregnant"
## [191] "prepared_meals_frequency"
## [192] "probiotic_frequency"
## [193] "ready_to_eat_meals_frequency"
## [194] "red_meat_frequency"
## [195] "roommates_in_study"
## [196] "salted_snacks_frequency"
## [197] "seafood_frequency"
## [198] "seasonal_allergies"
## [199] "sibo"
## [200] "skin_condition"
## [201] "sleep_duration"
## [202] "smoking_frequency"
## [203] "softener"
## [204] "specialized_diet_exclude_dairy"
## [205] "specialized_diet_exclude_refined_sugars"
## [206] "specialized_diet_fodmap"
## [207] "specialized_diet_i_do_not_eat_a_specialized_diet"
## [208] "specialized_diet_kosher"
## [209] "specialized_diet_modified_paleo_diet"
## [210] "specialized_diet_other_restrictions_not_described_here"
## [211] "specialized_diet_paleo_diet_or_primal_diet"
## [212] "specialized_diet_unspecified"
## [213] "specialized_diet_weston_price_or_other_low_grain_low_processed_food_diet"
## [214] "sugar_sweetened_drink_frequency"
## [215] "sugary_sweets_frequency"
## [216] "teethbrushing_frequency"
## [217] "thyroid"
## [218] "tonsils_removed"
## [219] "types_of_plants"
## [220] "vegetable_frequency"
## [221] "vitamin_b_supplement_frequency"
## [222] "vitamin_d_supplement_frequency"
## [223] "vivid_dreams"
## [224] "weight_change"
## [225] "whole_eggs"
## [226] "whole_grain_frequency"
# Survey question to summarise; compared against the Questions column below.
chosen_question <- "sex"
Filter each data frame for the chosen question and merge the results into one table.
# For every sample keep just the row(s) answering the chosen question,
# reduced to the sample ID and its response, then stack them into one table.
combined_responses <- bind_rows(
  lapply(gut_list, function(survey) {
    matching_rows <- dplyr::filter(survey, Questions == chosen_question)
    select(matching_rows, SampleID, Response)
  })
)

# Tally how many samples gave each distinct response.
summary_counts <- count(combined_responses, Response)
print(summary_counts)
## # A tibble: 2 × 2
## Response n
## <chr> <int>
## 1 female 7
## 2 male 1
Convert the summary counts into a pie chart using ggplot2.
# Pie chart: a single stacked bar (constant x) wrapped onto polar
# coordinates, so each response's count becomes a slice angle.
ggplot(
  data = summary_counts,
  mapping = aes(x = "", y = n, fill = Response)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(
    fill = "Response",
    title = paste("Responses to", chosen_question, "Questions")
  )