Contents

suppressPackageStartupMessages({
    library(dplyr)
    library(dbgap2x)
})

1 dbGaP data

1.1 Query

  1. Study Disease/Focus: Atrial Fibrillation, Cardiomyopathies, Cardiomyopathy (Dilated), Cardiovascular Disease, Cardiovascular Diseases, Coronary Artery Disease, Coronary Disease, Heart Conduction System, Heart Defects (Congenital), Heart Diseases, Heart Septal Defects, Hypertension, Hypertension (Pulmonary), Myocardial Infarction, Myocardial Revascularization, Stroke, Venous Thromboembolism, Venous Thrombosis

  2. Study Molecular Data type: SNP/CNV Genotypes (NGS), SNP Genotypes (Array), SNP Genotypes (NGS), SNP Genotypes (imputed), SNP Genotypes (PCR), Legacy Genotypes

1.2 Find parent study

# Downloaded
x = read.csv("~/data2/BioDataCatalyst/genotype_171.csv")

# Pull the study accession ("phs" followed by six digits) out of the Study
# column; rows whose Study text contains no accession get NA.
study = stringr::str_extract(x$Study, "phs\\d{6}")
x$studyId = study

# Mark the 'parent' study
# Create the column up front. Assigning x$parent[i] into a column that does not
# exist yet recycles the first value into every row, so a run that stops midway
# would leave plausible-looking but wrong parents in the rows not yet visited.
x$parent = rep(NA_character_, nrow(x))
for (i in seq_along(x$studyId)) {
    id = x$studyId[i]

    if (is.na(id)) {
        # No accession could be extracted for this row; leave its parent as NA.
        next
    }

    if (is.parent(id)) {
        # A study that is itself a parent is recorded as its own parent.
        x$parent[i] = id
    } else {
        # Only the first element returned by parent.study() is kept.
        x$parent[i] = parent.study(id)[1]
    }
}

# Save
write.csv(x, "~/data2/BioDataCatalyst/genotype_171_annotated.csv", row.names = FALSE)

1.3 Unique parent studies

# One row per distinct (studyId, parent) pair, so a study listed several times
# in the export collapses to a single row.
y = unique(x[, c("studyId", "parent")])  # merge multiple studyId
y$parent = gsub("'", "", y$parent)  # drop single quotes from the parent IDs
# Count parents by bare accession. The parent column can hold both bare
# accessions ("phsNNNNNN") and versioned IDs ("phsNNNNNN.vX.pY"); counting the
# raw strings would treat those two spellings of one study as two studies.
length(unique(stringr::str_extract(y$parent, "phs\\d{6}"))) # the number of unique parent studies
## [1] 77

2 dbGaP data subset

2.1 More than 10,000 subjects

# Downloaded
x = read.csv("~/data2/BioDataCatalyst/genotype_21.csv")

# Pull the study accession ("phs" followed by six digits) out of the Study
# column; rows whose Study text contains no accession get NA.
study = stringr::str_extract(x$Study, "phs\\d{6}")
x$studyId = study

# Mark the 'parent' study
# Create the column up front. Assigning x$parent[i] into a column that does not
# exist yet recycles the first value into every row, so a run that stops midway
# would leave plausible-looking but wrong parents in the rows not yet visited.
x$parent = rep(NA_character_, nrow(x))
for (i in seq_along(x$studyId)) {
    id = x$studyId[i]

    if (is.na(id)) {
        # No accession could be extracted for this row; leave its parent as NA.
        next
    }

    if (is.parent(id)) {
        # A study that is itself a parent is recorded as its own parent.
        x$parent[i] = id
    } else {
        # Only the first element returned by parent.study() is kept.
        x$parent[i] = parent.study(id)[1]
    }
}

# One row per distinct (studyId, parent) pair, so a study listed several times
# in the export collapses to a single row.
y = unique(x[, c("studyId", "parent")])  # merge multiple studyId
# NOTE(review): this removes a space followed by a single quote, whereas the
# equivalent step in section 1.3 removes every single quote -- confirm which
# pattern is intended.
y$parent = gsub(" '", "", y$parent)
# Count parents by bare accession so that a bare ID ("phsNNNNNN") and a
# versioned ID ("phsNNNNNN.vX.pY") of the same study are not counted twice.
length(unique(stringr::str_extract(y$parent, "phs\\d{6}"))) # the number of unique parent studies
## [1] 7
unique(y$parent)
## [1] "phs000280.v6.p1"   "phs000007.v30.p11" "phs000810.v1.p1"  
## [4] "phs000925"         "phs001211"         "phs001237"        
## [7] "phs001644"

2.2 Participants

The number of participants in each consent group of each parent study (183,077 in total, summing the TOTAL rows of the seven studies).

# Reduce every parent ID to its bare accession first, then de-duplicate.
# De-duplicating only before the truncation would let a study that appears both
# as "phsNNNNNN" and as a versioned ID be queried and printed twice.
a = unique(stringr::str_extract(unique(y$parent), "phs\\d{6}"))

# For each accession, print the ID followed by the table returned by n.pop().
for (study_id in a) {
    pop = n.pop(study_id)
    print(study_id)
    print(pop)
}
## [1] "phs000280"
##   consent_group  male female total
## 1       HMB-IRB  7055   8621 15682
## 2       HMB-IRB  7018   8586 15610
## 3    DS-CVD-IRB    35     33    68
## 4         TOTAL 14108  17240 31360
## [1] "phs000007"
##     consent_group  male female total
## 1     HMB-IRB-MDS  8964   9292 18267
## 2     HMB-IRB-MDS  6200   6926 13126
## 3 HMB-IRB-NPU-MDS   842   1169  2011
## 4           TOTAL 16006  17387 33404
## [1] "phs000810"
##   consent_group  male female total
## 1           HMB  5466   7780 13296
## 2       HMB-NPU  1448   2233  3681
## 3           HMB  3839   5375  9214
## 4         TOTAL 10753  15388 26191
## [1] "phs000925"
##   consent_group male female total
## 1           GRU 4949   7955 13245
## 2           GRU 4864   7881 13067
## 3         TOTAL 9813  15836 26312
## [1] "phs001211"
##   consent_group  male female total
## 1       HMB-IRB  6110   7436 13546
## 2       HMB-IRB  6083   7403 13486
## 3    DS-CVD-IRB    27     33    60
## 4         TOTAL 12220  14872 27092
## [1] "phs001237"
##   consent_group female total
## 1       HMB-IRB  11357 11357
## 2       HMB-IRB   9271  9271
## 3   HMB-IRB-NPU   2086  2086
## 4         TOTAL  22714 22714
## [1] "phs001644"
##   consent_group male female total
## 1       HMB-NPU 6751   9244 16004
## 2         TOTAL 6751   9244 16004