This analysis fine-maps a 1MB region on chr1 for proprotein convertase subtilisin/kexin type 9 (PCSK9) using SuSiE. We focus on a 500KB upstream and downstream of the sentinel SNP (rs11591147) and include only variants with a MAF >0.01. The purpose of the analysis is to obtain a positive control to sanity check if the methods I’m developing reproduce a known signal.
Direct quote from the synapse the UK Biobank Pharma Proteomics Project (UKB-PPP) website:
The UK Biobank Pharma Proteomics Project (UKB-PPP) is a collaboration between the UK Biobank (UKB) and thirteen biopharmaceutical companies characterising the plasma proteomic profiles of 54,219 UKB participants. As part of a collaborative analysis across the thirteen UKB-PPP partners, we conducted comprehensive protein quantitative trait loci (pQTL) mapping of 2,923 proteins that identifies 14,287 primary genetic associations, of which 85% are newly discovered, in addition to ancestry-specific pQTL mapping in non-Europeans. We identify independent secondary associations in 87% of cis and 30% of trans loci, expanding the catalogue of genetic instruments for downstream analyses.
NB: My understanding is that a larger list of UKB-PPP pQTL summary statistics are available somewhere. (Certainly a larger list of individual-level UKB protein measurements are available via UKB with application access.)
The UKB-PPP pQTL summary statistics are available via synapse for the “discovery” UKB EUR population. To download, it requires creating a synapse account, obtaining a personal access token, and adding files to a download list.
cd /Users/charleenadams/temp_BI/susie_pcsk9
conda create -n synapse_env -c conda-forge mamba
conda activate synapse_env
pip install synapseclient
synapse -h
synapse get-download-list
synapse login "cda...." eyJ0eXAiOiJKV1QiLCJraWQiOiJXN05OOldMSlQ6SjVSSzpMN1RMOlQ3TDc6M1ZYNjpKRU9VOjY0NFI6VTNJWDo1S1oyOjdaQ0s6RlBUSCIsImFsZyI6IlJTMjU2In0.eyJhY2Nlc3MiOnsic2NvcGUiOlsidmlldyIsImRvd25sb2FkIiwibW9kaWZ5Il0sIm9pZGNfY2xhaW1zIjp7fX0sInRva2VuX3R5cGUiOiJQRVJTT05BTF9BQ0NFU1NfVE9LRU4iLCJpc3MiOiJodHRwczovL3JlcG8tcHJvZC5wcm9kLnNhZ2ViYXNlLm9yZy9hdXRoL3YxIiwiYXVkIjoiMCIsIm5iZiI6MTcyOTcwNjgzOSwiaWF0IjoxNzI5NzA2ODM5LCJqdGkiOiIxMzA3MyIsInN1YiI6IjM0NzI3ODMifQ.B65yULl25FZ1vmFOXEE6RaNYOhXTXZ5ov5JLVP47nJpejRA6oMfcwOcY1z4RH7bLFQ48xzRhc1I8kSbexid8jpoKnQJlBI8mVNItvv92dZNDrNN_8cvXtHNLVb6O12jwIZ0XDuQxcLuxm0kF52IZSXWMtwBzBeyKckkNYL9LTbQKDmFRmOpwZGVjkZyT7wsiuLLjj9bTcMfFcY-IstD7xnK4O51D_UejK6vg-Eh0Uw23i4NHhMAA02-D-tZq1yV0X8yI0EIOFoLQ3AKtXgrr4sE6zDGz4KTPaqVM557Vv9Aqk2KtnfiZRSWWxWYPbJm1Ty8AYLBDv6Iq-tdAval8kw
Maps are provided per chromosome.
In terminal window, look at the rsid map for chromosome 1
zless - olink_rsid_map_mac5_info03_b0_7_chr1_patched_v2.tsv.gz
ID REF ALT rsid POS19 POS38
1:10177:A:AC:imp:v1 A AC rs367896724 10177 10177
1:10352:T:TA:imp:v1 T TA rs201106462 10352 10352
1:10511:G:A:imp:v1 G A rs534229142 10511 10511
grep "PCSK9" olink_protein_map_3k_v1.tsv
PCSK9:Q8NBP7:OID20235:v1 Proprotein convertase subtilisin/kexin type 9 OID20235 Q8NBP7 PCSK9 Cardiometabolic B04413 Q8NBP7 PCSK9 ENSG0000016917455039447 55064852 1 1:100 C no
grep "PCSK9" olink_protein_map_1.5k_v1.tsv
PCSK9:Q8NBP7:OID20235:v1 Proprotein convertase subtilisin/kexin type 9 OID20235 Q8NBP7 PCSK9 Cardiometabolic B04413 Q8NBP7 PCSK9 ENSG0000016917455039447 55064852 1 1:100 C
tar -xf PCSK9_Q8NBP7_OID20235_v1_Cardiometabolic.tar
cd PCSK9_Q8NBP7_OID20235_v1_Cardiometabolic
zless discovery_chr1_PCSK9:Q8NBP7:OID20235:v1:Cardiometabolic.gz
CHROM GENPOS ID ALLELE0 ALLELE1 A1FREQ INFO N TEST BETA SE CHISQ LOG10P EXTRA
1 17641 1:17641:G:A:imp:v1 G A 0.000857406 0.868653 34090 ADD 0.0543698 0.134404 0.16364 0.163785 NA
1 55057 1:55057:A:G:imp:v1 A G 0.000905533 0.722233 34090 ADD -0.177095 0.143424 1.52463 0.6637 NA
1 101551 1:101551:T:C:imp:v1 T C 0.000760446 0.771951 34090 ADD -0.185624 0.151463 1.50195 0.656845 NA
mapch1=fread("/Users/charleenadams/temp_BI/susie_pcsk9/olink_rsid_map_mac5_info03_b0_7_chr1_patched_v2.tsv")
psck9ch1=fread("/Users/charleenadams/temp_BI/susie_pcsk9/PCSK9_Q8NBP7_OID20235_v1_Cardiometabolic/discovery_chr1_PCSK9:Q8NBP7:OID20235:v1:Cardiometabolic")
merged_data <- merge(mapch1, psck9ch1, by = "ID")
write.table(merged_data, "/Users/charleenadams/temp_BI/susie_pcsk9/PCSK9_Q8NBP7_OID20235_v1_Cardiometabolic/merged_chr1_map.txt",
row.names = FALSE, col.names = TRUE, quote = FALSE, sep = '\t')
merged_data <- fread("/Users/charleenadams/temp_BI/susie_pcsk9/PCSK9_Q8NBP7_OID20235_v1_Cardiometabolic/merged_chr1_map.txt")
# Convert LOG10P to regular p-values
merged_data$Pval <- 10^(-merged_data$LOG10P)
# Sort by the newly created Pval column to find the SNP with the smallest p-value
sorted_data <- merged_data[order(merged_data$Pval), ]
# Display the SNP with the smallest p-value (which will be the first row after sorting)
head(sorted_data, 1)
## ID REF ALT rsid POS19 POS38 CHROM
## <char> <char> <char> <char> <int> <int> <int>
## 1: 1:55505647:G:T:imp:v1 G T rs11591147 55505647 55039974 1
## GENPOS ALLELE0 ALLELE1 A1FREQ INFO N TEST BETA SE
## <int> <char> <char> <num> <num> <int> <char> <num> <num>
## 1: 55039974 G T 0.0176885 1.00225 34090 ADD -1.15328 0.0277856
## CHISQ LOG10P EXTRA Pval
## <num> <num> <lgcl> <num>
## 1: 1722.8 375.816 NA 0
# Identify the SNP of interest
snp_of_interest <- sorted_data %>% filter(rsid == "rs11591147")
# Extract position and chromosome
snp_position <- snp_of_interest$POS38[1]
chromosome <- snp_of_interest$CHROM[1]
# Define 500KB upstream and downstream boundaries
upstream_limit <- snp_position - 500000
downstream_limit <- snp_position + 500000
# Subset data within the 500KB window
subset_data <- sorted_data %>%
filter(CHROM == chromosome &
POS38 >= upstream_limit &
POS38 <= downstream_limit)
# Save the subset data
write.table(subset_data, "/Users/charleenadams/temp_BI/susie_pcsk9/500kb_up_and_downstream_sentinel_PCSK9.txt",
row.names = FALSE, col.names = TRUE, quote = FALSE, sep = '\t')
write.csv(subset_data, "/Users/charleenadams/temp_BI/susie_pcsk9/500kb_up_and_downstream_sentinel_PCSK9.csv")
500KB up and downstream of sentinel SNP on chr1
NB: I want a simpler and more consistent convention for naming files and folders….
# Location of 1000 Genomes vcf for EUR
cd /Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur
# Create the bfiles with 0.01 MAF threshold as our sentinel's MAF is between 0.01 and 0.05
nano creat_bfiles_from_vcf_MAF01.sh
tabix -p vcf ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased_annotated_EUR.vcf.gz
chmod +x create_bfiles_from_vcf_MAF01.sh
nohup ./create_bfiles_from_vcf_MAF01.sh > create_bfiles_from_vcf_MAF01_log.txt 2>&1 &
# Verfiy the bfiles were generated
cd
/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/eur_bfiles_MAF01
ls
ALL.chr1.maf01_filtered_EUR.bed ALL.chr1.maf01_filtered_EUR.bim ALL.chr1.maf01_filtered_EUR.fam ALL.chr1.maf01_filtered_EUR.log
# Navigate to the directory where I'd created the matrix for MAF 0.05 for chr10 and make one there for ch1 for maf01 in both square and df formats
cd ..
pwd
/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered
nano make_eur_filtered_matrix_maf01.sh
nano make_eur_filtered_matrix_maf01_df.sh
nohup ./make_eur_filtered_matrix_maf01.sh > make_eur_filtered_matrix_maf01_log.txt 2>&1 &
nohup ./make_eur_filtered_matrix_maf01_df.sh > make_eur_filtered_matrix_maf01_df_log.txt 2>&1 &
# Verify the matrices are there and didn't populate with NaNs: they are good
cd
/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/eur_bfiles_MAF01/chr1_ld_matrixMAF01
ls
LD_matrix_chr1_54539974_55539974_maf01_filtered_EUR2.log
LD_matrix_chr1_54539974_55539974_maf01_filtered_EUR2_df.log
LD_matrix_chr1_54539974_55539974_maf01_filtered_EUR2.phased.vcor1
LD_matrix_chr1_54539974_55539974_maf01_filtered_EUR2_df.vcor
LD_matrix_chr1_54539974_55539974_maf01_filtered_EUR2.phased.vcor1.vars
# Location for square LD matrix and SNP list
ld_matrix_file1 <- "/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/eur_bfiles_MAF01/chr1_ld_matrixMAF01/LD_matrix_chr1_54539974_55539974_maf01_filtered_EUR2.phased.vcor1"
snp_list_file1 <- "/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/eur_bfiles_MAF01/chr1_ld_matrixMAF01/LD_matrix_chr1_54539974_55539974_maf01_filtered_EUR2.phased.vcor1.vars"
# Load square LD matrix and SNP list
ld_matrix1 <- as.matrix(read.table(ld_matrix_file1, header = FALSE, stringsAsFactors = FALSE))
snp_list1 <- read.table(snp_list_file1, header = FALSE, stringsAsFactors = FALSE)$V1
rownames(ld_matrix1) <- colnames(ld_matrix1) <- snp_list1
# Ensure LD matrix is symmetric
if (!isSymmetric(ld_matrix1)) {
ld_matrix1 <- (ld_matrix1 + t(ld_matrix1)) / 2
}
# Load LD dataframe for MAJ_A annotation
ld_matrix_file_df <- fread("/Users/charleenadams/temp_BI/LD_matrices/1000Genomes_scratch/annotated/eur/eur_bfiles_filtered/eur_bfiles_MAF01/chr1_ld_matrixMAF01/LD_matrix_chr1_54539974_55539974_maf01_filtered_EUR2_df.vcor")
# Read-in the chr1 regional subset for the *PCSK9*
# Convert both datasets to data.table if not already
pcsk9_ch1=read.csv("/Users/charleenadams/temp_BI/susie_pcsk9/500kb_up_and_downstream_sentinel_PCSK9.csv")
dim(pcsk9_ch1)
## [1] 6665 21
setDT(pcsk9_ch1)
setDT(ld_matrix_file_df)
# Remove duplicates based on ID_A, keeping the first occurrence
ld_matrix_file_df2_unique <- ld_matrix_file_df %>%
distinct(ID_A, .keep_all = TRUE)
# Merge with subset_data on rsid and ID_A
plink_merged_data <- pcsk9_ch1 %>%
inner_join(ld_matrix_file_df2_unique, by = c("rsid" = "ID_A"))
# colnames(plink_merged_data)
# dim(plink_merged_data)
# Create a column to indicate mismatched alleles
plink_merged_data <- plink_merged_data %>%
mutate(
mismatch = MAJ_A != ALLELE0
)
table(plink_merged_data$mismatch)
##
## FALSE TRUE
## 2547 861
# Add variable for mismatched MAJ_A and ALLELE0
mismatches=plink_merged_data[which(plink_merged_data$mismatch=="TRUE"),]
#head(mismatches)
# Look at the A1FREQ for the mismatches: most are the reference allele...
summary(mismatches$A1FREQ)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002226 0.625754 0.739411 0.723622 0.809163 0.993406
plink_merged_data_copy <- copy(plink_merged_data) # retain original for sanity checking
# Harmonize alleles where there is a mismatch in the copied data
plink_merged_data[mismatch == TRUE, `:=`(
# Swap ALLELE0 and ALLELE1 if there is a mismatch
ALLELE0 = ALLELE1,
ALLELE1 = ALLELE0,
# Flip the BETA sign if there is a mismatch
BETA = -BETA
)]
# Check the results to verify that mismatches are resolved
print(plink_merged_data[mismatch == TRUE])
## X ID REF ALT rsid POS19 POS38
## <int> <char> <char> <char> <char> <int> <int>
## 1: 3 1:55521313:G:T:imp:v1 G T rs472495 55521313 55055640
## 2: 4 1:55521109:G:A:imp:v1 G A rs693668 55521109 55055436
## 3: 5 1:55519174:G:A:imp:v1 G A rs499883 55519174 55053501
## 4: 8 1:55521242:T:G:imp:v1 T G rs471705 55521242 55055569
## 5: 10 1:55520212:G:A:imp:v1 G A rs1165287 55520212 55054539
## ---
## 857: 6614 1:55125417:T:C:imp:v1 T C rs652245 55125417 54659744
## 858: 6622 1:55135161:A:G:imp:v1 A G rs484251 55135161 54669488
## 859: 6624 1:55271495:G:T:imp:v1 G T rs2077061 55271495 54805822
## 860: 6656 1:55921073:A:G:imp:v1 A G rs12743014 55921073 55455400
## 861: 6663 1:55354302:C:A:imp:v1 C A rs3170766 55354302 54888629
## CHROM GENPOS ALLELE0 ALLELE1 A1FREQ INFO N TEST BETA
## <int> <int> <char> <char> <num> <num> <int> <char> <num>
## 1: 1 55055640 T G 0.650489 0.996690 34090 ADD -1.57226e-01
## 2: 1 55055436 A G 0.649923 0.995948 34090 ADD -1.56640e-01
## 3: 1 55053501 A G 0.613184 0.960763 34090 ADD -1.54644e-01
## 4: 1 55055569 G T 0.640686 0.998569 34090 ADD -1.48991e-01
## 5: 1 55054539 A G 0.640543 0.989352 34090 ADD -1.49169e-01
## ---
## 857: 1 54659744 C T 0.755154 0.983153 34090 ADD -1.49105e-04
## 858: 1 54669488 G A 0.719152 0.975224 34090 ADD 1.23720e-04
## 859: 1 54805822 T G 0.609815 0.990566 34090 ADD -1.07789e-04
## 860: 1 55455400 G A 0.545352 0.993168 34090 ADD 2.55647e-05
## 861: 1 54888629 A C 0.583485 0.995522 34090 ADD 6.02401e-06
## SE CHISQ LOG10P EXTRA Pval #CHROM_A POS_A
## <num> <num> <num> <lgcl> <num> <int> <int>
## 1: 0.00770493 4.16401e+02 9.18292e+01 NA 1.481836e-92 1 55055640
## 2: 0.00770475 4.13320e+02 9.11585e+01 NA 6.942246e-92 1 55055436
## 3: 0.00768205 4.05241e+02 8.93999e+01 NA 3.981988e-90 1 55053501
## 4: 0.00764990 3.79323e+02 8.37576e+01 NA 1.747431e-84 1 55055569
## 5: 0.00768484 3.76779e+02 8.32037e+01 NA 6.256047e-84 1 55054539
## ---
## 857: 0.00860106 3.00526e-04 6.04874e-03 NA 9.861688e-01 1 54659744
## 858: 0.00826730 2.23949e-04 5.21661e-03 NA 9.880602e-01 1 54669488
## 859: 0.00755540 2.03532e-04 4.97176e-03 NA 9.886174e-01 1 54805822
## 860: 0.00740328 1.19243e-05 1.19823e-03 NA 9.972448e-01 1 55455400
## 861: 0.00745639 6.52702e-07 2.80041e-04 NA 9.993554e-01 1 54888629
## MAJ_A CHROM_B POS_B ID_B MAJ_B PHASED_R mismatch
## <char> <int> <int> <char> <char> <num> <lgcl>
## 1: T 1 55056468 rs479910 G 0.930836 TRUE
## 2: A 1 55055569 rs471705 G 0.981177 TRUE
## 3: A 1 55053565 rs521662 T 0.843058 TRUE
## 4: G 1 55055640 rs472495 T 0.981177 TRUE
## 5: A 1 55054592 rs634272 C 0.820034 TRUE
## ---
## 857: C 1 54664069 rs689258 T 0.761521 TRUE
## 858: G 1 54670394 rs2251719 G 0.897987 TRUE
## 859: T 1 54806101 rs757581 G 0.985492 TRUE
## 860: G 1 55455817 rs12124470 T 0.600031 TRUE
## 861: A 1 54888662 rs4927176 T 1.000000 TRUE
# print(head(plink_merged_data))
# print(head(plink_merged_data_copy))
# Create a column to indicate mismatched alleles have been resolved
plink_merged_data <- plink_merged_data %>%
mutate(
mismatch_resolved = MAJ_A != ALLELE0
)
table(plink_merged_data$mismatch_resolved)
##
## FALSE TRUE
## 3407 1
unresolved=plink_merged_data[which(plink_merged_data$mismatch_resolved=="TRUE"),]
unresolved
## X ID REF ALT rsid POS19 POS38 CHROM
## <int> <char> <char> <char> <char> <int> <int> <int>
## 1: 5810 1:55723635:A:T:imp:v1 A T rs2802860 55723635 55257962 1
## GENPOS ALLELE0 ALLELE1 A1FREQ INFO N TEST BETA
## <int> <char> <char> <num> <num> <int> <char> <num>
## 1: 55257962 T A 0.00222606 0.681933 34090 ADD -0.0287456
## SE CHISQ LOG10P EXTRA Pval #CHROM_A POS_A MAJ_A
## <num> <num> <num> <lgcl> <num> <int> <int> <char>
## 1: 0.0942173 0.0930852 0.11902 NA 0.7602913 1 55257962 C
## CHROM_B POS_B ID_B MAJ_B PHASED_R mismatch mismatch_resolved
## <int> <int> <char> <char> <num> <lgcl> <lgcl>
## 1: 1 55258144 rs10888907 A 0.59065 TRUE TRUE
# Hot damn, those multi-allelic variants!!!
# Look up the unresolved snp in dbsnp:
# rs2802860 [Homo sapiens]
# Variant type:SNVAlleles:A>C,T [Show Flanks]Chromosome:1:55257962 (GRCh38)
# 1:55723635 (GRCh37)
plink_merged_data_no_mulitalleles <- plink_merged_data[which(plink_merged_data$mismatch_resolved=="FALSE"),]
dim(plink_merged_data_no_mulitalleles)
## [1] 3407 31
# Identify palindromes but keep for now...
plink_merged_data_no_mulitalleles_palindromesID <- plink_merged_data_no_mulitalleles %>%
mutate(
palindromic = (ALLELE1 %in% c("A", "T") & ALLELE0 %in% c("A", "T")) |
(ALLELE1 %in% c("C", "G") & ALLELE0 %in% c("C", "G"))
)
table(plink_merged_data_no_mulitalleles_palindromesID$palindromic)
##
## FALSE TRUE
## 2939 468
# Extract the SNPs from plink_merged_data_no_mulitalleles_palindromesID
snps_to_include <- plink_merged_data_no_mulitalleles_palindromesID$rsid
# Subset ld_matrix1 to include only the SNPs in snps_to_include
# Also ensure the order of the rows and columns follows the order in snps_to_include
ld_matrix1_filtered <- ld_matrix1[snps_to_include, snps_to_include]
dim(ld_matrix1_filtered)
## [1] 3407 3407
str(ld_matrix1_filtered)
## num [1:3407, 1:3407] 1 0.00386 0.00386 0.01769 0.01415 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:3407] "rs11591147" "rs472495" "rs693668" "rs499883" ...
## ..$ : chr [1:3407] "rs11591147" "rs472495" "rs693668" "rs499883" ...
write.table(plink_merged_data_no_mulitalleles_palindromesID,
"/Users/charleenadams/temp_BI/susie_pcsk9/PCSK9_Q8NBP7_OID20235_v1_Cardiometabolic/plink_merged_data_no_mulitalleles_palindromesID.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = '\t')
# SuSiE needs a single value for the sample size:
summary(plink_merged_data_no_mulitalleles_palindromesID$N)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34090 34090 34090 34090 34090 34090
dataset1QC <- list(
beta = plink_merged_data_no_mulitalleles_palindromesID$BETA,
varbeta = (plink_merged_data_no_mulitalleles_palindromesID$SE)^2,
type = "quant",
snp = plink_merged_data_no_mulitalleles_palindromesID$rsid,
position = plink_merged_data_no_mulitalleles_palindromesID$POS38,
N = plink_merged_data_no_mulitalleles_palindromesID$N[1],
MAF = plink_merged_data_no_mulitalleles_palindromesID$A1FREQ,
LD = ld_matrix1_filtered,
sdY = 1
)
cat("Running SuSiE for PCSK9 ...\n")
## Running SuSiE for PCSK9 ...
susie_trait1QC <- susie_rss(
z = dataset1QC$beta / sqrt(dataset1QC$varbeta),
R = dataset1QC$LD,
n = dataset1QC$N,
var_y = dataset1QC$sdY^2,
L = 10,
verbose = FALSE
)
cat("SuSiE run for Trait 1 completed.\n")
## SuSiE run for Trait 1 completed.
summary(susie_trait1QC)
##
## Variables in credible sets:
##
## variable variable_prob cs
## 1 1.00000000 1
## 4 1.00000000 2
## 6 1.00000000 6
## 135 1.00000000 9
## 68 1.00000000 8
## 3058 1.00000000 3
## 1039 0.99994243 5
## 344 0.99384049 7
## 749 0.54530243 10
## 45 0.40976783 4
## 48 0.33576116 4
## 47 0.25002138 4
## 751 0.22997779 10
## 778 0.16324414 10
## 814 0.02165504 10
##
## Credible sets summary:
##
## cs cs_log10bf cs_avg_r2 cs_min_r2 variable
## 1 Inf 1.0000000 1.0000000 1
## 2 116.331764 1.0000000 1.0000000 4
## 3 25.734170 1.0000000 1.0000000 3058
## 5 20.982245 1.0000000 1.0000000 1039
## 6 139.228980 1.0000000 1.0000000 6
## 7 42.333242 1.0000000 1.0000000 344
## 8 22.979664 1.0000000 1.0000000 68
## 9 109.676113 1.0000000 1.0000000 135
## 4 9.511335 0.9964352 0.9946552 45,47,48
## 10 23.412318 0.9843960 0.9686733 749,751,778,814
# Run also with Chris Wallace's wrapper: `runsusie`
susie_trait1_runsusieQC <- runsusie(dataset1QC, verbose = FALSE)
summary(susie_trait1_runsusieQC)
##
## Variables in credible sets:
##
## variable variable_prob cs
## 1 1.00000000 1
## 4 1.00000000 2
## 6 1.00000000 6
## 135 1.00000000 9
## 68 1.00000000 8
## 3058 1.00000000 3
## 1039 0.99994243 5
## 344 0.99384049 7
## 749 0.54530243 10
## 45 0.40976783 4
## 48 0.33576116 4
## 47 0.25002138 4
## 751 0.22997779 10
## 778 0.16324414 10
## 814 0.02165504 10
##
## Credible sets summary:
##
## cs cs_log10bf cs_avg_r2 cs_min_r2 variable
## 1 Inf 1.0000000 1.0000000 1
## 2 116.331764 1.0000000 1.0000000 4
## 3 25.734170 1.0000000 1.0000000 3058
## 5 20.982245 1.0000000 1.0000000 1039
## 6 139.228980 1.0000000 1.0000000 6
## 7 42.333242 1.0000000 1.0000000 344
## 8 22.979664 1.0000000 1.0000000 68
## 9 109.676113 1.0000000 1.0000000 135
## 4 9.511335 0.9964352 0.9946552 45,47,48
## 10 23.412318 0.9843960 0.9686733 749,751,778,814
# Extract the variables data frame from the SuSiE object
variables_df <- susie_trait1_runsusieQC$variables
if(is.null(variables_df)) {
# This path assumes 'variables' is within the 'sets' list, adjust if necessary
variables_df <- data.frame(
variable = unlist(lapply(susie_trait1_runsusieQC$sets$cs, function(x) x)),
cs = rep(names(susie_trait1_runsusieQC$sets$cs), times = lengths(susie_trait1_runsusieQC$sets$cs)),
variable_prob = susie_trait1_runsusieQC$pip[unlist(susie_trait1_runsusieQC$sets$cs)]
)
}
# Sum 'variable_prob' by 'cs'
sum_by_cs <- aggregate(variable_prob ~ cs, data = variables_df, FUN = sum)
# Print or further use 'sum_by_cs'
print(sum_by_cs)
## cs variable_prob
## 1 L1 1.0000000
## 2 L10 0.9601794
## 3 L2 1.0000000
## 4 L3 1.0000000
## 5 L4 0.9955504
## 6 L5 0.9999424
## 7 L6 1.0000000
## 8 L7 0.9938405
## 9 L8 1.0000000
## 10 L9 1.0000000
# Extract the variable strings for each credible set
susie_variable_strings_pcsk9 <- summary(susie_trait1QC)$cs$variable
susie_variable_strings_pcsk9
## [1] "1" "4" "3058" "1039"
## [5] "6" "344" "68" "135"
## [9] "45,47,48" "749,751,778,814"
# Split the strings by commas and convert them into a numeric vector
susie_variable_indices_pcsk9 <- unlist(strsplit(susie_variable_strings_pcsk9 , ",")) %>% as.numeric()
susie_variable_indices_pcsk9
## [1] 1 4 3058 1039 6 344 68 135 45 47 48 749 751 778 814
# Extract the corresponding SNPs from the dataset using the indices
selected_snps_pcsk9 <- dataset1QC[["snp"]][susie_variable_indices_pcsk9]
# Print the selected SNPs
print(selected_snps_pcsk9)
## [1] "rs11591147" "rs499883" "rs78080497" "rs41294810" "rs12750160"
## [6] "rs112339942" "rs12732125" "rs34324306" "rs2479404" "rs2495491"
## [11] "rs2495489" "rs17109407" "rs4262587" "rs199775702" "rs7538000"
see_snps_in_data_pcsk9=plink_merged_data_no_mulitalleles_palindromesID[which(plink_merged_data_no_mulitalleles_palindromesID$rsid%in%selected_snps_pcsk9),]
# Subset the LD matrix to include only SNPs in all credible sets
ld_matrix_selected_snps_pcsk9 <- ld_matrix1_filtered[selected_snps_pcsk9, selected_snps_pcsk9]
str(ld_matrix_selected_snps_pcsk9)
## num [1:15, 1:15] 1 0.0177 0.2689 0.0786 0.3806 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:15] "rs11591147" "rs499883" "rs78080497" "rs41294810" ...
## ..$ : chr [1:15] "rs11591147" "rs499883" "rs78080497" "rs41294810" ...
From Supplementary Table 16 of Plasma proteomic associations with genetics and health in the UK Biobank
NB: We centered around rs11591147. We found 5 SNPs that match those that Sun found (see below the table).
# Input as a single character string (copy-paste it directly)
sun_cs <- "rs11591147
rs472495, rs693668
rs760981293, rs199714093, rs749008649
rs2479404, rs2495491, rs2495489
rs7552350, rs613855
rs111928762, 1:55701838_ATGAG_A, rs11206517, rs113793186, rs3737810, rs2902875, rs75691641, rs113911544, rs505151, rs66679331, rs67722862, rs72662528, rs67540256, rs66499115, rs58472471, rs67293523, rs72662534, rs12067569, rs17131756, rs58447843, rs67175330, rs7525503, rs17111688, rs56833376, rs61163343, rs67276129, rs1475701, 1:55619453_ATAC_A, rs57357765, rs72911441, rs61176209, rs56235208, rs2270690, rs112339942, rs754615958, rs6697048, rs67171713
1:55515668_CACG_C, rs479832, rs531701, rs557211, rs535471, rs534473, rs41294821"
# Step 1: Replace newlines with commas to standardize the string
sun_cs_cleaned <- gsub("\n", ", ", sun_cs)
# Step 2: Split by comma and trim any extra spaces around the entries
sun_cs_cleaned_final <- trimws(unlist(strsplit(sun_cs_cleaned, ",")))
# View the cleaned result
print(sun_cs_cleaned_final)
## [1] "rs11591147" "rs472495" "rs693668"
## [4] "rs760981293" "rs199714093" "rs749008649"
## [7] "rs2479404" "rs2495491" "rs2495489"
## [10] "rs7552350" "rs613855" "rs111928762"
## [13] "1:55701838_ATGAG_A" "rs11206517" "rs113793186"
## [16] "rs3737810" "rs2902875" "rs75691641"
## [19] "rs113911544" "rs505151" "rs66679331"
## [22] "rs67722862" "rs72662528" "rs67540256"
## [25] "rs66499115" "rs58472471" "rs67293523"
## [28] "rs72662534" "rs12067569" "rs17131756"
## [31] "rs58447843" "rs67175330" "rs7525503"
## [34] "rs17111688" "rs56833376" "rs61163343"
## [37] "rs67276129" "rs1475701" "1:55619453_ATAC_A"
## [40] "rs57357765" "rs72911441" "rs61176209"
## [43] "rs56235208" "rs2270690" "rs112339942"
## [46] "rs754615958" "rs6697048" "rs67171713"
## [49] "1:55515668_CACG_C" "rs479832" "rs531701"
## [52] "rs557211" "rs535471" "rs534473"
## [55] "rs41294821"
# Find SNPs that are in both selected_snps_pcsk9 and sun_cs_cleaned_final
common_snps_adams_sun <- intersect(selected_snps_pcsk9, sun_cs_cleaned_final)
# Print the common SNPs
print(common_snps_adams_sun)
## [1] "rs11591147" "rs112339942" "rs2479404" "rs2495491" "rs2495489"
# Visualize the correlation matrix using corrplot for selected SNPs
corrplot(ld_matrix_selected_snps_pcsk9, method = "color", tl.cex = 0.6, tl.col = "black", is.corr = TRUE)
mtext("LD between SNPs in all credible sets for PCSK9", side = 3, line = 5, cex = 1.2)
# Prepare Manhattan plot
plink_merged_data_no_mulitalleles_palindromesID <- fread("/Users/charleenadams/temp_BI/susie_pcsk9/PCSK9_Q8NBP7_OID20235_v1_Cardiometabolic/plink_merged_data_no_mulitalleles_palindromesID.txt")
# colnames(plink_merged_data_no_mulitalleles_palindromesID)
# Define genome-wide significance threshold
genomewide_sig_threshold <- -log10(5e-8)
# Prepare data for Manhattan plot
# plink_merged_data_no_mulitalleles_palindromesID$LOG10P
# Convert to data frame if necessary
if (is.data.table(plink_merged_data_no_mulitalleles_palindromesID)) {
plink_merged_data_no_mulitalleles_palindromesID_df <- as.data.frame(plink_merged_data_no_mulitalleles_palindromesID)
} else {
plink_merged_data_no_mulitalleles_palindromesID_df <- plink_merged_data_no_mulitalleles_palindromesID
}
# Prepare data for labeling
snps_to_label <- unique(c(selected_snps_pcsk9)) #,
snps_label_data <- plink_merged_data_no_mulitalleles_palindromesID_df %>%
dplyr::filter(rsid %in% snps_to_label) %>%
dplyr::select(rsid, POS38, LOG10P) # , Gene
# Create Manhattan Plot
manhattan_plot <- ggplot(plink_merged_data_no_mulitalleles_palindromesID_df, aes(x = POS38, y = LOG10P)) +
geom_point(aes(color = as.factor(CHROM)), alpha = 0.75, size = 1.3) +
scale_color_manual(values = rep(c("blue", "orange"), length(unique(plink_merged_data_no_mulitalleles_palindromesID_df$CHROM)))) +
labs(
x = "Genomic Position (Chromosome 1)",
y = expression(-log[10](italic(p)-value)),
title = "Manhattan Plot with SuSiE Credible Set SNPs Labeled"
) +
geom_hline(
yintercept = genomewide_sig_threshold,
linetype = "dotted",
color = "red",
linewidth = 1
) +
theme_minimal() +
theme(
legend.position = "none",
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
) +
geom_text_repel(
data = snps_label_data,
aes(label = paste(rsid, sep = " - ")),#Gene,
nudge_y = 1,
size = 3,
color = "black",
max.overlaps = Inf,
box.padding = 0.5,
point.padding = 0.5,
segment.color = "grey50"
)
# Display the plot
print(manhattan_plot)
# Save the plot
output_dir <- "/Users/charleenadams/temp_BI/susie_pcsk9/PCSK9_Q8NBP7_OID20235_v1_Cardiometabolic/figures"
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}
ggsave(
filename = file.path(output_dir, "pcsk9_susie_manhattan.pdf"),
plot = manhattan_plot,
width = 10,
height = 6,
dpi = 300
)