Sun only provided top SNPs for their most significant results in
their Supplemental Table 9 (ST9), not for those of varying P thresholds.
But I downloaded the all the primary pQTL files for the EUR discovery
set (n = 2940). These are about 1.5 TB compressed
(find /Users/charleenadams/ukbppp -type f -name "*.tar" -exec du -ch {} + | grep total).
So, I wrote a loop to sequentially decompress each extracting chromosome
22, appending the name of the assay/protein as a variable called “Assay”
and then deleting the uncompressed folder after extracting what we want.
I then also saved the header and concatenated the results.
The script below actually tries to extract based on p-value (log10P>=7.3; p<5E-8), but that part didn’t work. So, I will extract the ones we want in R.
#!/bin/bash
# Define variables
TAR_DIR="/Users/charleenadams/ukbppp"
SNP_ID="22:30529631:C:A:imp:v1"
LOG10P_THRESHOLD=7.3
OUTPUT_FILE="/Users/charleenadams/snp_extraction_results.txt"
LOG_FILE="/Users/charleenadams/test_extraction.log.txt"
HEADER="CHROM\tGENPOS\tID\tALLELE0\tALLELE1\tA1FREQ\tINFO\tN\tTEST\tBETA\tSE\tCHISQ\tLOG10P\tEXTRA\tAssay"
# Initialize the output and log files
echo -e "$HEADER" > "$OUTPUT_FILE"
echo "Extraction Log" > "$LOG_FILE"
# Function to process each tar file
process_tar_file() {
tar_file="$1"
# Extract the directory name from the tar file
base_name=$(basename "$tar_file" .tar)
untarred_dir="$TAR_DIR/$base_name"
# Log processing
echo "Processing tar file: $tar_file" | tee -a "$LOG_FILE"
# Check if the tar file exists
if [[ -f "$tar_file" ]]; then
# Untar the file into the directory
tar -xf "$tar_file" -C "$TAR_DIR" || { echo "Failed to untar: $tar_file" | tee -a "$LOG_FILE"; return; }
# Dynamically locate the chromosome 22 file in the untarred directory
chr22_gz=$(find "$untarred_dir" -type f -name "discovery_chr22_*.gz")
# Check if the chromosome 22 file exists
if [[ -n "$chr22_gz" ]]; then
echo "Processing file: $chr22_gz" | tee -a "$LOG_FILE"
# Search for the SNP, filter by LOG10P, and add the Assay column
zgrep "$SNP_ID" "$chr22_gz" | awk -v threshold="$LOG10P_THRESHOLD" -v assay="$base_name" '
BEGIN { OFS="\t" }
{
if ($NF >= threshold) print $0, assay
}' >> "$OUTPUT_FILE"
else
echo "Chromosome 22 file not found in: $untarred_dir" | tee -a "$LOG_FILE"
fi
# Clean up the untarred directory to save space
rm -rf "$untarred_dir"
else
echo "Tar file not found: $tar_file" | tee -a "$LOG_FILE"
fi
}
# Export function and variables for parallel execution
export -f process_tar_file
export TAR_DIR SNP_ID LOG10P_THRESHOLD OUTPUT_FILE LOG_FILE
# Run the script in parallel for all tar files
find "$TAR_DIR" -type f -name "*.tar" | parallel -j 4 process_tar_file
Does this look like what you and Rob had in mind?
I notice three things:
All the betas are negative.
Yes, we picked up more than the three chemokines with the less-stringent threshold. Is there a relationship between these 12 proteins?
The SNP is quite common.
library(DT)
library(data.table)
# Read the file as raw lines
lines <- readLines("/Users/charleenadams/snp_extraction_results.txt")
# Remove any leading or trailing whitespace
lines <- trimws(lines)
# Split the lines into fields by whitespace
split_lines <- strsplit(lines, split = "\\s+")
# Convert the split lines to a data frame
snp_data <- do.call(rbind, split_lines)
# Convert to a data frame with proper column names
snp_data <- as.data.frame(snp_data, stringsAsFactors = FALSE)
# Set the first row as column names
colnames(snp_data) <- snp_data[1, ]
snp_data <- snp_data[-1, ]
# Ensure data types are correct
snp_data[] <- lapply(snp_data, type.convert, as.is = TRUE)
# Extract Protein
snp_data$Protein <- sub("^([^_]*_[^_]*)_.*$", "\\1", snp_data$Assay)
write.csv(snp_data,"/Users/charleenadams/snp_data.csv")
# Add a new column for Pval
snp_data$Pval <- 10^(-snp_data$LOG10P)
snp_data$EXTRA <- NULL
snp_data$TEST <- NULL
snp_data$CHROM <- NULL
snp_data$INFO <-NULL
snp_data$CHISQ <-NULL
# Rename columns
names(snp_data)[names(snp_data) == "GENPOS"] <- "POS38"
names(snp_data)[names(snp_data) == "ID"] <- "ID_CHR_POS37"
# Reorder columns
snp_data <- snp_data[, c(
"Protein", # Place "Protein" as the first column
setdiff(names(snp_data), c("Protein", "Pval", "Assay")), # Keep all other columns except "Protein", "Pval", and "Assay"
"LOG10P", "Pval", # Place "Pval" right after "LOG10P"
"Assay" # Move "Assay" to the last column
)]
snp_data$LOG10P.1 <- NULL
thresh <- snp_data[which(snp_data$LOG10P>=7.3),]
summary(thresh$BETA)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.06991 -0.05523 -0.04734 -0.05023 -0.04187 -0.03737
datatable(thresh,
options = list(
pageLength = 10, # Number of rows per page
autoWidth = TRUE, # Adjust column widths
scrollX = TRUE # Enable horizontal scrolling
),
rownames = FALSE, # Display row names
caption = '22:30529631:C:A:imp:v1 with P<5E-8')