#!/bin/sh
DIR=/path/to/directory
WD=$SCRATCH/project
PL=/path/to/PLINK
LO=/path/to/LiftOver
##### This update to build 37 (hg19) is NOT needed in this case since the chip is already in the genomic build
#update to build 37 (http://genome.sph.umich.edu/wiki/LiftOver#Lift_Merlin.2FPLINK_format)
#Generate BED files from plink bim files
$LO/liftOver <(awk '{pos0=$4-1; print "chr"$1,pos0,$4,$2}' $DIR/Project_final.bim) $LO/hg38ToHg19.over.chain.gz $DIR/output.BED $DIR/unlifted.BED
awk '{pos0=$4-1; print "chr"$1,pos0,$4,$2}' $DIR/Project_final.bim:
Uses awk to reformat the PLINK .bim file, outputting:
$LO/liftOver ...: Converts the build 38 coordinates to
build 37 using a chain file (hg38ToHg19)$DIR/output.BED is the result after the genome build
conversion (stores successfully converted loci coordinates in build 37
format)$DIR/unlifted.BED contains any loci that couldn’t be
mapped (stores loci that couldn’t be converted, useful for tracking
missing data)$PL/plink --noweb
--allow-no-sex
--bfile $WD/Project_final
--update-map <(awk '{gsub("chr","",$1); print $4,$1}' $DIR/output.BED)
--update-chr --make-bed --out $DIR/Project_final_b37
--noweb: Suppress web-based notifications or updates
(no longer need for newer version)--allow-no-sex: Allows PLINK to process files where
some individuals have undefined sex (indicated by 0 in the
.fam file). Without this flag, PLINK may terminate with
anerror if it encounters individual with undefined sex--bfile $WD/Project_final: Specifies the base file in
PLINK binary format (.bed/.bim/.fam)--update-map <(...): Updates the SNP position
coordinates in the .bim file using:
gsub("chr","",$1): Removes “chr” from the chromosome
column in output.BEDprint $4,$1: Outputs SNP position with chromosome
number, formatted for PLINK’s --update-map option--update-chr: Updates chromosome numbers as well
(required since position updates alone may not align chromosomes)--make-bed: Saves the result in PLINK binary
format$DIR/Project_final_b37: Specifies the output file
prefix, now in build 37$PL/plink
--noweb
--allow-no-sex
--bfile $DIR/Project_final_b37
--update-map <(awk '{print $4,$3}' $WD/output.BED)
--make-bed --out $DIR/Project_final_b37_b37
--noweb: Suppress web-based notifications or updates
(no longer need for newer version)--allow-no-sex: Allows PLINK to process files where
some individuals have undefined sex (indicated by 0 in the
.fam file). Without this flag, PLINK may terminate with
anerror if it encounters individual with undefined sex--bfile $DIR/Project_final_b37: Uses an already
converted PLINK file from Step 3--update-map <(awk '{print $4,$3}' $WD/output.BED):
Updates the positions using:
$4: The original SNP position$3: The newly mapped position from build 37--make-bed: Saves the output in binary PLINK
format$DIR/Project_final_b37_b37: Specifies the final output
file prefix#!/bin/sh
cd /home/w/wxu/calcium/Hamilton/Rob_hamilton_consortium_data/Post_QC
# skipped this part since I do not have the manifest for this particular chip.
# According to the recors the original chip is in platform HumanOmniExpressExome-8v1 with 585,384 markers (none of the manifests available at illumina match this)
gunzip -c HumanOmniExpressExome-8-v1-2-B.csv.gz | wc -l #964225
#Number of markers in chip (loci count):
gunzip -c HumanOmniExpressExome-8-v1-2-B.csv.gz | head #964193
# 964202=964193+8+1
gunzip -c HumanOmniExpressExome-8-v1-2-B.csv.gz | awk -F"," '(NR>7 && NR<964202){print}' | head
gunzip -c HumanOmniExpressExome-8-v1-2-B.csv.gz | awk -F"," '(NR>8 && NR<964202 && $21=="-"){print $2}' > CO20_negstrand.txt
PL="/home/w/wxu/oespinga/software/plink/plink-1.07-x86_64"
FILE_DIR="$HOME/HumanOmniExpressExome_201506"
$PL/plink --noweb
--bfile $FILE_DIR/CO20_Cutoff_TOP_20150409_FinalReport_QC
--flip CO20_negstrand.txt
--make-bed --out $FILE_DIR/CO20_Cutoff_TOP_20150409_FinalReport_QC_FWD
#! /bin/bash
export PL= /path/to/PLINK
export GTL= /path/to/gtools
export SHP= /path/to/Shapit
# Possible changes to locations:
#FILE_DIR, LOG_D, GTOOL_FDIR, SCRIPT_DIR, FILE_NAME
# Navigate to the directory where the setup files are stored
export LOCAL_DIR= /path/to/local/file
export FILE_DIR= /path/to/postQC file
cd $LOCAL_DIR
# Set the log directory for all output and error files downstream
LOG_D=logs
HAPS_FDIR=haplotypes
mkdir -p $LOG_D
mkdir -p $HAPS_FDIR
export LOG_DIR=${LOCAL_DIR}/${LOG_D}
export OUTPUT_DIR=${LOCAL_DIR}/${HAPS_FDIR}
export SCRIPT_DIR= /path/to/the/code
export FILE_NAME=Rob_Hamilton_final #The filtered file (cleaned + forward aligned)
export REFERENCE_PATH= /path/to/reference panel 1000 genomes
export GMAP_DIR= /path/to/GMAGMAP
######################################################################
# Iterate over all chromosomes and submit jobs through the script file: 1a_run_gtool_CO20.sh
export START_CHR=1
export END_CHR=22
qsub:echo "prepare files for shapeit"
JOB_NAME= /job_name
qsub -o $LOG_DIR/${JOB_NAME}.out \
-e $LOG_DIR/${JOB_NAME}.err \
-v START_CHR,END_CHR,LOCAL_DIR,FILE_DIR,LOG_DIR,FILE_NAME,OUTPUT_DIR,REFERENCE_PATH,GMAP_DIR,PL,GTL,SHP \
-q debug \
-N $JOB_NAME \
$SCRIPT_DIR/1a_run_preparefiles.sh
# Note
# qsub submit a batch job to Grid Engine
# -N the name of the job
# -o the path used for the standard output stream of the job (default job_name.ojob_id format)
# -e the path used for the standard error stream of the job (defaul job_name.ejob_id format)
JOB_NAME: Specifies a name for the job submissionqsub command: Submits the job using the
qsub command in a batch environment (typically used in
high-performance computing environments)
-o $LOG_DIR/${JOB_NAME}.out: Sets the output log file
path-e $LOG_DIR/${JOB_NAME}.err: Sets the error log file
path-v: Passes a list of variables to the job, including
chromosome range (START_CHR to END_CHR), file
directories, tool paths, and reference paths-q debug: Specifies the queue for the job (in this
case, the debug queue, typically used for quick or test runs)-N $JOB_NAME: Sets the job name$SCRIPT_DIR/1a_run_preparefiles.sh: Calls the specific
script (1a_run_preparefiles.sh) to process the files#!/bin/bash
#PBS -l nodes=1:ppn=8
#PBS -l walltime=1:00:00
#!/bin/bash: indicates the script shoudl run in the
Bash shell#PBS -l nodes=1:ppn=8: specifies to the job scheduler
to use 1 node with 8 processors per node#PBS -l walltime=1:00:00: Sets a maximum run time of 1
hourmodule load gnu-parallel
echo Local Directory: $LOCAL_DIR
echo File Directory: $FILE_DIR
echo File Name: $FILE_NAME
echo Output Directory: $OUTPUT_DIR
echo Reference Directory: $REFERENCE_PATH
cd $OUTPUT_DIR
echo Starting Directory: $OUTPUT_DIR
module load gnu-parallel: loads GNU parallel, which
allows for parallel execution of commandsecho: The script then prints out environment variables
to confirm their valuesfunction PLINK_chr {
CHR=$1
$PL/plink
--noweb
--bfile $OUTPUT_DIR/${FILE_NAME}_noduppos
--chr ${CHR}
--make-bed --recode --out $OUTPUT_DIR/${FILE_NAME}_chr${CHR}
}
export -f PLINK_chr
PLINK_chr to separate genetic data
by chromosome (chr) and convert it to PLINK binary format
(--make-bed).export -f PLINK_chr: Exports the function so it can be
used.echo "Remove duplicated positions"
1. Identify duplicates by Chromosome Position:
#duplicated positions
awk '{print $1":"$4}' $FILE_DIR/$FILE_NAME.bim | sort | uniq -c | awk '$1>1{print $2}' > duppos.txt
.bim
file, saving duplicates to duppos.txt2. Separate Duplicates by SNP ID Prefix (rs or non-rs):
#duplicated positions with snp id not starting with rs
join duppos.txt <(awk '{print $1":"$4,$2}' $FILE_DIR/$FILE_NAME.bim | sort -k 1) | awk '{print $2,$1}' | grep -v -e '^rs' > duppos_wo_rs.txt
#duplicated positions with snp id starting with rs
join duppos.txt <(awk '{print $1":"$4,$2}' $FILE_DIR/$FILE_NAME.bim | sort -k 1) | awk '{print $2,$1}' | grep -e '^rs' > duppos_w_rs.txt
rs prefix to
duppos_wo_rs.txtrs prefix to
duppos_w_rs.txt3. Filter out duplicate SNPs by certain criteria:
## remove snps with duplicated positions and the duplicated ones have rs id
join -2 2 <(join -2 2 duppos.txt duppos_w_rs.txt | awk '!seen[$1]++') duppos_wo_rs.txt | awk '{print $3}' | uniq > duppos_snpids_toremove.txt
## remove snps with duplicated positions and are not duplicated with SNPs that have rs id
join -2 2 <(join -v1 -2 2 duppos.txt duppos_w_rs.txt) duppos_wo_rs.txt | awk 'seen[$1]++{print $2}' >> duppos_snpids_toremove.txt
#the complement of above in case ever needed
#join -2 2 <(join -v1 -2 2 duppos.txt duppos_w_rs.txt) duppos_wo_rs.txt | awk '!seen[$1]++' | wc -l
duppos_snpids_toremove.txt accumulates IDs to exclude
from further processing. Here, non-rs duplicates are
prioritized for removal.4. Removal additional duplicats if necessary:
#in case duplicated positions exist within the snp ids starting with rs
extra_dups=`awk '{print $2}' duppos_w_rs.txt | uniq -c | awk '$1>1{print $2}' | wc -l`
if [ $extra_dups -gt 0 ]
then
##if more than one duplicated position appear here, keep one
join <(awk '{print $2}' duppos_w_rs.txt | uniq -c | awk '$1>1{print $2}' | sort) <(awk '{print $1":"$4,$2}' $FILE_DIR/$FILE_NAME.bim | sort -k 1) | awk '{print $2,$1}' | grep -e '^rs' | awk 'seen[$2]++{print $1}' >> duppos_snpids_toremove.txt
fi
rs prefix
(extra_dups), keeping one instance and removing
others.$PL/plink --noweb --bfile $FILE_DIR/$FILE_NAME --exclude duppos_snpids_toremove.txt --make-bed --out $OUTPUT_DIR/${FILE_NAME}_noduppos
$PL/plink --noweb --bfile $OUTPUT_DIR/${FILE_NAME}_noduppos --update-ids <(awk '{print $1,$2,$1"_"$2,$1"_"$2}' $OUTPUT_DIR/${FILE_NAME}.fam) --make-bed --out $OUTPUT_DIR/${FILE_NAME}_noduppos
duppos_snpids_toremove.txt saving the filtered data to
${FILE_NAME}_nodupposecho "Run PLINK to separate files per chromosomes"
seq $START_CHR $END_CHR | parallel -j 8 --joblog $LOG_DIR/1_prephase_plink_dups.ljob PLINK_chr {}
PLINK_chr function in parallel over a range if
chromosomes from $START_CHR to $END_CHR, using
up to 8 jobs concurrently.function Strand_alig {
CHR=$1
$SHP/shapeit -check -B $OUTPUT_DIR/${FILE_NAME}_chr${CHR} --input-ref $REFERENCE_PATH/1000GP_Phase3_chr${CHR}.hap.gz $REFERENCE_PATH/1000GP_Phase3_chr${CHR}.legend.gz $REFERENCE_PATH/1000GP_Phase3.sample --output-log $OUTPUT_DIR/chr${CHR}.alignments
nstrand=`awk '$1=="Strand" && $7==1{print $4}' $OUTPUT_DIR/chr${CHR}.alignments.snp.strand | wc -l`
if [ $nstrand -gt 0 ]
then
$PL/plink --noweb --bfile $OUTPUT_DIR/${FILE_NAME}_chr${CHR} --flip <(awk '$1=="Strand" && $7==1{print $4}' $OUTPUT_DIR/chr${CHR}.alignments.snp.strand) --make-bed --out $OUTPUT_DIR/${FILE_NAME}_chr${CHR}
$SHP/shapeit -check -B $OUTPUT_DIR/${FILE_NAME}_chr${CHR} --input-ref $REFERENCE_PATH/1000GP_Phase3_chr${CHR}.hap.gz $REFERENCE_PATH/1000GP_Phase3_chr${CHR}.legend.gz $REFERENCE_PATH/1000GP_Phase3.sample --output-log $OUTPUT_DIR/chr${CHR}.alignments
#This may not be necessary since the markers that are excluded in the original .strand.exlude file are removed later anyways.
#awk '$1=="Strand" && $7==1{print $3}' $OUTPUT_DIR/chr${CHR}.alignments.snp.strand > $OUTPUT_DIR/chr${CHR}.alignments.snp.strand.exclude
fi
if [ ! -f $OUTPUT_DIR/chr${CHR}.alignments.snp.strand.exclude ]
then
touch $OUTPUT_DIR/chr${CHR}.alignments.snp.strand.exclude
fi
}
export -f Strand_alig
Strand_alig function checks strand alignment by
comparing the chromosome data with reference data from 1000 Genomes
Projectecho "Checking missaligned alleles"
seq $START_CHR $END_CHR | parallel -j 8 --joblog $LOG_DIR/1_prephase_shapeit_align.ljob Strand_alig {}
echo "Finished Checking missaligned alleles"
Strand_align function for each chromosome in
parallel to check and correct strand alignment issues across all
specified chromosomes.# Step 1: Submit Pre-phase
# Set environment variables
PL=/path/to/plink
GTL=/path/to/gtool
SHP=/path/to/shapeit
LOCAL_DIR=/path/to/local_dir
FILE_DIR=/path/to/file_dir
LOG_D=/path/to/logs
HAPS_FDIR=/path/to/haplotypes
# Create directories for logs and haplotypes
mkdir -p $LOG_D
mkdir -p $HAPS_FDIR
# -p option ensures no error is thrown if the directory already existst
-p option prevents errors if they already do).# Loop through chromosomes
for CHR in {1,2,6}; do
qsub -o $LOG_D/2a_run_prephase_chr${CHR}.out \
-e $LOG_D/2a_run_prephase_chr${CHR}.err \
-q batch -l nodes=1:ppn=8,walltime=5:00:00 \
-N 2a_run_prephase_chr${CHR} \
2a_run_prephase.sh $CHR
done
-o and -e Directs output and error logs to
the specified log directory-q batch Runs the job in the batch queue-l nodes=1:ppn=8,walltime=5:00:00 Requests 1 node, 8
processors, and 5 hours of runtime-N Names the job with a chromosome-specific
identifier2a_run_prephase.sh $CHR Runs the
2a_run_prephase.sh script with the chromosome as an
argument.echo "Using PLINK at: $PL"
echo "Using GTOOL at: $GTL"
echo "Using SHAPEIT at: $SHP"
# Prephase data using SHAPEIT
shapeit -B ${FILE_DIR}/chr${CHR} \
-M ${FILE_DIR}/genetic_map_chr${CHR}.txt \
-O ${HAPS_FDIR}/chr${CHR}.haps \
--exclude-snp ${FILE_DIR}/exclude_snp.txt \
--thread 16
${CHR}
-B ${FILE_DIR}/chr${CHR}: Specifies the input PLINK
binary file for the chromosome-M ${FILE_DIR}/genetic_map_chr${CHR}.txt: Uses the
genetic map file for recombination rates for the chromosome-O ${HAPS_FDIR}/chr${CHR}.haps: Set the output files
for haplotype--exclude-snp ${FILE_DIR}/exclude_snp.txt: Excludes
specified SNPs from phasing based on this file--thread 16 Runs the phasing process on 16 threads for
speed.haps file was created:# Check if the .haps file was created
if [[ -s ${HAPS_FDIR}/chr${CHR}.haps ]]; then
echo "Prephasing for chromosome ${CHR} completed successfully."
else
echo "Error: Prephasing for chromosome ${CHR} failed."
fi
.haps file for the chromosome was
created successfully
-s checks if the file exists and is non-empty#! /bin/bash
export IMP2= path/to/Impute2
#Possbile changes to the locations:
# FILE_DIR, OUT_NEW_DIR, SCRIPT_DIR, CHR_END_FILE, LOG_DIR, FILE_NAME
#The home reference directory
export LOCAL_DIR= path/to/local_file
cd $LOCAL_DIR
#The directory with the haplotype files
export FILE_DIR= path/to/local_file/haplotype
#The log directory for all output and error files downstream
export LOG_DIR=$LOCAL_DIR/"logs"
#Directory to put imputed files
OUT_NEW_DIR='imputed_files_hap'
mkdir -p $OUT_NEW_DIR
export OUTPUT_DIR=$LOCAL_DIR/$OUT_NEW_DIR
SCRIPT_DIR= path/to/script
#Imputation panel reference paths (haplotype, genetic map and legend files)
export REFERENCE_PATH= /path/to/reference_1000_genome
#export LEGEND_REFERENCE_PATH=$REFERENCE_PATH/legend_files
export LEGEND_REFERENCE_PATH="$REFERENCE_PATH"
CHR_END_FILE=chr_endfile_hap.txt
#Filename (without chromosome suffixes) of the gtool output files
export FILE_NAME=FILE_NAME_final
#Chromosome end positions for modifying the intervals
### Define some parameters for looping
Impute2 software used for genotype
imputationLOCAL_DIR and navigates to
itFILE_DIR, where haplotype files are
stored, and LOG_DIR, where log files will be savedOUTPUT_DIR for imputed files, making the
directory if it doesn’t already existSCRIPT_DIR and
REFERENCE_PATH, where reference genome files are
locatedFILE_NAME for the output files
and the size of intervals (in base pairs) for splitting data for
imputation# Define analysis interval in kb (i.e. base pairs)
let interval_size=500000
export interval_size
# Iterate through each chromosome (chromosome 22 for testing purposes)
let start_chr=16 end_chr=16
# Number of processes per run
export R=8 #6
# Loop through and submit batches
for chr in 2 16 #`seq $start_chr $end_chr`
do
export chr
export start_interval=1
let CHR_LENGTH=`tail -n 1 <(gunzip -c $LEGEND_REFERENCE_PATH/1000GP_Phase3_chr${chr}.legend.gz) | awk '{print $2}'`
let end_interval="$CHR_LENGTH / $interval_size + 1"
export end_interval
echo $LEGEND_REFERENCE_PATH/1000GP_Phase3_chr${chr}.legend.gz
echo $chr $CHR_LENGTH $end_interval $interval_size
echo $chr $CHR_LENGTH $end_interval $interval_size >> $LOCAL_DIR/$CHR_END_FILE
#echo chr: $chr interval $interval out of $end_interval # prints the current chromosome
start_chr=16,
end_chr=16) and the number of processes per run
Rchr values 2 and 16
for testing)CHR_LENGTH using the last
entry in the chromosome legend fileinterval_size to get
end_interval (total intervals needed for this
chromosome)CHR_END_FILE for trackingJOB_NAME=3_impute2_chr${chr}
qsub -o $LOG_DIR/$JOB_NAME.out -e $LOG_DIR/$JOB_NAME.err -N $JOB_NAME \
-v LOCAL_DIR,FILE_DIR,OUTPUT_DIR,LOG_DIR,REFERENCE_PATH,LEGEND_REFERENCE_PATH,chr,interval_size,FILE_NAME,IMP2,start_interval,end_interval,R \
-q batch -l nodes=1:m32g:ppn=8,walltime=48:00:00 \
$SCRIPT_DIR/3a_run_imputation.sh
done
#Change the queue to which you are submitting : -q debug / -q batch
#Change the amount of resources requested nodes=1:ppn=8,walltime=0:30:00
qsub command)
with output/error files specified in LOG_DIR-l nodes=1:m32g:ppn=8,walltime=48:00:00#! /bin/bash
module load gnu-parallel
echo Local Directory: $LOCAL_DIR
echo File Directory: $FILE_DIR
echo Output Directory: $OUTPUT_DIR
echo Imputation Panel Reference Path: $REFERENCE_PATH
echo Imputation Panel Legend Files Reference Path: $LEGEND_REFERENCE_PATH
echo Chromosome Number: $chr
echo Interval Size Set to: $interval_size
echo Input Filename: $FILE_NAME
cd $LOCAL_DIR
export int_sze=$interval_size
export GENETIC_MAP_NAME="genetic_map_chr${chr}_combined_b37.txt"
#HAPLOTYPE_FILENAME="ALL_1000G_phase1integrated_v3_chr${chr}_impute.hap.gz"
export HAPLOTYPE_FILENAME="1000GP_Phase3_chr${chr}.hap.gz"
#LEGEND_FILENAME="ALL_1000G_phase1_chr${chr}_impute.legend"
export LEGEND_FILENAME="1000GP_Phase3_chr${chr}.legend.gz"
mkdir /dev/shm/workdir
mkdir -p $OUTPUT_DIR/chr${chr}
gnu-parallel to manage parallel job execution
across intervalsLOCAL_DIR, where
imputation-related files and directories are based/dev/shm/workdir) to speed up I/O operations during
imputation, and a chromosome-specific output directory in
OUTPUT_DIR.seq $start_interval $end_interval | parallel -j $R --joblog $LOG_DIR/impute2_chr${chr}.ljob '$IMP2/impute2 -use_prephased_g \
-known_haps_g $FILE_DIR/${FILE_NAME}_chr${chr}.phased.haps \
-m $REFERENCE_PATH/$GENETIC_MAP_NAME \
-h $REFERENCE_PATH/$HAPLOTYPE_FILENAME \
-l $LEGEND_REFERENCE_PATH/$LEGEND_FILENAME \
-int $(( int_sze*({}-1)+1 )) $(( int_sze*{} )) \
-Ne 20000 -buffer 250 \
-o /dev/shm/workdir/${FILE_NAME}_chr${chr}_{}_imputed \
-o_gz; \
mv /dev/shm/workdir/${FILE_NAME}_chr${chr}_{}_imputed* $OUTPUT_DIR/chr${chr} '
#sleep 5s
#mv /dev/shm/workdir/* $OUTPUT_DIR/
seq $start_interval $end_interval generates intervals
across which Impute2 will run-j $R to specify the maximum
concurrent jobs.Impute2
command with several arguments:
-use_prephased_g: Uses pre-phased genotype data-known_haps_g: Specifies the pre-phased haplotype file
for this chromosome-m, -h, -l: Specifies paths
to the genetic map, haplotype, and legend files, respectively-int: Defines the genomic interval for this run,
calculated based on int_sze and interval number
({}).-Ne: Sets the effective population size for the
model.-buffer: Adds a buffer region around each interval for
imputation accuracy-o: Specifies the output path for the imputed file
within the temporary directory /dev/shm/workdir./dev/shm/workdir to the chromosome-specific output
directory in OUTPUT_DIR./dev/shm/workdir to OUTPUT_DIR, for debugging
purposes.#!/bin/bash
module load gnu-parallel
SOURCE_DIR="/scratch/w/wxu/oespinga/Hamilton_imputation"
export SCRIPT_DIR="/home/w/wxu/oespinga/Hamilton_imputation/code"
export LOCAL_DIR=$SOURCE_DIR/"imputed_files_hap" #where the imputed files are located
export OUTPUT_DIR=$SOURCE_DIR/"imputed_plink"
export LOG_DIR=$SOURCE_DIR/"logs"
export SAMPLE_DIR=$SOURCE_DIR/"haplotypes"
#*** Submit this script as:
# ./4_imputation_to_plink.sh
export PL="/home/w/wxu/oespinga/software/plink/plink-1.07-x86_64"
export GTL="/home/w/wxu/kshestop/imputation_pipeline/Imputation_Programs/gtool_v0.7.5"
export HAP_FILE="Rob_Hamilton_final" #output name of the haplotypes
export G_FILE="Rob_Hamilton_imp_chr" #output name of the plink files
cd $LOCAL_DIR
mkdir -p $OUTPUT_DIR
module load gnu-parallel: Load the
gnu-parallel module, which allows parallel execution of
jobs for improved performanceSOURCE_DIR as the base directory for the coding
of the project.SCRIPT_DIR,
LOCAL_DIR, OUTPUT_DIR, LOG_DIR,
and SAMPLE_DIR)PLINK and
gtools toolsHAP_FILE (name for the output
haplotype files) and G_FILE (name prefix for the output
PLINK files)$LOCAL_DIR, where the
imputed files are storedmkdir -p to create the output directory
($OUTPUT_DIR) if it doesn’t exist yetfor CHR in `seq 1 8; seq 10 12`
do
export CHR
JOB_NAME=4_toplink_chr${CHR}
qsub -o $LOG_DIR/$JOB_NAME.out -e $LOG_DIR/$JOB_NAME.err -N $JOB_NAME \
-v LOCAL_DIR,OUTPUT_DIR,LOG_DIR,SAMPLE_DIR,CHR,PL,GTL,G_FILE,HAP_FILE \
-q sandy -l nodes=1:m64g:ppn=16,walltime=24:00:00 \
$SCRIPT_DIR/4a_back_to_plink_script.sh
#-q batch -l nodes=1:m32g:ppn=8,walltime=10:00:00
done
echo "jobs submitted..."
CHR) to make it
accessible in the job scriptJOB_NAME with the chromosome number for unique
job names (e.g., 4_toplink_chr1)qsub, specifying:
stdout
and stderr to files in $LOG_DIR (e.g.,
$JOB_NAME.out and $JOB_NAME.err).-N $JOB_NAME to set the
job name.LOCAL_DIR, OUTPUT_DIR,
LOG_DIR, SAMPLE_DIR, CHR,
PL, GTL, G_FILE,
HAP_FILE) using flag-v (List all
variables)4a_back_to_plink_script.sh script from
$SCRIPT_DIR#!/bin/bash
module load extras
cd $LOCAL_DIR/chr${CHR}
chr=`ls *chr${CHR}_*.gz | sort -V`
cat $chr | pigz -dc | cut -d" " -f2- | awk '{if(length($1)>100) $1=substr($1,1,100); print $1,$0}' > file${CHR}.gen
extras module for
this analysisCHR).gz files containing the chromosome data, sort
them, and concatenate using catpigz -dc, remove the first column
with cut -d" " -f2-, and truncate any value in the first
column exceeding 100 charactersfile${CHR}.gen#save to dosages (transposed, i.e. individuals in colums)
awk -v CHR=$CHR '{printf CHR" "$2" "$3" "$4" "$5; for(i=6; i<NF; i+=3){if ($(i+0)==0 && $(i+1)==0 && $(i+2)==0) printf " -9"; else printf " "$(i+0)*0+$(i+1)*1+$(i+2)*2}; printf "\n"}' file${CHR}.gen | pigz > dosages${CHR}.gz
#$GTL/gtool -G --g file${CHR}.gen --s <(awk '{print $1,$2" 0 "$4,$5}' $SAMPLE_DIR/${HAP_FILE}_chr${CHR}_chr${CHR}.phased.sample) --threshold 0.9 --phenotype plink_pheno --sex sex --chr ${CHR} --ped $OUTPUT_DIR/${G_FILE}${CHR}.ped --map $OUTPUT_DIR/${G_FILE}${CHR}.map
$GTL/gtool -G --g file${CHR}.gen --s $SAMPLE_DIR/${HAP_FILE}_chr${CHR}.phased.sample --threshold 0.9 --phenotype plink_pheno --sex sex --chr ${CHR} --ped $OUTPUT_DIR/${G_FILE}${CHR}.ped --map $OUTPUT_DIR/${G_FILE}${CHR}.map
file${CHR}.gen to dosages format,
where individuals are represented in columns-9 for missing datapigz and save it as
dosages${CHR}.gzgtool to convert the .gen file to
PLINK’s .ped and .map formats(--g): file${CHR}.gen(--s): the phased sample file for the
chromosome--threshold for quality, phenotype
and sex columns, chromosome ID (CHR), and output file paths
for .ped and .map$PL/plink --noweb --file $OUTPUT_DIR/$G_FILE${CHR} --make-bed --out $OUTPUT_DIR/$G_FILE${CHR}
rm -f $OUTPUT_DIR/${G_FILE}${CHR}.ped $OUTPUT_DIR/${G_FILE}${CHR}.map $OUTPUT_DIR/${G_FILE}${CHR}.nosex
awk '{print $2}' $OUTPUT_DIR/${G_FILE}${CHR}.bim | sort | uniq -d > $OUTPUT_DIR/${G_FILE}${CHR}_dup.snps
let DUP_SNPS=`wc -l $OUTPUT_DIR/${G_FILE}${CHR}_dup.snps | cut -d " " -f 1`
if [ $DUP_SNPS -gt 0 ]
then
$PL/plink --noweb --bfile $OUTPUT_DIR/$G_FILE${CHR} --exclude $OUTPUT_DIR/${G_FILE}${CHR}_dup.snps --make-bed --out $OUTPUT_DIR/${G_FILE}${CHR}_clean
else
cp $OUTPUT_DIR/${G_FILE}${CHR}.bed $OUTPUT_DIR/${G_FILE}${CHR}_clean.bed
cp $OUTPUT_DIR/${G_FILE}${CHR}.bim $OUTPUT_DIR/${G_FILE}${CHR}_clean.bim
cp $OUTPUT_DIR/${G_FILE}${CHR}.fam $OUTPUT_DIR/${G_FILE}${CHR}_clean.fam
fi
.ped and .map files into PLINK’s binary
format (.bed,.bim,.fam).ped,.map, and .nosex files to
clean up after conversion to binary format.bim file by extracting the second column, sorting, and
saving duplicates t a file named
${G_FILE}${CHR}_dup.snpsDUP_SNPSDUP_SNPS > 0), rerun PLINK
to exclude them and generate a “clean” binary datset.bed,
.bim, and .fam files directly as the “clean”
filesrm -f file${CHR}.gen $OUTPUT_DIR/${G_FILE}${CHR}.bed $OUTPUT_DIR/${G_FILE}${CHR}.bim $OUTPUT_DIR/${G_FILE}${CHR}.fam $OUTPUT_DIR/${G_FILE}${CHR}_dup.snps
.gen file, the
original PLINK binary files, and the duplicate SNP file. This ensures
only the cleaned, final output files are kept.#!/bin/bash
#PBS -l nodes=1:ppn=8
#PBS -l walltime=2:00:00
module load gnu-parallel
export IMP_DIR= path/to/Imputation
export SCRIPT_DIR= path/to/code
export OUTPUT_DIR=$IMP_DIR/"imputed_plink"
export LOG_DIR=$IMP_DIR/"logs"
export HAPS_DIR=$IMP_DIR/"imputed_files_hap" #where the imputed files are located
#*** Submit this script as:
# qsub -o $LOG_DIR/5_perform_qc.out -e $LOG_DIR/5_qc.err -q batch -N 5_qc_imp $SCRIPT_DIR/5_perform_qc.sh
export PL= path/to/PLINK
export G_FILE="Rob_Hamilton_imp_chr" #output name of the plink files
export FILES_PREFIX="Hamilton"
#create file with info and imputation info
module load gnu-parallel: Load the
gnu-parallel module, which allows parallel execution of
jobs for improved performance
PBS directives is what?IMP_DIR is the main directory where all
imputation-related data is storedOUTPUT_DIR and LOG_DIR are sub-directories
for storing outputs and logsPL specifies the path to PLINK toolG_FILE and FILES_PREFIX set the naming
format for output files to see the resultsqsub to submit the jobs
-o and -e options define where to save the
standard output and error logs-N assigns a name to the job for tracking in the queue
system5_qc_imp#create header for file
head_file=`ls -lh $HAPS_DIR/chr1/*_info | head -1 | awk '{print $9}'`
echo `awk 'NR==1{print "chr",$0}' $head_file | cut -d" " -f1,3-` | gzip > $IMP_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz
_info file
in the directory, adds a chr (chromosome) column, and saves
it as the header of the summary file
ls -lh $HAPS_DIR/chr1/*_info | head -1 | awk '{print $9}'
gets the name of the first _info file for chromosome 1awk 'NR==1{print "chr",$0}' adds “chr” as a new first
column to the headercut -d" " -f1,3- removes any unnecessary columns,
retaining only essential datagzip > $IMP_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz
saves the modified header as a compressed file.#loop over all _info files generated by impute2
for i in `seq 1 22`
do
for INT_FILE in `ls $HAPS_DIR/chr${i}/*chr${i}_*_info | sort -V`
do
awk -v chr=$i 'NR>1{print chr,$0}' $INT_FILE | cut -d" " -f1,3- | gzip >> $IMP_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz
done
done
_info file
_imputation_loci_info.txt.gz), excluding the header row
from each _info filefor i in '\seq 1 22' iterates through chromosome
numbers 1 to 22.ls $HAPS_DIR/chr${i}/*chr${i}_*_info | sort -V lists
and sorts all _info files for each chromosomeawk -v chr=$i 'NR>1{print chr,$0}' skips the first
row (header) of each _info file and appends chromosome
number as the first columngzip >> $IMP_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz
appends this processed data to the summary file in compressed
formatcd $OUTPUT_DIR
seq 1 22 | parallel -j 4 --joblog $LOG_DIR/5_qc.ljob $SCRIPT_DIR/5a_perform_qc_script.sh {}
#::: 7 8 16
echo "Finished..."
gnu_parallel to run a QC script
(5a_perform_qc_script.sh) on each chromosome file in
parallel, processing up to 4 chromosomes at once
cd $OUTPUT_DIR changes the directory to where QC
outputs will be savedseq 1 22 generates a sequence of numbers from 1 to 22,
representing chromosomesparallel -j 4 --joblog $LOG_DIR/5_qc.ljob executes
5a_perform_qc_script.sh with up to 4 jobs in parallel and
logs job activity$SCRIPT_DIR/5a_perform_qc_script.sh {} runs the QC
script for each chromosome, with {} replaced by chromosome
numbersecho "Finished..."print the finish message and confirm
all the tasks been executed#!/bin/bash
let CHR=$1
cd $OUTPUT_DIR
let CHR=$1 assigns the first argument passed to the
script to the variable CHR, representing the chromosome
numbercd $OUTPUT_DIR changes the working directory to where
the QC files will be savedecho "$CHR Initial_no_duplicates" `wc -l $G_FILE${CHR}_clean.bim` > Counts_${CHR}.txt
echo "$CHR SNPs_with_dot" `awk '$2=="."{print $2}' ${G_FILE}${CHR}_clean.bim | wc -l` >> Counts_${CHR}.txt
wc -l $G_FILE${CHR}_clean.bim counts the number of SNPs
in the cleaned BIM file for the chromosome, and the result is labeled
and saved to Counts_${CHR}.txt"."), which may indicate missing or problematic SNP
identifiers, appending the count to Counts_${CHR}.txtjoin <(gunzip -c ../${FILES_PREFIX}_imputation_loci_info.txt.gz | awk -v chr=${CHR} '$5>.9 && $6>.9 && $1==chr {print $2}' | sort) <(awk '$2!="."{print $2}' ${G_FILE}${CHR}_clean.bim | sort) > snp_list_${CHR}.txt
echo "$CHR After_info_and_certainty" `wc -l snp_list_${CHR}.txt` >> Counts_${CHR}.txt
gunzip -c ../${FILES_PREFIX}_imputation_loci_info.txt.gz | awk -v chr=${CHR} '$5>.9 && $6>.9 && $1==chr {print $2}' | sort
filters SNPs from the imputation info file based on info score
($5) and certainty ($6) values above 0.9, for
the specified chromosomejoin matches these SNPs with those in the BIM file
(that do not have . as their ID), creating a list of SNPs
that meet both criteria in snp_list_${CHR}.txtwc -l snp_list_${CHR}.txt counts these SNPs and logs
the result in Counts_${CHR}.txt$PL/plink --noweb --bfile $G_FILE${CHR}_clean --extract snp_list_${CHR}.txt --geno 0.05 --make-bed --out $G_FILE${CHR}_clean_qc
echo "$CHR After_missing_rate" `wc -l $G_FILE${CHR}_clean_qc.bim` >> Counts_${CHR}.txt
$PL/plink --bfile $G_FILE${CHR}_clean --extract snp_list_${CHR}.txt --geno 0.05 --make-bed --out $G_FILE${CHR}_clean_qc
uses PLINK to filter SNPs based on a missing rate of 5%
(--geno 0.05) and generates a new binary file format
(--make-bed) with this updated SNP listwc -l $G_FILE${CHR}_clean_qc.bim counts the remaining
SNPs after applying the missing rate filter and logs it in
Counts_${CHR}.txt$PL/plink --noweb --bfile $G_FILE${CHR}_clean_qc --maf 0.01 --make-bed --out $G_FILE${CHR}_clean_qc
echo "$CHR After_maf_filter" `wc -l $G_FILE${CHR}_clean_qc.bim` >> Counts_${CHR}.txt
$PL/plink --bfile $G_FILE${CHR}_clean_qc --maf 0.01 --make-bed --out $G_FILE${CHR}_clean_qc
applies a minor allele frequency (MAF) filter of 0.01, removing rare
SNPs from the dataset.wc -l $G_FILE${CHR}_clean_qc.bim counts SNPs after
applying the MAF filter and appends this count to
Counts_${CHR}.txtrm snp_list_${CHR}.txt
rm snp_list_${CHR}.txt deletes the temporary file
snp_list_${CHR}.txt, which is no longer needed after QC
processing#!/bin/bash
#PBS -l nodes=1:ppn=8
module load extras
##submit as qsub -q batch -l walltime=2:00:00 ./6_counts_imputation_info_files.sh
OUTPUT_DIR= /path/to/output
LOCAL_DIR=$OUTPUT_DIR/path/to/haplo_file #where the imputed files are located
LOCAL1_DIR=$OUTPUT_DIR/path/to/Imputed/file
FILES_PREFIX="Hamilton"
OUTFILE=${FILES_PREFIX}"_summary.txt"
cd $OUTPUT_DIR
#type=0 imputed loci
#type=2 hard genotype (from panel 2) but with imputed info from type 0
#type=3 hard genotype (from panel 2) but no imputation done from reference panel
OUTPUT_DIR sets the main output directoryLOCAL_DIR and LOCAL1_DIR define paths for
imputed haplotype files and PLINK-processed files, respectivelyFILES_PREFIX sets the prefix for the output
filenamesOUTFILE specifies the output file for QC summary
counts#raw counts of file (i.e. all attempted imputed loci)
echo -n "Total number of markers imputed: " > $OUTFILE
pigz -dc $OUTPUT_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz | wc -l >> $OUTFILE
echo >> $OUTFILE
Navigates to the output directory
Counts all markers in the imputation into file and writes to
OUTFILE
cd $OUTPUT_DIR: Changes the current directory to
OUTPUT_DIR
echo -n "Total number of markers imputed: " > $OUTFILE:
Writes the specified text to OUTFILE without a newline
(-n flag suppresses newline).
pigz -dc $OUTPUT_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz | wc -l:
pigz -dc: Decompresses a gzipped file
(.txt.gz) without fully extracting it to disk${FILES_PREFIX}_imputation_loci_info.txt.gz: Refers to
the compressed file containing imputation locus information.| wc -l: Counts lines, representing the number of
markers.>> $OUTFILE: Appends the marker count to
OUTFILE.
#high info and certainty loci (based on info and certainty fields in imputation -cols 5 and 6-)
echo -n "Number of markers high info and certainty: " >> $OUTFILE
pigz -dc $OUTPUT_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz | awk '$5>.9 && $6>.9 && NR>1{print $0}' | wc -l >> $OUTFILE
echo >> $OUTFILE
Filters markers with high information and certainty scores (both
>0.9) and appends the count to OUTFILE
echo -n "Number of markers high info and certainty: " >> $OUTFILE:
Appends a description line to OUTFILE.
pigz -dc $OUTPUT_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz | awk '$5>.9 && $6>.9 && NR>1{print $0}' | wc -l:
awk '$5>.9 && $6>.9 && NR>1{print $0}':
Filters lines where columns 5 (info score) and 6 (certainty score) are
both greater than 0.9. NR>1 skips the header row| wc -l: Counts the filtered lines (markers meeting
high-quality thresholds) >> $OUTFILE: Appends the
count to OUTFILE####Perform counts from imputed, imputed after removing duplicates and qc'd loci
echo "Counts from imputation (Impute2 output): " >> $OUTFILE
count_tot=0
for i in `seq 1 22`
do
count=`pigz -dc $LOCAL_DIR/chr${i}/*.gz | wc -l`
echo $count chr${i} >> $OUTFILE
count_tot=$((count_tot + count))
done
echo $count_tot" total" >> $OUTFILE
echo >> $OUTFILE
Iterates through chromosomes (1-22), counts markers in each chromosome’s imputed files, and calculates a total count
echo "Counts from imputation (impute2 output): " >> $OUTFILE:
Adds a heading to OUTFILE for the imputed marker counts by
chromosome
`for i in 22``: Loops through chromosome numbers 1-22.
pigz -dc $LOCAL_DIR/chr${i}/*.gz | wc -l:
pigz -dc $LOCAL_DIR/chr${i}/*.gz: Decompresses all
imputed files for chromosome i| wc -l: Counts the total markersecho $count chr${i} >> $OUTFILE: Writes the
count for each chromosome to OUTFILE
count_tot=$((count_tot + count)): Adds each
chromosome’s count to count_tot for a grand total
echo $count_tot" total" >> $OUTFILE: Writes
the total count to OUTFILE
echo "Counts from called genotypes with gtool .9 threshold after removing duplicates if any (plink conversion): " >> $OUTFILE
a=($(wc -l $LOCAL1_DIR/*_chr*_clean.bim))
#length of the array ${#a[@]}
for k in `seq 0 $((${#a[@]}-1))`
do
if [ $(($k % 2)) -eq 0 ]; then
echo -n "${a[$k]} " >> $OUTFILE
else
echo ${a[$k]} >> $OUTFILE
fi
done
echo >> $OUTFILE
Counts loci in cleaned .bim files for each
chromosome, with duplicates removed, then appends counts to
OUTFILE
echo "Counts from called genotypes ...": Adds a
heading for genotype counts post-duplicate removal to OUTFILE`
a=($(wc -l $LOCAL1_DIR/*_chr*_clean.bim)): Stores
line counts of each .bim file into array
a
for k in \seq 0 $((${#a[@]}-1)): Loops through
indices of array a
if [ $(($k % 2)) -eq 0 ]: Checks if the index is
even (i.e., file path and count are grouped)
echo -n "${a[$k]} " >> $OUTFILE: Writes the count
for each chromosome on the same lineelse echo ${a[$k]} >> $OUTFILE: Prints each
filename to a new lineecho >> $OUTFILE: Adds a blank lineecho "Counts from imputation, after QC (Info & certainty >.9, MAF>1% and genotype missing rates<5%): " >> $OUTFILE
#of loci after qc (info & certainty, maf, geno rates) and cleaning
b=($(wc -l $LOCAL1_DIR/*_chr*_clean_qc.bim))
#length of the array ${#a[@]}
for k in `seq 0 $((${#a[@]}-1))`
do
if [ $(($k % 2)) -eq 0 ]; then
echo -n "${b[$k]} " >> $OUTFILE
else
echo ${b[$k]} >> $OUTFILE
fi
done
echo >> $OUTFILE
#tar -zcvf $LOCAL1_DIR/CO20_imputed_clean_qc.tar.gz $LOCAL1_DIR/*_clean_qc.*
After QC filtering for minor allele frequency, missing rate, and high info and certainty, counts markers for each chromosome
Count lines in each QC-passed .bim file
wc -l $LOCAL1_DIR/*_chr*_clean_qc.bim: Runs
wc -l on each .bim file that passed QC in the
directory $LOCAL1_DIR (expected filenames include
_chr*_clean_qc.bim)b=($(...)): Stores the output of wc -l in
an array, b, where each element in the array contains a
line count for a corresponding .bim file.Loop through the Array to Print Chromosome Counts:
for k in \seq 0 $((${#b[@]}-1)): Iterates over the
indices of the array b, starting from 0 up to
the last index (${#b[@]}-1')if [ $(($k % 2)) -eq 0 ]: Checks if the index
k is even
echo -n "${b[$k]} " >> $OUTFILE: For even
indices, writes the line count to OUTFILE on the same line,
followed by a space. This count corresponds to the number of markers in
each chromosomeelse echo ${b[$k]} >> $OUTFILE: For odd indices,
appends the chromosome identifier (e.g., chr1,
chr2) to OUTFILE, followed by a newline#bring concordance info from files
grep "0.0-0.1" $LOCAL_DIR/chr*/*_summary | awk '{print $1,$9}' | pigz > $OUTPUT_DIR/${FILES_PREFIX}_imputation_concordance.txt.gz
#Summaries of concordance, info and certainty
pigz -dc $OUTPUT_DIR/${FILES_PREFIX}_imputation_concordance.txt.gz | awk '{print $2}' | awk '{for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}} END {for (i=1;i<=NF;i++) {print "Concordance stats: ", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/(NR-1))}}' >> $OUTFILE
pigz -dc $OUTPUT_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz | awk 'NR>1 {print $5}' | awk '{for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}} END {for (i=1;i<=NF;i++) {print "Info stats: ", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/(NR-1))}}' >> $OUTFILE
pigz -dc $OUTPUT_DIR/${FILES_PREFIX}_imputation_loci_info.txt.gz | awk 'NR>1 {print $6}' | awk '{for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}} END {for (i=1;i<=NF;i++) {print "Certainty stats: ", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/(NR-1))}}' >> $OUTFILE
Searches for specific concordance info from imputation summaries and compresses the output for later statistical analysis
Search for the specific concordance info from imputation
grep "0.0-0.1" $LOCAL_DIR/chr*/*_summary: Searches for
lines in _summary files that mention a concordance range
0.0-0.1awk '{print $1,$9}': Extracts columns 1 and 9pigz > $OUTPUT_DIR/${FILES_PREFIX}_imputation_concordance.txt.gz:
Compresses the extracted information into
imputation_concordance.txt.gzCalculate the Concordance (Column 2)
pigz -dc $OUTPUT_DIR/${FILES_PREFIX}_imputation_concordance.txt.gz:
Decompresses the concordance fileawk '{print $2}': Extracts the second columnawk statement calculates the mean and
standard deviation:
for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}:
Sums values and squares for standard deviationprint "Concordance stats: ", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/(NR-1)):
Prints mean and standard deviation for each columnSimilar for column 5 (info score) and column 6 (certainty)