CoMet CCC (Combinatorial Metrics)

# scripts: /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/TEMPLATES/ccc
# test data: /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/TEMPLATES/example_data/ccc_data.tped
# CoMet build: /ccs/proj/syb105/sw/summit/comet/src/build_single_release_summit/genomics_metric

# running CoMet: https://docs.google.com/document/d/1Dz4bDciSW5ZSMIkHq3LQ6uBxGzf7CR3XVz3SFRuyDHY/edit#heading=h.on7f9i5ekak9


# test data looks like this:
tig00000000 tig00000000:157495  0   157495  G   G   G   G   G   G   G   G   G   G   A   A   G   G   G   G   G   G   A
tig00000000 tig00000000:162166  0   162166  C   C   C   C   T   C   C   C   C   C   C   C   C   C   C   C   C   C   T
tig00000000 tig00000000:162175  0   162175  T   T   T   T   C   T   C   T   T   T   T   T   T   T   T   T   T   T   C
tig00000000 tig00000000:162427  0   162427  A   A   A   A   A   A   A   A   A   A   A   A   A   A   A   A   A   A   A
tig00000000 tig00000000:162430  0   162430  A   A   C   A   A   A   A   A   A   A   A   A   A   A   A   A   A   A   A
tig00000000 tig00000000:162439  0   162439  G   G   A   G   A   A   G   G   G   G   G   G   G   G   G   G   A   G   G
tig00000000 tig00000000:249674  0   249674  T   T   T   T   T   T   T   T   C   C   C   T   T   T   T   T   C   T   T
tig00000000 tig00000000:405503  0   405503  G   G   G   G   G   G   G   G   G   G   G   G   G   G   T   G   G   G   G
tig00000000 tig00000000:645635  0   645635  T   T   T   T   T   T   T   T   T   T   T   T   T   T   T   T   T   T   T
tig00000000 tig00000000:697464  0   697464  C   C   C   C   C   C   C   C   C   C   C   C   C   C   C   C   C   C   A

Pre-Processing

#!/bin/bash
#BSUB -P SYB105
#BSUB -W 00:30
#BSUB -nnodes 1
#BSUB -J duo_pre
#BSUB -o duo_pre.%J.out

module load gcc/5.4.0
#Script and Data path

spath=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/cometTools/ccc/preProcess/Scripts/preProcess/preProcess.sh
dpath=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMeT/example_data

#Run command
jsrun --nrs 1 --tasks_per_rs 1 --cpu_per_rs 42 --gpu_per_rs 6 --rs_per_host 1 $spath $dpath/ccc_data.tped
echo "Done!"

Running CoMet

# CCC vector length = 1/2 the number of non-whitespace characters
#([ head -n1 [FILE] | wc -w ] - 4) / 2
head -n1 ccc_data.tped | wc -w
### (648 - 4) / 2 = 322

# Number of Vectors: The number of vectors is equal to the number of rows.
wc -l ccc_data.tped
# 16339 

Use the spreadsheet [https://docs.google.com/spreadsheets/d/1BD2MIVE2ajlEde4UfVzO4Hx61VkxtKn-/edit#gid=965603301] to determine proper commands: These are the primary cells you should change in both cases: Cell C2 should be set to your project (e.g. syb105) Cell C10 (num_way) should be set to 2 for 2-way or 3 for 3-way Cell C11 (metric) should be set to ccc/duo/czek Cell C13 (num_vector) should be set to the number of vectors Cell C14 (num_field) should be set to the vector length Cell C16 (system) should be set to the supercomputer (summit/perlmutter/titan) Cell C19 (num_proc_vector) should be set to how many output files you want

*You need to have enough files that they are small enough that post-processing can complete in a reasonable time ** Once you set those cells, you can scroll down to the auto-computed metrics, and make sure they are in the green/yellow. Red in C47 isn’t a big deal, means the problem is small and thus not optimizing resources.

#interactive command
bsub -P syb105 -nnodes 4500 -W 10 -Is $SHELL

#submission command
### Be sure to change the path to the executable to where you installed it.  You will also need to change the --input_file and --output_file_stub in the last line.
executable=install_single_release_summit/bin/genomics_metric ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'
launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 27000 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"
$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 16339 --num_field 322 --all2all yes --compute_method GPU --num_proc_vector 27000 --num_proc_field 1 --num_proc_repl 1 --tc 2 --num_tc_steps 4 --num_phase 1 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 0 --stage_min 0 --stage_max 0
#BSUB -P SYB105
#BSUB -W 00:15
#BSUB -nnodes 17
#BSUB -J ccc-test
#BSUB -o ccc-test-100files.%J.out

#bsub -P syb106 -nnodes 80 -W 15 -Is $SHELL

executable=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/CoMeT/build_release_summit/genomics_metric ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'

launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 102 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"

$launch_command $executable --metric_type ccc --sparse yes --num_vector 16339 --num_field 322 --threshold 0 --all2all yes --compute_method GPU --num_proc_vector 100 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 1 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 0 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMeT/culex/preProcess/data/ccc_data.bin --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMeT/culex/output_ccc_100files/result

Post-Processing

#!/bin/bash
#SBATCH -A syb105
#SBATCH -N 1
#SBATCH -t 10
#SBATCH -J post
#SBATCH -o output-postprocess-'%A'.txt

# do this somewhere before here:
#pushd $TOOLS_DIR
#./make.sh
#popd

# Location of CoMet postprocessing tools.
TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools
export PATH="${PATH}:$TOOLS_DIR"

NUM_WAY=2

# Extract the file to be processed by this node from the file list.
BASE_PATH_PRE=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/new_examples/ccc/preprocess
FILE_IN=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/new_examples/ccc/result_0.bin
STUB=ccc_data
mkdir -p out_slurm_$STUB

echo "Postprocessing $FILE_IN"

(
  time postprocess $NUM_WAY \
    $BASE_PATH_PRE/${STUB}_allele_labels.txt \
    $BASE_PATH_PRE/${STUB}_line_labels.txt \
    $FILE_IN \
    $STUB.txt
) 2>&1 | tee out_slurm_$STUB/output-postprocess-$STUB.txt

OUD Project

4 April 2022: Discussion

  • Opioid use disorder

  • Chris Rench (London School of Hygiene and Tropical Medicine)

  • Citadel does exist (allows use of CoMet) BUT we are waiting for approval

    • proceed with Blocbuster pipeline for now…
    • another push to get CoMet on Murphy? but do we need to test blocbuster on Murphy first?
      • blocbuster is only cpu, CoMet is gpu
  • Subset cases and controls by ancestry

    • use un-imputed data (limit LD blocks)
      • look at LD scores for specific examples of SNPs with high similarity scores that are in close proximity in the genome
    • how much does depth of samples influence output?
    • what do we want the final output to be? SNP to SNP, SNP to gene, etc.
    • look at case v control - phenotype
    • HMAGMA
  • access blocbuster from the virtual kdi machine: https://kdinxmgt01.ornlkdi.org/all/misc_packages/BlocBuster_v.1.1.tar.gz

  • step 1: clarification on cases / controls from Chris (Mirko)

  • step 2: separate data by ancestry

  • step 3: run single ancestry on blocbuster on Murphy (for cases and controls separately) [african, hispanic, european]

  • step 4: determine computational power needed… can it be completed on Murphy?

  • step 5: if yes to above, continue with additional ancestries and cross-ancestry… if no, subset to chromosome by chromosome CCC runs

imputed data in PLINK format un-imputed data in ???

–> data wrangle by ancestry and case/control –> per individual… non-imputed data to allele format

poplar data

  • raw SNP dataset ~30 million SNPs —> trans 90 or 99?

    • reduce noise
    • GATK pipeline
      • trans90 (small, clean), trans99, trans99.9, trans100??
  • version 3 LD removed —> 769,000 SNPs

  • version 4… trans99 (some noise but not missing essential SNPs) /gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4

  • filters to get 7.7million SNPs in version 3 and 4.8million SNPs in version 4

    • minor allele frequency 0.01 or 0.05 is the biggest filter
    • remove SNP positions that have >15% missing data
    • remove genotypes with >15% missing data
    • hardy weinberg filter (pval < 1e-50) —> remove SNPs with sequencing errors
  • LD > 0.7 removed —> ~800,000 SNPs used for GWAS

    • plink2 command
    • default 1kb??

####v3 poplar data /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/cccLOCO/01/data/originalTPED-7.7M /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/comet/summit/ccc/poplar

--> /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Shared/CCC/1-Poplar7.7M
Hello Team,
  
I ran CoMet 2-Way CCC on file /lustre/or-hydra/cades-bsd/dk0/VCF3/gatk_882_PASS.maf01.CR0.9.annotgeneid.bescID.tped provided by David with threshold 0.7.

The provided TPED had 7 710 681 SNPs

~ Number of Possible Pairs = 7710681 * 7710681 / 2 = 29727300741880

~ Number of Lines CoMet saved with threshold > 0.7 = 294095095

Conversion Rate = 29727300741880 / 294095095 = 101080

With Threshold 0.7 we saved 1 for every 101K lines
(we were thinking 1 to 3M)

The results are available here: /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Shared/CCC/1-Poplar7.7M/7.7MPoplar-CCC-Threshold0.7.txt

Best,

Joao Gabriel Gazolla

/gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Projects/archives/cccLOCO/integrator/data/7.7MPoplar-CCC-Threshold0.7.txt

  • 7.7 million SNPs
  • Threshold 0.7
  • version 3
  • gene-to-gene (SNPs in genes only)
    • Poplar CCC networks: /gpfs/alpine/syb105/proj-shared/Data/P_tri/NetworkLayers/CCC

863 genotypes

library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
gid <- read.delim("Ptrichocarpa_210_v3.0.geneOnly.sorted.gff3", header=F, sep="\t")
#ccc <- read.delim("7.7MPoplar-CCC-Threshold0.7-100.txt", header=F, sep=" ")
ccc <- read.delim("7.7MPoplar-CCC-Threshold0.7.txt", header=F, sep=" ")

colnames(gid) <- c("chromosome", "source", "feature", "start", "end", "dot", "strand", "dot2", "ID")
gid$chr <- as.numeric(substr(gid$chromosome, 4, 5))
gid.df <- gid %>% separate(ID, c("gid", "name"), sep=";")

ccc.snp1 <- ccc %>% separate(V5, c("snp1.chr", "snp1.pos", "snp1.allele"), sep="_")
ccc.coord <- ccc.snp1 %>% separate(V6, c("snp2.chr", "snp2.pos", "snp2.allele"), sep="_")

library(tidygenomics)
ccc.snp1.coord <- unique(ccc.coord[,c(5:6)])
ccc.snp1.coord$chr <- as.numeric(ccc.snp1.coord$snp1.chr)
ccc.snp1.coord$start <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp1.coord$end <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp2.coord <- unique(ccc.coord[,c(8:9)])
ccc.snp2.coord$chr <- as.numeric(ccc.snp2.coord$snp2.chr)
ccc.snp2.coord$start <- as.numeric(ccc.snp2.coord$snp2.pos)
ccc.snp2.coord$end <- as.numeric(ccc.snp2.coord$snp2.pos)

ccc.snp1.gid <- genome_join_closest(ccc.snp1.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp1.distance", mode="full")
ccc.snp2.gid <- genome_join_closest(ccc.snp2.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp2.distance", mode="full")

ccc.coord.snp1.gid <- left_join(ccc.coord, ccc.snp1.gid[,c(1,2,14,17)], by=c("snp1.chr", "snp1.pos"))
ccc.coord.gid <- left_join(ccc.coord.snp1.gid, ccc.snp2.gid[,c(1,2,14,17)], by=c("snp2.chr", "snp2.pos"))
ccc.coord.gid.df <- ccc.coord.gid[,5:15]
colnames(ccc.coord.gid.df) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "snp1.gene", "snp1.gene.dist", "snp2.gene", "snp2.gene.dist")
#write.table(ccc.coord.gid.df, "7.7MPoplar-CCC-Threshold0.7-100.Ptrichocarpa_210_v3.0.txt", quote=F, row.names=F, sep="\t")
# 1481
write.table(ccc.coord.gid.df, "7.7MPoplar-CCC-Threshold0.7.Ptrichocarpa_210_v3.0.txt", quote=F, row.names=F, sep="\t")

ccc.coord.gid.df.distinct <- ccc.coord.gid.df %>% distinct(snp1.gene, snp2.gene, .keep_all = T)
# 295
ccc.coord.gid.df.distinct.pair <- subset(ccc.coord.gid.df.distinct, ccc.coord.gid.df.distinct$snp1.gene != ccc.coord.gid.df.distinct$snp2.gene)
# 292
#write.table(ccc.coord.gid.df.distinct.pair, "7.7MPoplar-CCC-Threshold0.7-100.Ptrichocarpa_210_v3.0.distinctgenepair.txt", quote=F, row.names=F, sep="\t")
write.table(ccc.coord.gid.df.distinct.pair, "7.7MPoplar-CCC-Threshold0.7.Ptrichocarpa_210_v3.0.distinctgenepair.txt", quote=F, row.names=F, sep="\t")

### filter to genes within 1kb of SNP
ccc.coord.gid.df.distinct.pair.1kb <- subset(ccc.coord.gid.df.distinct.pair, abs(ccc.coord.gid.df.distinct.pair$snp1.gene.dist) < 1000 & abs(ccc.coord.gid.df.distinct.pair$snp2.gene.dist) < 1000)
# 70
#write.table(ccc.coord.gid.df.distinct.pair.1kb, "7.7MPoplar-CCC-Threshold0.7-100.Ptrichocarpa_210_v3.0.distinctgenepair.1kb.txt", quote=F, row.names=F, sep="\t")
write.table(ccc.coord.gid.df.distinct.pair.1kb, "7.7MPoplar-CCC-Threshold0.7.Ptrichocarpa_210_v3.0.distinctgenepair.1kb.txt", quote=F, row.names=F, sep="\t")
Gabriel’s CCC run script

/gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/comet/summit/ccc/poplar/01-Poplar7M/run bsub -P syb105 -nnodes 100 -W 15 -Is $SHELL

executable=/gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Apps/comet/build_release_summit/genomics_metric ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'

launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 600 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"

$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 7710681 --num_field 882 --all2all yes --compute_method GPU --num_proc_vector 600 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 50 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 49 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/CCC/01-Poplar7M/preprocess/7M.bin --threshold 0.7 --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/CCC/01-Poplar7M/run/output/7M_

ctime 227.696209 ops 2.134692e+17 ops_rate 9.375174e+14 ops_rate/proc 1.562529e+12 vcmp 2.973130e+13 vcmpout 2.940951e+08 cmp 1.048920e+17 ecmp 2.622301e+16 ecmp_rate 1.151666e+14 ecmp_rate/proc 1.919444e+11 vctime 0.068704 mctime 16.262581 intime 0.738858 outtime 213.069819 cpumem 5.154618e+10 gpumem 5.386325e+09 tottime 457.86526
filter out LD
  • any pairs within Xkb?
cd /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3
sed 's/_/ /g' 7.7MPoplar-CCC-Threshold0.7.txt | awk '{print ($5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$8-$5"\t"$9-$6)}' > 7.7MPoplar-CCC-Threshold0.7.distance.txt

awk '{if ($8 != 0 || $9 < -5000 || $9 > 5000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.5kb.txt
awk '{if ($8 != 0 || $9 < -50000 || $9 > 50000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt
awk '{if ($8 != 0 || $9 < -500000 || $9 > 500000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.500kb.txt
awk '{if ($8 != 0) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.chr.txt

#  294095095 7.7MPoplar-CCC-Threshold0.7.distance.txt
#  141975233 7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt
#  216770824 7.7MPoplar-CCC-Threshold0.7.distance.5kb.txt
#  1961386 7.7MPoplar-CCC-Threshold0.7.distance.chr.txt


library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
gid <- read.delim("Ptrichocarpa_210_v3.0.geneOnly.sorted.gff3", header=F, sep="\t")
ccc.coord <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.chr.txt", header=F, sep="\t")
colnames(ccc.coord) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "chr.diff", "pos.diff")

colnames(gid) <- c("chromosome", "source", "feature", "start", "end", "dot", "strand", "dot2", "ID")
gid$chr <- as.numeric(substr(gid$chromosome, 4, 5))
gid.df <- gid %>% separate(ID, c("gid", "name"), sep=";")

library(tidygenomics)
ccc.snp1.coord <- unique(ccc.coord[,c(1:3)])
ccc.snp1.coord$chr <- as.numeric(ccc.snp1.coord$snp1.chr)
ccc.snp1.coord$start <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp1.coord$end <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp2.coord <- unique(ccc.coord[,c(4:6)])
ccc.snp2.coord$chr <- as.numeric(ccc.snp2.coord$snp2.chr)
ccc.snp2.coord$start <- as.numeric(ccc.snp2.coord$snp2.pos)
ccc.snp2.coord$end <- as.numeric(ccc.snp2.coord$snp2.pos)

ccc.snp1.gid <- genome_join_closest(ccc.snp1.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp1.distance", mode="full")
ccc.snp2.gid <- genome_join_closest(ccc.snp2.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp2.distance", mode="full")

ccc.coord.snp1.gid <- left_join(ccc.coord, ccc.snp1.gid, by=c("snp1.chr", "snp1.pos", "snp1.allele"))
ccc.coord.gid <- left_join(ccc.coord.snp1.gid, ccc.snp2.gid, by=c("snp2.chr", "snp2.pos", "snp2.allele"))
ccc.coord.gid.df <- ccc.coord.gid[,c(1:7,22,24,37,39)]
colnames(ccc.coord.gid.df) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "snp1.gene", "snp1.gene.dist", "snp2.gene", "snp2.gene.dist")
write.table(ccc.coord.gid.df, "7.7MPoplar-CCC-Threshold0.7.distance.chr.gene.txt", quote=F, row.names=F, sep="\t")

ccc.coord.gid.df.distinct <- ccc.coord.gid.df %>% distinct(snp1.gene, snp2.gene, .keep_all = T)
ccc.coord.gid.df.distinct.pair <- subset(ccc.coord.gid.df.distinct, ccc.coord.gid.df.distinct$snp1.gene != ccc.coord.gid.df.distinct$snp2.gene)
# 5557
write.table(ccc.coord.gid.df.distinct.pair, "7.7MPoplar-CCC-Threshold0.7.distance.chr.gene.distinct.txt", quote=F, row.names=F, sep="\t")

ccc.coord.gid.df.INgene <- subset(ccc.coord.gid.df.distinct.pair, ccc.coord.gid.df.distinct.pair$snp1.gene.dist == 0 & ccc.coord.gid.df.distinct.pair$snp2.gene.dist == 0)
# 129
## I have 129 gene pairs for potential epistatic interaction based on SNPs on different chromosomes with a correlation >0.7 and annotated within a known gene (in v3 poplar data) <-- this does not include any clustering

ccc.coord.gid.df.NEARgene <- subset(ccc.coord.gid.df.distinct.pair, ccc.coord.gid.df.distinct.pair$snp1.gene.dist < 1000 & ccc.coord.gid.df.distinct.pair$snp2.gene.dist < 1000)
# 493
ccc.coord.gid.df.NEARgene <- subset(ccc.coord.gid.df.distinct.pair, ccc.coord.gid.df.distinct.pair$snp1.gene.dist < 5000 & ccc.coord.gid.df.distinct.pair$snp2.gene.dist < 5000)
# 1992
breadth-first-search (BFS)

https://favtutor.com/blogs/breadth-first-search-python

  • The time complexity of the Breadth first Search algorithm is in the form of O(V+E), where V is the representation of the number of nodes and E is the number of edges. Also, the space complexity of the BFS algorithm is O(V).
## first need to make a graph from the ABC-format file

install.packages("igraph")
install.packages("network")
install.packages("sna")
install.packages("ndtv")

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v3")
abc <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.50kb.abc.txt", header=F, sep="\t", stringsAsFactors = F)

### create a node graph file



### visualizing the network
library(igraph)
net <- graph.data.frame(links, nodes, directed=T)
net <- simplify(net, remove.multiple = F, remove.loops = T)
plot(net, edge.arrow.size=.4,vertex.label=NA)

# Generate colors base on media type:
colrs <- c("gray50", "tomato", "gold")
V(net)$color <- colrs[V(net)$media.type]
# Compute node degrees (#links) and use that to set node size:
deg <- degree(net, mode="all")
14
V(net)$size <- deg*3
# We could also use the audience size value:
V(net)$size <- V(net)$audience.size*0.6
# The labels are currently node IDs.
# Setting them to NA will render no labels:
V(net)$label <- NA
# Set edge width based on weight:
E(net)$width <- E(net)$weight/6
#change arrow size and edge color:
E(net)$arrow.size <- .2
E(net)$edge.color <- "gray80"
E(net)$width <- 1+E(net)$weight/12
plot(net)
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test

## or generate node graph file in python
python

import pandas as pd

df = pd.read_csv("7.7MPoplar-CCC-Threshold0.7.distance50kb.abc.txt", sep="\t", header=None, names=["SNP1", "SNP2", "edge.weight"])
print(df)
#                     SNP1          SNP2  edge.weight
# 0              1_58944_T      1_8323_T     0.700286
# 1              1_59246_T      1_8323_T     0.705242
# 2              1_59246_T      1_8378_G     0.700243
# 3              1_59246_T      1_8556_T     0.700634
# 4              1_59246_T      1_8616_T     0.700973
# [141975233 rows x 3 columns]

graph = df.groupby('SNP1')['SNP2'].agg(list).to_dict()

## then with graph can run BFS in python
# graph = {
#   '5' : ['3','7'],
#   '3' : ['2', '4'],
#   '7' : ['8'],
#   '2' : [],
#   '4' : ['8'],
#   '8' : []
# }

visited = [] # List for visited nodes.
queue = []     #Initialize a queue

def bfs(visited, graph, node): #function for BFS
  visited.append(node)
  queue.append(node)
  
  while queue:          # Creating loop to visit each node
    m = queue.pop(0) 
    print (m, end = " ") 
    
    for neighbour in graph[m]:
      if neighbour not in visited:
        visited.append(neighbour)
        queue.append(neighbour)

# Driver Code
print("Following is the Breadth-First Search")
bfs(visited, graph, '1_58944_T')    # function calling
markov clustering (mcl)

https://micans.org/mcl/

source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test

conda install -c conda-forge -c bioconda mcl

#mcl INPUT_FILE_NAME --abc -I 1.5 -o OUTPUT_FILE_NAME

cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v3
# cp /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Projects/archives/cccLOCO/integrator/data/7.7MPoplar-CCC-Threshold0.7.txt .
cut -f 5-7 7.7MPoplar-CCC-Threshold0.7.txt > 7.7MPoplar-CCC-Threshold0.7.abc.txt
# mcl 7.7MPoplar-CCC-Threshold0.7.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.abc.mcl.txt

sed 's/_/ /g' 7.7MPoplar-CCC-Threshold0.7.txt | awk '{print ($5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$8-$5"\t"$9-$6)}' > 7.7MPoplar-CCC-Threshold0.7.distance.txt

awk '{if ($8 != 0 || $9 < -5000 || $9 > 5000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.5kb.txt
awk '{if ($8 != 0 || $9 < -50000 || $9 > 50000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt
awk '{if ($8 != 0 || $9 < -500000 || $9 > 500000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.500kb.txt
awk '{if ($8 != 0) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.chr.txt

awk '{print ($1"_"$2"_"$3"\t"$4"_"$5"_"$6"\t"$7)}' 7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt > 7.7MPoplar-CCC-Threshold0.7.distance.50kb.abc.txt
awk '{print ($1"_"$2"_"$3"\t"$4"_"$5"_"$6"\t"$7)}'  7.7MPoplar-CCC-Threshold0.7.distance.500kb.txt > 7.7MPoplar-CCC-Threshold0.7.distance.500kb.abc.txt
awk '{print ($1"_"$2"_"$3"\t"$4"_"$5"_"$6"\t"$7)}'  7.7MPoplar-CCC-Threshold0.7.distance.chr.txt > 7.7MPoplar-CCC-Threshold0.7.distance.chr.abc.txt
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J ccc.mcl
#SBATCH -N 2
#SBATCH -t 10:00:00

source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test

cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v3
mcl 7.7MPoplar-CCC-Threshold0.7.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.abc.mcl.txt
mcl 7.7MPoplar-CCC-Threshold0.7.distance.50kb.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.distance.50kb.abc.mcl.txt
# 8470
mcl 7.7MPoplar-CCC-Threshold0.7.distance.500kb.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.distance.500kb.abc.mcl.txt
# 1594 clusters
mcl 7.7MPoplar-CCC-Threshold0.7.distance.chr.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.distance.chr.abc.mcl.txt
# 937 clusters

#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v3/ccc.mcl.sh
Count by phenotype
  • for each cluster… calculate frequency of entire SNP pattern for cases v controls <- need to identify case v control samples…
  • run G-test of independence
Phenotype
  • assess putative epistatic interactions for association with specific phenotypes
library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
ccc.gid <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.chr.gene.txt", header=T, sep="\t")
trait.gid <- read.delim("Poplar.GRIN_from_GWATS.All_genes.PerClimateVariable.txt", header=F, sep="\t")
colnames(trait.gid) <- c("trait", "gid")
trait.gid.count <- trait.gid %>% group_by(trait) %>% mutate(count = n()) %>% select(trait, count) %>% distinct()
#   trait   count
#   <chr>   <int>
# 1 aridity    33
# 2 cloud     276
# 3 pet        32
# 4 prec      218
# 5 srad      456
# 6 tmax      138
# 7 tmin      127
# 8 vapr       86
# 9 wind       51

ccc.gid$snp1.gene <- substr(ccc.gid$snp1.gene, 6, 21)
ccc.gid$snp2.gene <- substr(ccc.gid$snp2.gene, 6, 21)
ccc.gid.id <- ccc.gid %>% unite("snp1.id", c(snp1.chr, snp1.pos, snp1.allele), sep="_") %>% unite("snp2.id", c(snp2.chr, snp2.pos, snp2.allele), sep="_")

nrow(trait.gid)
# 1417
nrow(ccc.gid.id)
# 1963239

trait.gid.snp1 <- trait.gid
trait.gid.snp1$snp1.gene <- trait.gid.snp1$gid
ccc.snp1.trait <- inner_join(trait.gid.snp1, ccc.gid.id, by="snp1.gene")
trait.gid.snp2 <- trait.gid
trait.gid.snp2$snp2.gene <- trait.gid.snp2$gid
ccc.snp2.trait <- inner_join(trait.gid.snp2, ccc.gid.id, by="snp2.gene")

ccc.snp1.snp2.trait <- inner_join(ccc.snp1.trait[,c(1,3:6,8)], ccc.snp2.trait[,c(1,3:7)], by=c("trait", "snp1.gene", "snp1.id", "snp2.gene", "snp2.id", "ccc.cor"))
# 25
ccc.snp1.snp2.trait.pair <- subset(ccc.snp1.snp2.trait, ccc.snp1.snp2.trait$snp1.gene != ccc.snp1.snp2.trait$snp2.gene)
# 25
ccc.snp1.snp2.trait.uniq <- unique(ccc.snp1.snp2.trait.pair[,c(1,2,6)])
# 7
#    trait        snp1.gene        snp2.gene
# 1   prec Potri.010G079500 Potri.006G275900
# 5   prec Potri.016G017500 Potri.006G275900
# 7   prec Potri.016G017500 Potri.006G275800
# 11  prec Potri.010G080600 Potri.006G275900
# 13  prec Potri.015G126900 Potri.006G275800
# 14  prec Potri.015G126900 Potri.006G275900
# 22  vapr Potri.010G079500 Potri.006G275900


### in order to really look at this I need to do a clustering analysis and would need to know which samples have each of the phenotypes...

–> what about SNPs 50kb apart instead of on separate chromosomes

library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
gid <- read.delim("Ptrichocarpa_210_v3.0.geneOnly.sorted.gff3", header=F, sep="\t")
ccc.coord <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt", header=F, sep="\t")
colnames(ccc.coord) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "chr.diff", "pos.diff")

colnames(gid) <- c("chromosome", "source", "feature", "start", "end", "dot", "strand", "dot2", "ID")
gid$chr <- as.numeric(substr(gid$chromosome, 4, 5))
gid.df <- gid %>% separate(ID, c("gid", "name"), sep=";")

library(tidygenomics)
ccc.snp1.coord <- unique(ccc.coord[,c(1:3)])
ccc.snp1.coord$chr <- as.numeric(ccc.snp1.coord$snp1.chr)
ccc.snp1.coord$start <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp1.coord$end <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp2.coord <- unique(ccc.coord[,c(4:6)])
ccc.snp2.coord$chr <- as.numeric(ccc.snp2.coord$snp2.chr)
ccc.snp2.coord$start <- as.numeric(ccc.snp2.coord$snp2.pos)
ccc.snp2.coord$end <- as.numeric(ccc.snp2.coord$snp2.pos)

ccc.snp1.gid <- genome_join_closest(ccc.snp1.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp1.distance", mode="full")
ccc.snp2.gid <- genome_join_closest(ccc.snp2.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp2.distance", mode="full")

ccc.coord.snp1.gid <- left_join(ccc.coord, ccc.snp1.gid, by=c("snp1.chr", "snp1.pos", "snp1.allele"))
ccc.coord.gid <- left_join(ccc.coord.snp1.gid, ccc.snp2.gid, by=c("snp2.chr", "snp2.pos", "snp2.allele"))
ccc.coord.gid.df <- ccc.coord.gid[,c(1:7,22,24,37,39)]
colnames(ccc.coord.gid.df) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "snp1.gene", "snp1.gene.dist", "snp2.gene", "snp2.gene.dist")
write.table(ccc.coord.gid.df, "7.7MPoplar-CCC-Threshold0.7.distance.50kb.gene.txt", quote=F, row.names=F, sep="\t")


library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
ccc.gid <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.50kb.gene.txt", header=T, sep="\t")
trait.gid <- read.delim("Poplar.GRIN_from_GWATS.All_genes.PerClimateVariable.txt", header=F, sep="\t")
colnames(trait.gid) <- c("trait", "gid")
trait.gid.count <- trait.gid %>% group_by(trait) %>% mutate(count = n()) %>% select(trait, count) %>% distinct()

ccc.gid$snp1.gene <- substr(ccc.gid$snp1.gene, 6, 21)
ccc.gid$snp2.gene <- substr(ccc.gid$snp2.gene, 6, 21)
ccc.gid.id <- ccc.gid %>% unite("snp1.id", c(snp1.chr, snp1.pos, snp1.allele), sep="_") %>% unite("snp2.id", c(snp2.chr, snp2.pos, snp2.allele), sep="_")

nrow(trait.gid)
nrow(ccc.gid.id)

trait.gid.snp1 <- trait.gid
trait.gid.snp1$snp1.gene <- trait.gid.snp1$gid
ccc.snp1.trait <- inner_join(trait.gid.snp1, ccc.gid.id, by="snp1.gene")
trait.gid.snp2 <- trait.gid
trait.gid.snp2$snp2.gene <- trait.gid.snp2$gid
ccc.snp2.trait <- inner_join(trait.gid.snp2, ccc.gid.id, by="snp2.gene")

ccc.snp1.snp2.trait <- inner_join(ccc.snp1.trait[,c(1,3:6,8)], ccc.snp2.trait[,c(1,3:7)], by=c("trait", "snp1.gene", "snp1.id", "snp2.gene", "snp2.id", "ccc.cor"))
ccc.snp1.snp2.trait.pair <- subset(ccc.snp1.snp2.trait, ccc.snp1.snp2.trait$snp1.gene != ccc.snp1.snp2.trait$snp2.gene)
ccc.snp1.snp2.trait.uniq <- unique(ccc.snp1.snp2.trait.pair[,c(1,2,6)])
Potri.003G137800

v4 poplar data

/gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4 1419 genotypes

tped file generation
cd /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v4
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4/tranche99/maf05/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.bed .
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4/tranche99/maf05/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.bim .
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4/tranche99/maf05/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.fam .

salloc -A SYB105 -p gpu -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test

#conda install -c conda-forge -c bioconda plink2
#conda install -c conda-forge -c bioconda plink
#mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4

cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4

# to generate tped need to use plink/1.9
plink --bfile /gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4/tranche99/maf05/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015 --recode compound-genotypes --out /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped

# transform .ped and .map to generate input CCC file (tped) 
## Columns:
  # 1            chromosome (without leading chr)
  # 2            Locus name
  # 3            Genetic distance, left empty
  # 4            Physical position
  # 5            genotype
## need to transpose .ped file and input position from .map file

##transpose.py
# with open('Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.ped') as file:
#     lis = [x.replace('\n', '').split(' ') for x in file]
# 
# for x in zip(*lis):
#     for y in x:
#         print(y + ' ', end="")
#     print('')

python transpose.py > Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped
# 4955674 Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.map
# 9911354 Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped
### why are there 2x the .tped as the .map ?? because it was in compound-genotype format... go back and specify in PLINK run
# 4955680 Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped

sed -i '1,6d' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped
sed -e 's/\t/ /g' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.map > Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.space.map

# need to add space between compound-genotype
#awk '$1=$1' FS= OFS=" " Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped | sed 's/   / /g' > Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.space.tped
### something going wrong with this command as I am only getting 4941416 total lines instead of 4955674
#sed 's/\(.\)/\1 /g;s/ $//' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped > test2.txt
# 4955674 <- this works properly

sed 's/\(.\)/\1 /g;s/ $//' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped | sed 's/   / /g' > Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.space.tped

paste -d ' ' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.space.map Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.space.tped > ccc_data.tped
Pre-Processing
#!/bin/bash
#BSUB -P SYB105
#BSUB -W 24:00
#BSUB -nnodes 4
#BSUB -J poplar.v4.ccc.pre
#BSUB -o poplar.v4.ccc.pre.%J.out
#BSUB -q batch-hm

module load gcc/5.4.0
#Script and Data path

spath=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/cometTools/ccc/preProcess/Scripts/preProcess/preProcess.sh
dpath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data

#Run command
jsrun --nrs 1 --tasks_per_rs 1 --cpu_per_rs 42 --gpu_per_rs 6 --rs_per_host 1 $spath $dpath/ccc_data.tped
echo "Done!"

# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.pre.sh

### add #BSUB -q batch-hm to job submission script to allow for more time

–> not working due to time limit, can’t generate binary file?? alter to run on Andes…

#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J poplar.v4.ccc.pre.andes
#SBATCH -N 4
#SBATCH -t 10:00:00
#SBATCH -o poplar.v4.ccc.pre.andes-%j.o
#SBATCH -e poplar.v4.ccc.pre.andes-%j.e

module load gcc/5.4.0
#Script and Data path

spath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/preProcess.andes.sh
dpath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4

#Run command
#jsrun --nrs 1 --tasks_per_rs 1 --cpu_per_rs 42 --gpu_per_rs 6 --rs_per_host 1 $spath $dpath/ccc_data.tped
srun -N1 -n1 -c4 $spath $dpath/ccc_data.tped
echo "Done!"

# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/poplar.v4.ccc.pre.andes.sh

–> need to generate a preProcess.andes.sh file with the appropriate andes-build ??? - TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools - binarize=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/cometTools/ccc/preProcess/Tools/binarize/binarizeCCC

–> utilize other scripts?

# /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/new_examples/ccc/preprocess/passed_version/preprocess_single_gabrielsBin.submit

#!/bin/bash
#SBATCH -A syb105
#SBATCH -N 1
#SBATCH -t 60
#SBATCH -J pre
#SBATCH -o output-preprocess-'%A'.txt

# Location of CoMet postprocessing tools.
TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools
export PATH="${PATH}:$TOOLS_DIR"

# do this somewhere before here:
#pushd $TOOLS_DIR
#./make.sh
#popd

# Set the filename to be processed
#FILE_IN="/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/new_examples/ccc/data/ccc_data.tped"
FILE_IN="/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc_data.tped"
FPATH=$(dirname $FILE_IN)
STUB=$(basename $FILE_IN | sed -e 's/.tped$//')

echo "Preprocessing $FILE_IN"
#time $TOOLS_DIR/line_labels.sh $FILE_IN ${STUB}_line_labels.txt

binarize=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/cometTools/ccc/preProcess/Tools/binarize/binarizeCCC

(
  time line_labels.sh $FILE_IN ${STUB}_line_labels.txt
  time line_indices $FILE_IN ${STUB}_line_indices.bin
  time allele_labels.sh $FILE_IN ${STUB}_allele_labels.txt
  time $binarize $FILE_IN ${STUB}.bin
  time preprocess_validate ccc ${STUB}.bin $FILE_IN ${STUB}_allele_labels.txt

) 2>&1 | tee output-preprocess-$STUB.txt

# To save time when using DUO use this altnative to allele_labels creation 
#time sed -e 's/.*/AT/' <${STUB}_line_labels.txt >${STUB}_allele_labels.txt
Running CoMet
# CCC vector length = 1/2 the number of non-whitespace characters
#([ head -n1 [FILE] | wc -w ] - 4) / 2
head -n1 ccc_data.tped | wc -w
### (2842 - 4) / 2 = 1419

# Number of Vectors: The number of vectors is equal to the number of rows.
wc -l ccc_data.tped
# 4955674 
#BSUB -P SYB105
#BSUB -W 10:00
#BSUB -nnodes 500
#BSUB -J poplar.v4.ccc.run
#BSUB -o poplar.v4.ccc.run.%J.out

# bsub -P syb105 -nnodes 100 -W 10 -Is $SHELL

executable=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/build_release_summit/genomics_metric ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'

launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 600 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"

$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 4955674 --num_field 1419 --all2all yes --compute_method GPU --num_proc_vector 600 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 5 --num_stage 1 --verbosity 1 --checksum no --phase_min 4 --phase_max 4 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data.tped.bin --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc

# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.run.sh

# Assertion error: "env->num_phase() <= 1 + env->num_block_vector() / 2 && "num_phase must be at most 1 + num_proc_vector/2."". At file /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/genomics_gpu/src/metrics.cc, line 460, host "g19n08".

# Assertion error: "can_run() && "Invalid problem for this system and build."". At file /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/genomics_gpu/src/env.cc, line 261, host "e27n13".
### set --tc 1

# ERROR: One or more process (first noticed rank 452) terminated with signal 9 
### try adjusting the number of phases parameter.  You will need to set not only --num_phases but also the min and max (to 0 and num_phases-1).  You can also just try running one phase as a test (e.g. --num_phases N --min_phases 0 --max_phases 0).

################# increase nodes and phases ################# 

# try with -nnodes 500 and -W 10:00

#BSUB -P SYB105
#BSUB -W 10:00
#BSUB -nnodes 500
#BSUB -J poplar.v4.ccc.run
#BSUB -o poplar.v4.ccc.run.%J.out

## Note: changed --nrs 2160
env OMP_NUM_THREADS=7 PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1 jsrun --nrs 2160 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1 /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/build_release_summit/genomics_metric --num_way 2 --metric_type ccc --sparse yes --num_vector 4955674 --num_field 1419 --all2all yes --compute_method GPU --num_proc_vector 2160 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 8 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 7 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data.tped.bin --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc

# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.run.8june.sh



################# replicate Gabriel's script ################# 
## /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/comet/summit/ccc/poplar/01-Poplar7M/run

# major change is using num_proc_vector 600
# try with -nnodes 100 and -W 02:00 AND 50 phases
  ### runs but get Error after 10 phases completed saying 
    # User defined signal 2
    # ERROR:  One or more process (first noticed rank 520) terminated with signal 12
# try with --nnodes 360

#BSUB -P SYB105
#BSUB -W 02:00
#BSUB -nnodes 360
#BSUB -J poplar.v4.ccc.run
#BSUB -o poplar.v4.ccc.run.%J.out


executable=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/build_release_summit/genomics_metric ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'

launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 600 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"

$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 4955674 --num_field 1419 --all2all yes --compute_method GPU --num_proc_vector 600 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 50 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 49 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data.tped.bin --threshold 0.7 --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc

# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.run.9june.sh
Post-Processing
#!/bin/bash
#SBATCH -A syb105
#SBATCH -N 1
#SBATCH -t 10
#SBATCH -J poplar.v4.ccc.post
#SBATCH -o poplar.v4.ccc.post.'%A'.txt

# do this somewhere before here:
#pushd $TOOLS_DIR
#./make.sh
#popd

# Location of CoMet postprocessing tools.
TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools
export PATH="${PATH}:$TOOLS_DIR"

NUM_WAY=2

# Extract the file to be processed by this node from the file list.
BASE_PATH_PRE=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/preprocess
FILE_IN=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/poplar.v4.ccc.bin
STUB=ccc_data
mkdir -p out_slurm_$STUB

echo "Postprocessing $FILE_IN"

(
  time postprocess $NUM_WAY \
    $BASE_PATH_PRE/${STUB}_allele_labels.txt \
    $BASE_PATH_PRE/${STUB}_line_labels.txt \
    $FILE_IN \
    $STUB.txt
) 2>&1 | tee out_slurm_$STUB/output-postprocess-$STUB.txt

# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scritps/poplar.v4.ccc.post.sh

/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc_data.txt –> output min 0.474554 max 0.523073 –> how are there no SNPs with a higher correlation? –> what does a permuted dataset look like? try to run CCC on the permuted dataset output from blocbuster

# 4955674 ccc_data.tped  
# 805920 ccc_data.txt #### shouldn't the output have MORE lines than the tped file??

4955674 / 805920
## [1] 6.149089
# 4024284 ccc_data.txt #### still only capturing chr 1???
# cut -d ':' -f 1 ccc_data_line_labels.txt | awk '{if ($1 == "01") print $0}' | wc -l
# 64720
64720
## [1] 64720

*** NEED TO RUN THE POST_PROCESSING FOR EACH INDIVIDUAL .BIN FILE *** - run on poplar.v4.ccc_0.bin, poplar.v4.ccc_1.bin, poplar.v4.ccc_2.bin, poplar.v4.ccc_3.bin, poplar.v4.ccc_4.bin, and poplar.v4.ccc_5.bin - after each run copy ccc_data.txt file to output directory and rename…. then merge all together?

–> USE GABRIEL’s POST-PROCESSING SCRIPT as reference… ultimately run Piet’s script to do post-processing on each phase output and then merge

###PostProcessing

# python
out = open("commands_postprocess.txt", "w")
#for i in range(600):             ## need to specify the 3 digits (001-600 instead of 1-600)
phase = ["%03d" % i for i in range(600)]
for i in phase:
  postprocess = "/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools/postprocess_all.sh 2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data_allele_labels.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data_labels.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc_" + str(i) + ".bin"
  out.write("%s\n" % postprocess)
out.close()

# Piets Script
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name cccInputs --time 48:00:00 --maxpernode 32 --nodes 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/commands_postprocess.txt
# /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools/postprocess_all.sh 2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data_allele_labels.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data_labels.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc_000.bin



#Put *.txt together, postprocessed bin files
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run
cat $(find ./ -name "poplar.v4.ccc_*.txt" | sort -V) > poplar.v4.ccc.threshold0.7.txt
BlocBuster
  • run CCC on blocbuster instead of comet
#!/bin/sh
#SBATCH -A SYB105
#SBATCH -N 1
#SBATCH -p batch
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
#SBATCH -o blocbuster.ccc-%j.o
#SBATCH -e blocbuster.ccc-%j.e

genFile=ccc_data_compound.tped
numInd=1419
numSnps=4955674
thresh=0.7
numHeadRows=0
numHeadCols=4

# Assumed rows represent SNPs 

Gml=ccc.gml

/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/BlocBuster/ccc/ccc ${genFile} ${Gml} ${thresh} ${numInd} ${numSnps} ${numHeadRows} ${numHeadCols}

echo Finished
echo

# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/sharlee.ccc.sh
Permutation
  • generate a randomly permute the matrix and run CCC to determine what the highest CCC value is that you get by chance. Then use that as a cut off, but this can be computationally insane b/c u have to run it 10,000 times…
  • randomize the SNPs within each sample…
https://zzz.bwh.harvard.edu/plink/perm.shtml
# For some tests, however, these procedures are not available (e.g. SNP x SNP epistasis tests).

http://www.cs.umsl.edu/~climer/blocBuster/README_perm
# blocBuster: perm
#    perm input.txt output.txt numDataCols numDataRows numHeadCols numHeadRows
# where 
# - 'input.txt' is your genotype data
# - 'output.txt' will hold the permuted data
# - 'numDataCols' is the number of individuals
# - 'numDataRows' is the number of SNPs
# - 'numHeadCols' is the number of header columns
# - 'numHeadRows' is the number of header rows

cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
paste -d ' ' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.space.map Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped > ccc_data_compound.tped

#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc_data_compound.tped /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v4/.

cd /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v4/
../../../../Resources/BlocBuster/perm/perm ccc_data_compound.tped ccc_data_compound.perm.txt 1419 4955674 4 0

# run blocBuster: http://www.cs.umsl.edu/~climer/blocBuster/README
## http://www.cs.umsl.edu/~climer/blocBuster/README_ccc
tr ' ' \\t < ccc_data_compound.perm.txt > ccc_data_compound.perm.tab.txt
tr ' ' \\t < ccc_data_compound.tped > ccc_data_compound.tab.txt
sed 's/00/NN/g' ccc_data_compound.perm.tab.txt > ccc_data_compound.perm.blocBuster.input.txt
sed 's/00/NN/g' ccc_data_compound.tab.txt > ccc_data_compound.blocBuster.input.txt

##***Important: Assumed rows represent individuals and columns represent SNPs in input file.***
## Need to adjust [ROWS_R_SNPS in 'bloc.h' to 1]
## or transpose file --> in R


../../../../Resources/BlocBuster/ccc/ccc ccc_data_compound.perm.blocBuster.input.txt ccc_data_compound.perm.gml 0.7 1419 4955674 0 4
# 1419 individuals and 4955674 SNPs in entire input file.
# Threshold of 0.7 used.
# 
# Reading in data...
# 0 snps have only one allele in dataset.
# 27989824 and 27989824 missing values in first and second SNP sets, respectively.
# 
# Computing CCC values...


../../../../Resources/BlocBuster/ccc/ccc ccc_data_compound.blocBuster.input.txt ccc_data_compound.gml 0.7 1419 4955674 0 4
# 1419 individuals and 4955674 SNPs in entire input file.
# Threshold of 0.7 used.
# 
# Reading in data...
# 0 snps have only one allele in dataset.
# 27989824 and 27989824 missing values in first and second SNP sets, respectively.
# 
# Computing CCC values...

#### takes too long to run on a local machine...
Sharlee’s blocBuster permutation method
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
paste -d ' ' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.space.map Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped > ccc_data_compound.tped

## http://www.cs.umsl.edu/~climer/blocBuster/README
## adjust bloc.h file: PRINTNUMEDGES = 1 && ROWS_R_SNPS = 1

#!/bin/sh
#SBATCH -A SYB105
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
#SBATCH -o blocbuster.perm-%j.o
#SBATCH -e blocbuster.perm-%j.e

genFile=ccc_data_compound.tped
numInd=1419
numSnps=4955674
thresh=0.7
numHeadRows=0
numHeadCols=4
numTrials=1000

# Assumed rows represent SNPs 

tempGen=temp.gen
tempGml=temp.gml

let trial=1

while test ${trial} -le ${numTrials} ; do

  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/BlocBuster/perm/perm ${genFile} ${tempGen} ${numInd} ${numSnps} ${numHeadCols} ${numHeadRows}

  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/BlocBuster/ccc/ccc ${tempGen} ${tempGml} ${thresh} ${numInd} ${numSnps} ${numHeadRows} ${numHeadCols}

  let trial=trial+1

done

echo Finished
echo

# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/sharlee.perm.sh
comet on permuted dataset
# convert compound permuted data to proper tped format (add space between compound genotypes in temp.gen)
cut -d ' ' -f 1-4 temp.gen > temp.gen.labels
cut -d ' ' -f5- temp.gen | sed 's/./& /g' | sed 's/  //g' > temp.gen.alleles
paste -d ' ' temp.gen.labels temp.gen.alleles > temp.gen.tped

#!/bin/bash
#BSUB -P SYB105
#BSUB -W 24:00
#BSUB -nnodes 4
#BSUB -J perm.ccc.pre
#BSUB -o perm.ccc.pre.%J.out
#BSUB -q batch-hm

module load gcc/5.4.0
#Script and Data path

spath=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/cometTools/ccc/preProcess/Scripts/preProcess/preProcess.sh
#spath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/preProcess_binarize.sh
dpath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/perm

#Run command
jsrun --nrs 1 --tasks_per_rs 1 --cpu_per_rs 42 --gpu_per_rs 6 --rs_per_host 1 $spath $dpath/temp.gen.tped
echo "Done!"

# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scipts
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.pre.perm.sh
#BSUB -P SYB105
#BSUB -W 10:00
#BSUB -nnodes 100
#BSUB -J poplar.v4.ccc.run
#BSUB -o poplar.v4.ccc.run.%J.out

# bsub -P syb105 -nnodes 100 -W 10 -Is $SHELL

executable=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/build_release_summit/genomics_metric ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'

launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 600 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"

$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 4955674 --num_field 1419 --all2all yes --compute_method GPU --num_proc_vector 600 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 1 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 0 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/perm/temp.gen.tped.bin --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/perm/poplar.v4.ccc.perm

# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.run.perm.sh
#!/bin/bash
#SBATCH -A syb105
#SBATCH -N 1
#SBATCH -t 10
#SBATCH -J perm.ccc.post
#SBATCH -o perm.ccc.post.'%A'.txt

# do this somewhere before here:
#pushd $TOOLS_DIR
#./make.sh
#popd

# Location of CoMet postprocessing tools.
TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools
export PATH="${PATH}:$TOOLS_DIR"

NUM_WAY=2

# Extract the file to be processed by this node from the file list.
BASE_PATH_PRE=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
FILE_IN=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/perm/poplar.v4.ccc.perm_0.bin
STUB=ccc_data
mkdir -p out_slurm_$STUB

echo "Postprocessing $FILE_IN"

(
  time postprocess $NUM_WAY \
    $BASE_PATH_PRE/${STUB}_allele_labels.txt \
    $BASE_PATH_PRE/${STUB}_line_labels.txt \
    $FILE_IN \
    $STUB.txt
) 2>&1 | tee out_slurm_$STUB/output-postprocess-$STUB.txt

# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/poplar.v4.ccc.post.perm.sh
Clustering
  • run MCL clustering
    • generate an adjacency matrix
    • cluster SNP pairs
  • or just run blocbuster?