# scripts: /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/TEMPLATES/ccc
# test data: /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/TEMPLATES/example_data/ccc_data.tped
# CoMet build: /ccs/proj/syb105/sw/summit/comet/src/build_single_release_summit/genomics_metric
# running CoMet: https://docs.google.com/document/d/1Dz4bDciSW5ZSMIkHq3LQ6uBxGzf7CR3XVz3SFRuyDHY/edit#heading=h.on7f9i5ekak9
# test data looks like this:
tig00000000 tig00000000:157495 0 157495 G G G G G G G G G G A A G G G G G G A
tig00000000 tig00000000:162166 0 162166 C C C C T C C C C C C C C C C C C C T
tig00000000 tig00000000:162175 0 162175 T T T T C T C T T T T T T T T T T T C
tig00000000 tig00000000:162427 0 162427 A A A A A A A A A A A A A A A A A A A
tig00000000 tig00000000:162430 0 162430 A A C A A A A A A A A A A A A A A A A
tig00000000 tig00000000:162439 0 162439 G G A G A A G G G G G G G G G G A G G
tig00000000 tig00000000:249674 0 249674 T T T T T T T T C C C T T T T T C T T
tig00000000 tig00000000:405503 0 405503 G G G G G G G G G G G G G G T G G G G
tig00000000 tig00000000:645635 0 645635 T T T T T T T T T T T T T T T T T T T
tig00000000 tig00000000:697464 0 697464 C C C C C C C C C C C C C C C C C C A
#!/bin/bash
#BSUB -P SYB105
#BSUB -W 00:30
#BSUB -nnodes 1
#BSUB -J duo_pre
#BSUB -o duo_pre.%J.out
module load gcc/5.4.0
#Script and Data path
spath=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/cometTools/ccc/preProcess/Scripts/preProcess/preProcess.sh
dpath=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMeT/example_data
#Run command
jsrun --nrs 1 --tasks_per_rs 1 --cpu_per_rs 42 --gpu_per_rs 6 --rs_per_host 1 $spath $dpath/ccc_data.tped
echo "Done!"
# CCC vector length = (number of whitespace-separated fields - 4 label columns) / 2
#( $(head -n1 [FILE] | wc -w) - 4 ) / 2
head -n1 ccc_data.tped | wc -w
### (648 - 4) / 2 = 322
# Number of Vectors: The number of vectors is equal to the number of rows.
wc -l ccc_data.tped
# 16339
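# A quick python sketch of the same two calculations (assumption: 4 leading label columns, as in the example tped above):
with open("ccc_data.tped") as f:
    first = f.readline().split()
    num_field = (len(first) - 4) // 2     # vector length: (648 - 4) / 2 = 322
    num_vector = 1 + sum(1 for _ in f)    # number of vectors = number of rows: 16339
print("num_vector =", num_vector, "num_field =", num_field)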
Use the spreadsheet [https://docs.google.com/spreadsheets/d/1BD2MIVE2ajlEde4UfVzO4Hx61VkxtKn-/edit#gid=965603301] to determine the proper commands. These are the primary cells you should change in both cases:
- Cell C2 should be set to your project (e.g. syb105)
- Cell C10 (num_way) should be set to 2 for 2-way or 3 for 3-way
- Cell C11 (metric) should be set to ccc/duo/czek
- Cell C13 (num_vector) should be set to the number of vectors
- Cell C14 (num_field) should be set to the vector length
- Cell C16 (system) should be set to the supercomputer (summit/perlmutter/titan)
- Cell C19 (num_proc_vector) should be set to how many output files you want
* You need enough output files that each one is small enough for post-processing to complete in a reasonable time.
** Once you set those cells, scroll down to the auto-computed metrics and make sure they are in the green/yellow. Red in C47 isn't a big deal; it just means the problem is small and thus not optimizing resources.
A rough sketch of how these cells map onto the genomics_metric flags is given below.
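# A minimal python sketch of the mapping (an assumption inferred from these notes, not taken from the spreadsheet itself); C2 (project) and C16 (system) feed the job script rather than the genomics_metric command:
cell_to_flag = {
    "C10": "--num_way",          # 2 for 2-way, 3 for 3-way
    "C11": "--metric_type",      # ccc / duo / czek
    "C13": "--num_vector",       # number of rows (SNPs) in the tped
    "C14": "--num_field",        # vector length
    "C19": "--num_proc_vector",  # number of output files
}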
#interactive command
bsub -P syb105 -nnodes 4500 -W 10 -Is $SHELL
#submission command
### Be sure to change the path to the executable to where you installed it. You will also need to change the --input_file and --output_file_stub in the last line.
executable=install_single_release_summit/bin/genomics_metric
ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'
launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 27000 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"
$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 16339 --num_field 322 --all2all yes --compute_method GPU --num_proc_vector 27000 --num_proc_field 1 --num_proc_repl 1 --tc 2 --num_tc_steps 4 --num_phase 1 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 0 --stage_min 0 --stage_max 0
#BSUB -P SYB105
#BSUB -W 00:15
#BSUB -nnodes 17
#BSUB -J ccc-test
#BSUB -o ccc-test-100files.%J.out
#bsub -P syb106 -nnodes 80 -W 15 -Is $SHELL
executable=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/CoMeT/build_release_summit/genomics_metric
ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'
launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 102 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"
$launch_command $executable --metric_type ccc --sparse yes --num_vector 16339 --num_field 322 --threshold 0 --all2all yes --compute_method GPU --num_proc_vector 100 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 1 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 0 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMeT/culex/preProcess/data/ccc_data.bin --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMeT/culex/output_ccc_100files/result
#!/bin/bash
#SBATCH -A syb105
#SBATCH -N 1
#SBATCH -t 10
#SBATCH -J post
#SBATCH -o output-postprocess-'%A'.txt
# do this somewhere before here:
#pushd $TOOLS_DIR
#./make.sh
#popd
# Location of CoMet postprocessing tools.
TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools
export PATH="${PATH}:$TOOLS_DIR"
NUM_WAY=2
# Extract the file to be processed by this node from the file list.
BASE_PATH_PRE=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/new_examples/ccc/preprocess
FILE_IN=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/new_examples/ccc/result_0.bin
STUB=ccc_data
mkdir -p out_slurm_$STUB
echo "Postprocessing $FILE_IN"
(
time postprocess $NUM_WAY \
$BASE_PATH_PRE/${STUB}_allele_labels.txt \
$BASE_PATH_PRE/${STUB}_line_labels.txt \
$FILE_IN \
$STUB.txt
) 2>&1 | tee out_slurm_$STUB/output-postprocess-$STUB.txt
Opioid use disorder
Chris Rench (London School of Hygiene and Tropical Medicine)
Citadel does exist (allows use of CoMet) BUT we are waiting for approval
Subset cases and controls by ancestry
access blocbuster from the virtual kdi machine: https://kdinxmgt01.ornlkdi.org/all/misc_packages/BlocBuster_v.1.1.tar.gz
step 1: clarification on cases / controls from Chris (Mirko)
step 2: separate data by ancestry
step 3: run single ancestry on blocbuster on Murphy (for cases and controls separately) [african, hispanic, european]
step 4: determine computational power needed… can it be completed on Murphy?
step 5: if yes to above, continue with additional ancestries and cross-ancestry… if no, subset to chromosome by chromosome CCC runs
imputed data in PLINK format; un-imputed data in ???
--> data wrangle by ancestry and case/control --> per individual… non-imputed data to allele format
raw SNP dataset ~30 million SNPs --> trans 90 or 99?
version 3: LD removed --> 769,000 SNPs
version 4… trans99 (some noise but not missing essential SNPs): /gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4
filters to get 7.7 million SNPs in version 3 and 4.8 million SNPs in version 4
LD > 0.7 removed --> ~800,000 SNPs used for GWAS
#### v3 poplar data
/gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/cccLOCO/01/data/originalTPED-7.7M
/gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/comet/summit/ccc/poplar
--> /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Shared/CCC/1-Poplar7.7M
Hello Team,
I ran CoMet 2-Way CCC on file /lustre/or-hydra/cades-bsd/dk0/VCF3/gatk_882_PASS.maf01.CR0.9.annotgeneid.bescID.tped provided by David with threshold 0.7.
The provided TPED had 7 710 681 SNPs
~ Number of Possible Pairs = 7710681 * 7710681 / 2 = 29727300741880
~ Number of Lines CoMet saved with threshold > 0.7 = 294095095
Conversion Rate = 29727300741880 / 294095095 = 101080
With Threshold 0.7 we saved 1 for every 101K lines
(we were thinking 1 to 3M)
The results are available here: /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Shared/CCC/1-Poplar7.7M/7.7MPoplar-CCC-Threshold0.7.txt
Best,
Joao Gabriel Gazolla
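# A quick python check of the arithmetic in the email above (pair count approximated as n*n/2, as in the note):
n_snps = 7710681
n_kept = 294095095
n_pairs = n_snps * n_snps // 2   # ~2.97e13 possible pairs
print(n_pairs // n_kept)         # ~101080, i.e. roughly 1 pair kept per 101K at threshold 0.7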
/gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Projects/archives/cccLOCO/integrator/data/7.7MPoplar-CCC-Threshold0.7.txt
863 genotypes
library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
gid <- read.delim("Ptrichocarpa_210_v3.0.geneOnly.sorted.gff3", header=F, sep="\t")
#ccc <- read.delim("7.7MPoplar-CCC-Threshold0.7-100.txt", header=F, sep=" ")
ccc <- read.delim("7.7MPoplar-CCC-Threshold0.7.txt", header=F, sep=" ")
colnames(gid) <- c("chromosome", "source", "feature", "start", "end", "dot", "strand", "dot2", "ID")
gid$chr <- as.numeric(substr(gid$chromosome, 4, 5))
gid.df <- gid %>% separate(ID, c("gid", "name"), sep=";")
ccc.snp1 <- ccc %>% separate(V5, c("snp1.chr", "snp1.pos", "snp1.allele"), sep="_")
ccc.coord <- ccc.snp1 %>% separate(V6, c("snp2.chr", "snp2.pos", "snp2.allele"), sep="_")
library(tidygenomics)
ccc.snp1.coord <- unique(ccc.coord[,c(5:6)])
ccc.snp1.coord$chr <- as.numeric(ccc.snp1.coord$snp1.chr)
ccc.snp1.coord$start <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp1.coord$end <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp2.coord <- unique(ccc.coord[,c(8:9)])
ccc.snp2.coord$chr <- as.numeric(ccc.snp2.coord$snp2.chr)
ccc.snp2.coord$start <- as.numeric(ccc.snp2.coord$snp2.pos)
ccc.snp2.coord$end <- as.numeric(ccc.snp2.coord$snp2.pos)
ccc.snp1.gid <- genome_join_closest(ccc.snp1.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp1.distance", mode="full")
ccc.snp2.gid <- genome_join_closest(ccc.snp2.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp2.distance", mode="full")
ccc.coord.snp1.gid <- left_join(ccc.coord, ccc.snp1.gid[,c(1,2,14,17)], by=c("snp1.chr", "snp1.pos"))
ccc.coord.gid <- left_join(ccc.coord.snp1.gid, ccc.snp2.gid[,c(1,2,14,17)], by=c("snp2.chr", "snp2.pos"))
ccc.coord.gid.df <- ccc.coord.gid[,5:15]
colnames(ccc.coord.gid.df) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "snp1.gene", "snp1.gene.dist", "snp2.gene", "snp2.gene.dist")
#write.table(ccc.coord.gid.df, "7.7MPoplar-CCC-Threshold0.7-100.Ptrichocarpa_210_v3.0.txt", quote=F, row.names=F, sep="\t")
# 1481
write.table(ccc.coord.gid.df, "7.7MPoplar-CCC-Threshold0.7.Ptrichocarpa_210_v3.0.txt", quote=F, row.names=F, sep="\t")
ccc.coord.gid.df.distinct <- ccc.coord.gid.df %>% distinct(snp1.gene, snp2.gene, .keep_all = T)
# 295
ccc.coord.gid.df.distinct.pair <- subset(ccc.coord.gid.df.distinct, ccc.coord.gid.df.distinct$snp1.gene != ccc.coord.gid.df.distinct$snp2.gene)
# 292
#write.table(ccc.coord.gid.df.distinct.pair, "7.7MPoplar-CCC-Threshold0.7-100.Ptrichocarpa_210_v3.0.distinctgenepair.txt", quote=F, row.names=F, sep="\t")
write.table(ccc.coord.gid.df.distinct.pair, "7.7MPoplar-CCC-Threshold0.7.Ptrichocarpa_210_v3.0.distinctgenepair.txt", quote=F, row.names=F, sep="\t")
### filter to genes within 1kb of SNP
ccc.coord.gid.df.distinct.pair.1kb <- subset(ccc.coord.gid.df.distinct.pair, abs(ccc.coord.gid.df.distinct.pair$snp1.gene.dist) < 1000 & abs(ccc.coord.gid.df.distinct.pair$snp2.gene.dist) < 1000)
# 70
#write.table(ccc.coord.gid.df.distinct.pair.1kb, "7.7MPoplar-CCC-Threshold0.7-100.Ptrichocarpa_210_v3.0.distinctgenepair.1kb.txt", quote=F, row.names=F, sep="\t")
write.table(ccc.coord.gid.df.distinct.pair.1kb, "7.7MPoplar-CCC-Threshold0.7.Ptrichocarpa_210_v3.0.distinctgenepair.1kb.txt", quote=F, row.names=F, sep="\t")
/gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/comet/summit/ccc/poplar/01-Poplar7M/run
bsub -P syb105 -nnodes 100 -W 15 -Is $SHELL
executable=/gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Apps/comet/build_release_summit/genomics_metric
ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'
launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 600 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"
$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 7710681 --num_field 882 --all2all yes --compute_method GPU --num_proc_vector 600 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 50 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 49 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/CCC/01-Poplar7M/preprocess/7M.bin --threshold 0.7 --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/CCC/01-Poplar7M/run/output/7M_
ctime 227.696209 ops 2.134692e+17 ops_rate 9.375174e+14 ops_rate/proc 1.562529e+12 vcmp 2.973130e+13 vcmpout 2.940951e+08 cmp 1.048920e+17 ecmp 2.622301e+16 ecmp_rate 1.151666e+14 ecmp_rate/proc 1.919444e+11 vctime 0.068704 mctime 16.262581 intime 0.738858 outtime 213.069819 cpumem 5.154618e+10 gpumem 5.386325e+09 tottime 457.86526
cd "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3"
sed 's/_/ /g' 7.7MPoplar-CCC-Threshold0.7.txt | awk '{print ($5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$8-$5"\t"$9-$6)}' > 7.7MPoplar-CCC-Threshold0.7.distance.txt
awk '{if ($8 != 0 || $9 < -5000 || $9 > 5000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.5kb.txt
awk '{if ($8 != 0 || $9 < -50000 || $9 > 50000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt
awk '{if ($8 != 0 || $9 < -500000 || $9 > 500000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.500kb.txt
awk '{if ($8 != 0) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.chr.txt
# 294095095 7.7MPoplar-CCC-Threshold0.7.distance.txt
# 141975233 7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt
# 216770824 7.7MPoplar-CCC-Threshold0.7.distance.5kb.txt
# 1961386 7.7MPoplar-CCC-Threshold0.7.distance.chr.txt
library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
gid <- read.delim("Ptrichocarpa_210_v3.0.geneOnly.sorted.gff3", header=F, sep="\t")
ccc.coord <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.chr.txt", header=F, sep="\t")
colnames(ccc.coord) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "chr.diff", "pos.diff")
colnames(gid) <- c("chromosome", "source", "feature", "start", "end", "dot", "strand", "dot2", "ID")
gid$chr <- as.numeric(substr(gid$chromosome, 4, 5))
gid.df <- gid %>% separate(ID, c("gid", "name"), sep=";")
library(tidygenomics)
ccc.snp1.coord <- unique(ccc.coord[,c(1:3)])
ccc.snp1.coord$chr <- as.numeric(ccc.snp1.coord$snp1.chr)
ccc.snp1.coord$start <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp1.coord$end <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp2.coord <- unique(ccc.coord[,c(4:6)])
ccc.snp2.coord$chr <- as.numeric(ccc.snp2.coord$snp2.chr)
ccc.snp2.coord$start <- as.numeric(ccc.snp2.coord$snp2.pos)
ccc.snp2.coord$end <- as.numeric(ccc.snp2.coord$snp2.pos)
ccc.snp1.gid <- genome_join_closest(ccc.snp1.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp1.distance", mode="full")
ccc.snp2.gid <- genome_join_closest(ccc.snp2.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp2.distance", mode="full")
ccc.coord.snp1.gid <- left_join(ccc.coord, ccc.snp1.gid, by=c("snp1.chr", "snp1.pos", "snp1.allele"))
ccc.coord.gid <- left_join(ccc.coord.snp1.gid, ccc.snp2.gid, by=c("snp2.chr", "snp2.pos", "snp2.allele"))
ccc.coord.gid.df <- ccc.coord.gid[,c(1:7,22,24,37,39)]
colnames(ccc.coord.gid.df) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "snp1.gene", "snp1.gene.dist", "snp2.gene", "snp2.gene.dist")
write.table(ccc.coord.gid.df, "7.7MPoplar-CCC-Threshold0.7.distance.chr.gene.txt", quote=F, row.names=F, sep="\t")
ccc.coord.gid.df.distinct <- ccc.coord.gid.df %>% distinct(snp1.gene, snp2.gene, .keep_all = T)
ccc.coord.gid.df.distinct.pair <- subset(ccc.coord.gid.df.distinct, ccc.coord.gid.df.distinct$snp1.gene != ccc.coord.gid.df.distinct$snp2.gene)
# 5557
write.table(ccc.coord.gid.df.distinct.pair, "7.7MPoplar-CCC-Threshold0.7.distance.chr.gene.distinct.txt", quote=F, row.names=F, sep="\t")
ccc.coord.gid.df.INgene <- subset(ccc.coord.gid.df.distinct.pair, ccc.coord.gid.df.distinct.pair$snp1.gene.dist == 0 & ccc.coord.gid.df.distinct.pair$snp2.gene.dist == 0)
# 129
## I have 129 gene pairs for potential epistatic interaction based on SNPs on different chromosomes with a correlation >0.7 and annotated within a known gene (in v3 poplar data) <-- this does not include any clustering
ccc.coord.gid.df.NEARgene <- subset(ccc.coord.gid.df.distinct.pair, ccc.coord.gid.df.distinct.pair$snp1.gene.dist < 1000 & ccc.coord.gid.df.distinct.pair$snp2.gene.dist < 1000)
# 493
ccc.coord.gid.df.NEARgene <- subset(ccc.coord.gid.df.distinct.pair, ccc.coord.gid.df.distinct.pair$snp1.gene.dist < 5000 & ccc.coord.gid.df.distinct.pair$snp2.gene.dist < 5000)
# 1992
https://nph.onlinelibrary.wiley.com/doi/10.1111/j.1469-8137.2009.03174.x
/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3/7.7MPoplar-CCC-Threshold0.7.txt
https://favtutor.com/blogs/breadth-first-search-python
## first need to make a graph from the ABC-format file
install.packages("igraph")
install.packages("network")
install.packages("sna")
install.packages("ndtv")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v3")
abc <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.50kb.abc.txt", header=F, sep="\t", stringsAsFactors = F)
### create a node graph file
### visualizing the network
library(igraph)
# the tutorial template builds the graph from separate links/nodes data frames; here the graph
# can be built directly from the ABC edge list read above (CCC pairs are undirected)
colnames(abc) <- c("from", "to", "weight")
net <- graph.data.frame(abc, directed=F)
net <- simplify(net, remove.multiple = F, remove.loops = T)
plot(net, edge.arrow.size=.4,vertex.label=NA)
# Generate colors based on a node attribute (media.type comes from the igraph tutorial's example
# data; an equivalent vertex attribute would need to be defined for the SNP network):
colrs <- c("gray50", "tomato", "gold")
#V(net)$color <- colrs[V(net)$media.type]
# Compute node degrees (#links) and use that to set node size:
deg <- degree(net, mode="all")
V(net)$size <- deg*3
# We could also use the audience size value (again from the tutorial's example data):
#V(net)$size <- V(net)$audience.size*0.6
# The labels are currently node IDs.
# Setting them to NA will render no labels:
V(net)$label <- NA
# Set edge width based on weight:
E(net)$width <- E(net)$weight/6
#change arrow size and edge color:
E(net)$arrow.size <- .2
E(net)$edge.color <- "gray80"
E(net)$width <- 1+E(net)$weight/12
plot(net)
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
## or generate node graph file in python
python
import pandas as pd
df = pd.read_csv("7.7MPoplar-CCC-Threshold0.7.distance.50kb.abc.txt", sep="\t", header=None, names=["SNP1", "SNP2", "edge.weight"])
print(df)
# SNP1 SNP2 edge.weight
# 0 1_58944_T 1_8323_T 0.700286
# 1 1_59246_T 1_8323_T 0.705242
# 2 1_59246_T 1_8378_G 0.700243
# 3 1_59246_T 1_8556_T 0.700634
# 4 1_59246_T 1_8616_T 0.700973
# [141975233 rows x 3 columns]
graph = df.groupby('SNP1')['SNP2'].agg(list).to_dict()
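## Note: the groupby above only stores SNP1 -> SNP2 edges, so a BFS stops at any SNP that never
## appears in the SNP1 column. A hedged sketch of a symmetric adjacency dict (assuming the CCC
## network should be treated as undirected); graph_undirected is a new name, not from the run above:
from collections import defaultdict
graph_undirected = defaultdict(list)
for a, b in zip(df["SNP1"], df["SNP2"]):
    graph_undirected[a].append(b)
    graph_undirected[b].append(a)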
## then with graph can run BFS in python
# graph = {
# '5' : ['3','7'],
# '3' : ['2', '4'],
# '7' : ['8'],
# '2' : [],
# '4' : ['8'],
# '8' : []
# }
visited = [] # List for visited nodes.
queue = []   # Initialize a queue

def bfs(visited, graph, node):  # function for BFS
    visited.append(node)
    queue.append(node)
    while queue:  # loop to visit each node in FIFO order
        m = queue.pop(0)
        print(m, end=" ")
        for neighbour in graph.get(m, []):  # .get() avoids a KeyError for SNPs with no outgoing edges
            if neighbour not in visited:
                visited.append(neighbour)
                queue.append(neighbour)

# Driver Code
print("Following is the Breadth-First Search")
bfs(visited, graph, '1_58944_T')  # function calling
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
conda install -c conda-forge -c bioconda mcl
#mcl INPUT_FILE_NAME --abc -I 1.5 -o OUTPUT_FILE_NAME
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v3
# cp /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Projects/archives/cccLOCO/integrator/data/7.7MPoplar-CCC-Threshold0.7.txt .
cut -f 5-7 7.7MPoplar-CCC-Threshold0.7.txt > 7.7MPoplar-CCC-Threshold0.7.abc.txt
# mcl 7.7MPoplar-CCC-Threshold0.7.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.abc.mcl.txt
sed 's/_/ /g' 7.7MPoplar-CCC-Threshold0.7.txt | awk '{print ($5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$8-$5"\t"$9-$6)}' > 7.7MPoplar-CCC-Threshold0.7.distance.txt
awk '{if ($8 != 0 || $9 < -5000 || $9 > 5000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.5kb.txt
awk '{if ($8 != 0 || $9 < -50000 || $9 > 50000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt
awk '{if ($8 != 0 || $9 < -500000 || $9 > 500000) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.500kb.txt
awk '{if ($8 != 0) print $0}' 7.7MPoplar-CCC-Threshold0.7.distance.txt > 7.7MPoplar-CCC-Threshold0.7.distance.chr.txt
awk '{print ($1"_"$2"_"$3"\t"$4"_"$5"_"$6"\t"$7)}' 7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt > 7.7MPoplar-CCC-Threshold0.7.distance.50kb.abc.txt
awk '{print ($1"_"$2"_"$3"\t"$4"_"$5"_"$6"\t"$7)}' 7.7MPoplar-CCC-Threshold0.7.distance.500kb.txt > 7.7MPoplar-CCC-Threshold0.7.distance.500kb.abc.txt
awk '{print ($1"_"$2"_"$3"\t"$4"_"$5"_"$6"\t"$7)}' 7.7MPoplar-CCC-Threshold0.7.distance.chr.txt > 7.7MPoplar-CCC-Threshold0.7.distance.chr.abc.txt
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J ccc.mcl
#SBATCH -N 2
#SBATCH -t 10:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v3
mcl 7.7MPoplar-CCC-Threshold0.7.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.abc.mcl.txt
mcl 7.7MPoplar-CCC-Threshold0.7.distance.50kb.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.distance.50kb.abc.mcl.txt
# 8470 clusters
mcl 7.7MPoplar-CCC-Threshold0.7.distance.500kb.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.distance.500kb.abc.mcl.txt
# 1594 clusters
mcl 7.7MPoplar-CCC-Threshold0.7.distance.chr.abc.txt --abc -I 1.5 -o 7.7MPoplar-CCC-Threshold0.7.distance.chr.abc.mcl.txt
# 937 clusters
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v3/ccc.mcl.sh
library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
ccc.gid <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.chr.gene.txt", header=T, sep="\t")
trait.gid <- read.delim("Poplar.GRIN_from_GWATS.All_genes.PerClimateVariable.txt", header=F, sep="\t")
colnames(trait.gid) <- c("trait", "gid")
trait.gid.count <- trait.gid %>% group_by(trait) %>% mutate(count = n()) %>% select(trait, count) %>% distinct()
# trait count
# <chr> <int>
# 1 aridity 33
# 2 cloud 276
# 3 pet 32
# 4 prec 218
# 5 srad 456
# 6 tmax 138
# 7 tmin 127
# 8 vapr 86
# 9 wind 51
ccc.gid$snp1.gene <- substr(ccc.gid$snp1.gene, 6, 21)
ccc.gid$snp2.gene <- substr(ccc.gid$snp2.gene, 6, 21)
ccc.gid.id <- ccc.gid %>% unite("snp1.id", c(snp1.chr, snp1.pos, snp1.allele), sep="_") %>% unite("snp2.id", c(snp2.chr, snp2.pos, snp2.allele), sep="_")
nrow(trait.gid)
# 1417
nrow(ccc.gid.id)
# 1963239
trait.gid.snp1 <- trait.gid
trait.gid.snp1$snp1.gene <- trait.gid.snp1$gid
ccc.snp1.trait <- inner_join(trait.gid.snp1, ccc.gid.id, by="snp1.gene")
trait.gid.snp2 <- trait.gid
trait.gid.snp2$snp2.gene <- trait.gid.snp2$gid
ccc.snp2.trait <- inner_join(trait.gid.snp2, ccc.gid.id, by="snp2.gene")
ccc.snp1.snp2.trait <- inner_join(ccc.snp1.trait[,c(1,3:6,8)], ccc.snp2.trait[,c(1,3:7)], by=c("trait", "snp1.gene", "snp1.id", "snp2.gene", "snp2.id", "ccc.cor"))
# 25
ccc.snp1.snp2.trait.pair <- subset(ccc.snp1.snp2.trait, ccc.snp1.snp2.trait$snp1.gene != ccc.snp1.snp2.trait$snp2.gene)
# 25
ccc.snp1.snp2.trait.uniq <- unique(ccc.snp1.snp2.trait.pair[,c(1,2,6)])
# 7
# trait snp1.gene snp2.gene
# 1 prec Potri.010G079500 Potri.006G275900
# 5 prec Potri.016G017500 Potri.006G275900
# 7 prec Potri.016G017500 Potri.006G275800
# 11 prec Potri.010G080600 Potri.006G275900
# 13 prec Potri.015G126900 Potri.006G275800
# 14 prec Potri.015G126900 Potri.006G275900
# 22 vapr Potri.010G079500 Potri.006G275900
### in order to really look at this I need to do a clustering analysis and would need to know which samples have each of the phenotypes...
--> what about SNPs 50kb apart instead of on separate chromosomes
library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
gid <- read.delim("Ptrichocarpa_210_v3.0.geneOnly.sorted.gff3", header=F, sep="\t")
ccc.coord <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.50kb.txt", header=F, sep="\t")
colnames(ccc.coord) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "chr.diff", "pos.diff")
colnames(gid) <- c("chromosome", "source", "feature", "start", "end", "dot", "strand", "dot2", "ID")
gid$chr <- as.numeric(substr(gid$chromosome, 4, 5))
gid.df <- gid %>% separate(ID, c("gid", "name"), sep=";")
library(tidygenomics)
ccc.snp1.coord <- unique(ccc.coord[,c(1:3)])
ccc.snp1.coord$chr <- as.numeric(ccc.snp1.coord$snp1.chr)
ccc.snp1.coord$start <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp1.coord$end <- as.numeric(ccc.snp1.coord$snp1.pos)
ccc.snp2.coord <- unique(ccc.coord[,c(4:6)])
ccc.snp2.coord$chr <- as.numeric(ccc.snp2.coord$snp2.chr)
ccc.snp2.coord$start <- as.numeric(ccc.snp2.coord$snp2.pos)
ccc.snp2.coord$end <- as.numeric(ccc.snp2.coord$snp2.pos)
ccc.snp1.gid <- genome_join_closest(ccc.snp1.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp1.distance", mode="full")
ccc.snp2.gid <- genome_join_closest(ccc.snp2.coord, gid.df, by=c("chr", "start", "end"), distance_column_name="snp2.distance", mode="full")
ccc.coord.snp1.gid <- left_join(ccc.coord, ccc.snp1.gid, by=c("snp1.chr", "snp1.pos", "snp1.allele"))
ccc.coord.gid <- left_join(ccc.coord.snp1.gid, ccc.snp2.gid, by=c("snp2.chr", "snp2.pos", "snp2.allele"))
ccc.coord.gid.df <- ccc.coord.gid[,c(1:7,22,24,37,39)]
colnames(ccc.coord.gid.df) <- c("snp1.chr", "snp1.pos", "snp1.allele", "snp2.chr", "snp2.pos", "snp2.allele", "ccc.cor", "snp1.gene", "snp1.gene.dist", "snp2.gene", "snp2.gene.dist")
write.table(ccc.coord.gid.df, "7.7MPoplar-CCC-Threshold0.7.distance.50kb.gene.txt", quote=F, row.names=F, sep="\t")
library(tidyr)
library(dplyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v3")
ccc.gid <- read.delim("7.7MPoplar-CCC-Threshold0.7.distance.50kb.gene.txt", header=T, sep="\t")
trait.gid <- read.delim("Poplar.GRIN_from_GWATS.All_genes.PerClimateVariable.txt", header=F, sep="\t")
colnames(trait.gid) <- c("trait", "gid")
trait.gid.count <- trait.gid %>% group_by(trait) %>% mutate(count = n()) %>% select(trait, count) %>% distinct()
ccc.gid$snp1.gene <- substr(ccc.gid$snp1.gene, 6, 21)
ccc.gid$snp2.gene <- substr(ccc.gid$snp2.gene, 6, 21)
ccc.gid.id <- ccc.gid %>% unite("snp1.id", c(snp1.chr, snp1.pos, snp1.allele), sep="_") %>% unite("snp2.id", c(snp2.chr, snp2.pos, snp2.allele), sep="_")
nrow(trait.gid)
nrow(ccc.gid.id)
trait.gid.snp1 <- trait.gid
trait.gid.snp1$snp1.gene <- trait.gid.snp1$gid
ccc.snp1.trait <- inner_join(trait.gid.snp1, ccc.gid.id, by="snp1.gene")
trait.gid.snp2 <- trait.gid
trait.gid.snp2$snp2.gene <- trait.gid.snp2$gid
ccc.snp2.trait <- inner_join(trait.gid.snp2, ccc.gid.id, by="snp2.gene")
ccc.snp1.snp2.trait <- inner_join(ccc.snp1.trait[,c(1,3:6,8)], ccc.snp2.trait[,c(1,3:7)], by=c("trait", "snp1.gene", "snp1.id", "snp2.gene", "snp2.id", "ccc.cor"))
ccc.snp1.snp2.trait.pair <- subset(ccc.snp1.snp2.trait, ccc.snp1.snp2.trait$snp1.gene != ccc.snp1.snp2.trait$snp2.gene)
ccc.snp1.snp2.trait.uniq <- unique(ccc.snp1.snp2.trait.pair[,c(1,2,6)])
/gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4
1419 genotypes
cd "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v4"
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4/tranche99/maf05/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.bed .
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4/tranche99/maf05/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.bim .
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4/tranche99/maf05/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.fam .
salloc -A SYB105 -p gpu -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#conda install -c conda-forge -c bioconda plink2
#conda install -c conda-forge -c bioconda plink
#mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
# to generate tped need to use plink/1.9
plink --bfile /gpfs/alpine/syb105/proj-shared/Data/P_tri/Variants/v4/tranche99/maf05/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015 --recode compound-genotypes --out /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped
# transform .ped and .map to generate input CCC file (tped)
## Columns:
# 1 chromosome (without leading chr)
# 2 Locus name
# 3 Genetic distance, left empty
# 4 Physical position
# 5 genotype
## need to transpose .ped file and input position from .map file
##transpose.py
# with open('Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.ped') as file:
# lis = [x.replace('\n', '').split(' ') for x in file]
#
# for x in zip(*lis):
# for y in x:
# print(y + ' ', end="")
# print('')
python transpose.py > Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped
# 4955674 Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.map
# 9911354 Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped
### why are there ~2x as many lines in the transposed .tped as in the .map? (9,911,354 = 2 x 4,955,674 + 6) because the .ped was not recoded with compound genotypes, so each SNP had two allele columns... go back and specify compound-genotypes in the PLINK run
# 4955680 Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped
sed -i '1,6d' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped
sed -e 's/\t/ /g' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.map > Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.space.map
# need to add space between compound-genotype
#awk '$1=$1' FS= OFS=" " Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped | sed 's/ / /g' > Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.space.tped
### something going wrong with this command as I am only getting 4941416 total lines instead of 4955674
#sed 's/\(.\)/\1 /g;s/ $//' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped > test2.txt
# 4955674 <- this works properly
sed 's/\(.\)/\1 /g;s/ $//' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped | sed 's/  //g' > Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.space.tped
paste -d ' ' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.space.map Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.space.tped > ccc_data.tped
#!/bin/bash
#BSUB -P SYB105
#BSUB -W 24:00
#BSUB -nnodes 4
#BSUB -J poplar.v4.ccc.pre
#BSUB -o poplar.v4.ccc.pre.%J.out
#BSUB -q batch-hm
module load gcc/5.4.0
#Script and Data path
spath=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/cometTools/ccc/preProcess/Scripts/preProcess/preProcess.sh
dpath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data
#Run command
jsrun --nrs 1 --tasks_per_rs 1 --cpu_per_rs 42 --gpu_per_rs 6 --rs_per_host 1 $spath $dpath/ccc_data.tped
echo "Done!"
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.pre.sh
### add #BSUB -q batch-hm to job submission script to allow for more time
--> not working due to time limit, can't generate binary file?? alter to run on Andes…
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J poplar.v4.ccc.pre.andes
#SBATCH -N 4
#SBATCH -t 10:00:00
#SBATCH -o poplar.v4.ccc.pre.andes-%j.o
#SBATCH -e poplar.v4.ccc.pre.andes-%j.e
module load gcc/5.4.0
#Script and Data path
spath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/preProcess.andes.sh
dpath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
#Run command
#jsrun --nrs 1 --tasks_per_rs 1 --cpu_per_rs 42 --gpu_per_rs 6 --rs_per_host 1 $spath $dpath/ccc_data.tped
srun -N1 -n1 -c4 $spath $dpath/ccc_data.tped
echo "Done!"
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/poplar.v4.ccc.pre.andes.sh
--> need to generate a preProcess.andes.sh file with the appropriate andes build ???
- TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools
- binarize=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/cometTools/ccc/preProcess/Tools/binarize/binarizeCCC
--> utilize other scripts?
# /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/new_examples/ccc/preprocess/passed_version/preprocess_single_gabrielsBin.submit
#!/bin/bash
#SBATCH -A syb105
#SBATCH -N 1
#SBATCH -t 60
#SBATCH -J pre
#SBATCH -o output-preprocess-'%A'.txt
# Location of CoMet postprocessing tools.
TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools
export PATH="${PATH}:$TOOLS_DIR"
# do this somewhere before here:
#pushd $TOOLS_DIR
#./make.sh
#popd
# Set the filename to be processed
#FILE_IN="/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Projects/CoMet/new_examples/ccc/data/ccc_data.tped"
FILE_IN="/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc_data.tped"
FPATH=$(dirname $FILE_IN)
STUB=$(basename $FILE_IN | sed -e 's/.tped$//')
echo "Preprocessing $FILE_IN"
#time $TOOLS_DIR/line_labels.sh $FILE_IN ${STUB}_line_labels.txt
binarize=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/cometTools/ccc/preProcess/Tools/binarize/binarizeCCC
(
time line_labels.sh $FILE_IN ${STUB}_line_labels.txt
time line_indices $FILE_IN ${STUB}_line_indices.bin
time allele_labels.sh $FILE_IN ${STUB}_allele_labels.txt
time $binarize $FILE_IN ${STUB}.bin
time preprocess_validate ccc ${STUB}.bin $FILE_IN ${STUB}_allele_labels.txt
) 2>&1 | tee output-preprocess-$STUB.txt
# To save time when using DUO, use this alternative to allele_labels creation
#time sed -e 's/.*/AT/' <${STUB}_line_labels.txt >${STUB}_allele_labels.txt
# CCC vector length = (number of whitespace-separated fields - 4 label columns) / 2
#( $(head -n1 [FILE] | wc -w) - 4 ) / 2
head -n1 ccc_data.tped | wc -w
### (2842 - 4) / 2 = 1419
# Number of Vectors: The number of vectors is equal to the number of rows.
wc -l ccc_data.tped
# 4955674
#BSUB -P SYB105
#BSUB -W 10:00
#BSUB -nnodes 500
#BSUB -J poplar.v4.ccc.run
#BSUB -o poplar.v4.ccc.run.%J.out
# bsub -P syb105 -nnodes 100 -W 10 -Is $SHELL
executable=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/build_release_summit/genomics_metric
ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'
launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 600 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"
$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 4955674 --num_field 1419 --all2all yes --compute_method GPU --num_proc_vector 600 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 5 --num_stage 1 --verbosity 1 --checksum no --phase_min 4 --phase_max 4 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data.tped.bin --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.run.sh
# Assertion error: "env->num_phase() <= 1 + env->num_block_vector() / 2 && "num_phase must be at most 1 + num_proc_vector/2."". At file /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/genomics_gpu/src/metrics.cc, line 460, host "g19n08".
# Assertion error: "can_run() && "Invalid problem for this system and build."". At file /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/genomics_gpu/src/env.cc, line 261, host "e27n13".
### set --tc 1
# ERROR: One or more process (first noticed rank 452) terminated with signal 9
### try adjusting the number of phases. You need to set not only --num_phase but also the min and max (--phase_min 0 and --phase_max set to num_phase - 1). You can also just try running one phase as a test (e.g. --num_phase N --phase_min 0 --phase_max 0).
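### quick python check of the limit quoted in the assertion above (assumption: num_block_vector equals num_proc_vector when num_proc_repl is 1):
num_proc_vector = 600
max_num_phase = 1 + num_proc_vector // 2   # = 301, so --num_phase 50 (used in the runs below) stays within the limit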
################# increase nodes and phases #################
# try with -nnodes 500 and -W 10:00
#BSUB -P SYB105
#BSUB -W 10:00
#BSUB -nnodes 500
#BSUB -J poplar.v4.ccc.run
#BSUB -o poplar.v4.ccc.run.%J.out
## Note: changed --nrs 2160
env OMP_NUM_THREADS=7 PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1 jsrun --nrs 2160 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1 /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/build_release_summit/genomics_metric --num_way 2 --metric_type ccc --sparse yes --num_vector 4955674 --num_field 1419 --all2all yes --compute_method GPU --num_proc_vector 2160 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 8 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 7 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data.tped.bin --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.run.8june.sh
################# replicate Gabriel's script #################
## /gpfs/alpine/syb105/proj-shared/Personal/gabrielgaz/Runs/comet/summit/ccc/poplar/01-Poplar7M/run
# major change is using num_proc_vector 600
# try with -nnodes 100 and -W 02:00 AND 50 phases
### runs, but gets an error after 10 phases have completed:
# User defined signal 2
# ERROR: One or more process (first noticed rank 520) terminated with signal 12
# try with --nnodes 360
#BSUB -P SYB105
#BSUB -W 02:00
#BSUB -nnodes 360
#BSUB -J poplar.v4.ccc.run
#BSUB -o poplar.v4.ccc.run.%J.out
executable=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/build_release_summit/genomics_metric
ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'
launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 600 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"
$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 4955674 --num_field 1419 --all2all yes --compute_method GPU --num_proc_vector 600 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 50 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 49 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data.tped.bin --threshold 0.7 --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.run.9june.sh
#!/bin/bash
#SBATCH -A syb105
#SBATCH -N 1
#SBATCH -t 10
#SBATCH -J poplar.v4.ccc.post
#SBATCH -o poplar.v4.ccc.post.'%A'.txt
# do this somewhere before here:
#pushd $TOOLS_DIR
#./make.sh
#popd
# Location of CoMet postprocessing tools.
TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools
export PATH="${PATH}:$TOOLS_DIR"
NUM_WAY=2
# Extract the file to be processed by this node from the file list.
BASE_PATH_PRE=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/preprocess
FILE_IN=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/poplar.v4.ccc.bin
STUB=ccc_data
mkdir -p out_slurm_$STUB
echo "Postprocessing $FILE_IN"
(
time postprocess $NUM_WAY \
$BASE_PATH_PRE/${STUB}_allele_labels.txt \
$BASE_PATH_PRE/${STUB}_line_labels.txt \
$FILE_IN \
$STUB.txt
) 2>&1 | tee out_slurm_$STUB/output-postprocess-$STUB.txt
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.post.sh
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc_data.txt
--> output min 0.474554 max 0.523073
--> how are there no SNPs with a higher correlation?
--> what does a permuted dataset look like? try to run CCC on the permuted dataset output from blocbuster
# 4955674 ccc_data.tped
# 805920 ccc_data.txt #### shouldn't the output have MORE lines than the tped file??
4955674 / 805920
## [1] 6.149089
# 4024284 ccc_data.txt #### still only capturing chr 1???
# cut -d ':' -f 1 ccc_data_line_labels.txt | awk '{if ($1 == "01") print $0}' | wc -l
# 64720
64720
## [1] 64720
*** NEED TO RUN THE POST-PROCESSING FOR EACH INDIVIDUAL .BIN FILE ***
- run on poplar.v4.ccc_0.bin, poplar.v4.ccc_1.bin, poplar.v4.ccc_2.bin, poplar.v4.ccc_3.bin, poplar.v4.ccc_4.bin, and poplar.v4.ccc_5.bin
- after each run, copy the ccc_data.txt file to the output directory and rename… then merge all together?
--> USE GABRIEL'S POST-PROCESSING SCRIPT as a reference… ultimately run Piet's script to do post-processing on each phase output and then merge
###PostProcessing
# python
out = open("commands_postprocess.txt", "w")
#for i in range(600):  ## file indices need to be zero-padded to 3 digits (000-599 rather than 0-599)
phase = ["%03d" % i for i in range(600)]
for i in phase:
    postprocess = "/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools/postprocess_all.sh 2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data_allele_labels.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data_labels.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc_" + str(i) + ".bin"
    out.write("%s\n" % postprocess)
out.close()
# Piets Script
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name cccInputs --time 48:00:00 --maxpernode 32 --nodes 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/commands_postprocess.txt
# /gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools/postprocess_all.sh 2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data_allele_labels.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/raw.data/ccc_data_labels.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run/poplar.v4.ccc_000.bin
#Put *.txt together, postprocessed bin files
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc.run
cat $(find ./ -name "poplar.v4.ccc_*.txt" | sort -V) > poplar.v4.ccc.threshold0.7.txt
#!/bin/sh
#SBATCH -A SYB105
#SBATCH -N 1
#SBATCH -p batch
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
#SBATCH -o blocbuster.ccc-%j.o
#SBATCH -e blocbuster.ccc-%j.e
genFile=ccc_data_compound.tped
numInd=1419
numSnps=4955674
thresh=0.7
numHeadRows=0
numHeadCols=4
# Assumed rows represent SNPs
Gml=ccc.gml
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/BlocBuster/ccc/ccc ${genFile} ${Gml} ${thresh} ${numInd} ${numSnps} ${numHeadRows} ${numHeadCols}
echo Finished
echo
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/sharlee.ccc.sh
https://zzz.bwh.harvard.edu/plink/perm.shtml
# For some tests, however, these procedures are not available (e.g. SNP x SNP epistasis tests).
http://www.cs.umsl.edu/~climer/blocBuster/README_perm
# blocBuster: perm
# perm input.txt output.txt numDataCols numDataRows numHeadCols numHeadRows
# where
# - 'input.txt' is your genotype data
# - 'output.txt' will hold the permuted data
# - 'numDataCols' is the number of individuals
# - 'numDataRows' is the number of SNPs
# - 'numHeadCols' is the number of header columns
# - 'numHeadRows' is the number of header rows
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
paste -d ' ' Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped.space.map Ptri_V4_Nisq1.1419.VQSRrecal9_tranche99_allchr.biallelicSNP.geno015.hwe1e50.maf05.mind015.tped > ccc_data_compound.tped
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/ccc_data_compound.tped "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v4/."
cd "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/CCC/v4/"
../../../../Resources/BlocBuster/perm/perm ccc_data_compound.tped ccc_data_compound.perm.txt 1419 4955674 4 0
# run blocBuster: http://www.cs.umsl.edu/~climer/blocBuster/README
## http://www.cs.umsl.edu/~climer/blocBuster/README_ccc
tr ' ' \\t < ccc_data_compound.perm.txt > ccc_data_compound.perm.tab.txt
tr ' ' \\t < ccc_data_compound.tped > ccc_data_compound.tab.txt
sed 's/00/NN/g' ccc_data_compound.perm.tab.txt > ccc_data_compound.perm.blocBuster.input.txt
sed 's/00/NN/g' ccc_data_compound.tab.txt > ccc_data_compound.blocBuster.input.txt
##***Important: Assumed rows represent individuals and columns represent SNPs in input file.***
## Need to adjust [ROWS_R_SNPS in 'bloc.h' to 1]
## or transpose file --> in R
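## a hedged python sketch of that transpose (same zip(*rows) approach as transpose.py above; holds the whole
## matrix in memory, the 4 header columns become header rows, and the output file name is illustrative):
with open("ccc_data_compound.blocBuster.input.txt") as f:
    rows = [line.split() for line in f]
with open("ccc_data_compound.blocBuster.input.transposed.txt", "w") as out:
    for col in zip(*rows):
        out.write(" ".join(col) + "\n")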
../../../../Resources/BlocBuster/ccc/ccc ccc_data_compound.perm.blocBuster.input.txt ccc_data_compound.perm.gml 0.7 1419 4955674 0 4
# 1419 individuals and 4955674 SNPs in entire input file.
# Threshold of 0.7 used.
#
# Reading in data...
# 0 snps have only one allele in dataset.
# 27989824 and 27989824 missing values in first and second SNP sets, respectively.
#
# Computing CCC values...
../../../../Resources/BlocBuster/ccc/ccc ccc_data_compound.blocBuster.input.txt ccc_data_compound.gml 0.7 1419 4955674 0 4
# 1419 individuals and 4955674 SNPs in entire input file.
# Threshold of 0.7 used.
#
# Reading in data...
# 0 snps have only one allele in dataset.
# 27989824 and 27989824 missing values in first and second SNP sets, respectively.
#
# Computing CCC values...
#### takes too long to run on a local machine...
# convert compound permuted data to proper tped format (add space between compound genotypes in temp.gen)
cut -d ' ' -f 1-4 temp.gen > temp.gen.labels
cut -d ' ' -f5- temp.gen | sed 's/./& /g' | sed 's/  //g' > temp.gen.alleles
paste -d ' ' temp.gen.labels temp.gen.alleles > temp.gen.tped
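# A hedged python sketch of the same conversion (splits each compound genotype into two space-separated alleles); it mirrors the cut/sed/paste pipeline above and uses the same file names:
with open("temp.gen") as fin, open("temp.gen.tped", "w") as fout:
    for line in fin:
        fields = line.split()
        labels, genos = fields[:4], fields[4:]   # 4 label columns, then compound genotypes
        alleles = [" ".join(g) for g in genos]   # "AG" -> "A G"
        fout.write(" ".join(labels + alleles) + "\n")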
#!/bin/bash
#BSUB -P SYB105
#BSUB -W 24:00
#BSUB -nnodes 4
#BSUB -J perm.ccc.pre
#BSUB -o perm.ccc.pre.%J.out
#BSUB -q batch-hm
module load gcc/5.4.0
#Script and Data path
spath=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/cometTools/ccc/preProcess/Scripts/preProcess/preProcess.sh
#spath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/preProcess_binarize.sh
dpath=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/perm
#Run command
jsrun --nrs 1 --tasks_per_rs 1 --cpu_per_rs 42 --gpu_per_rs 6 --rs_per_host 1 $spath $dpath/temp.gen.tped
echo "Done!"
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.pre.perm.sh
#BSUB -P SYB105
#BSUB -W 10:00
#BSUB -nnodes 100
#BSUB -J poplar.v4.ccc.run
#BSUB -o poplar.v4.ccc.run.%J.out
# bsub -P syb105 -nnodes 100 -W 10 -Is $SHELL
executable=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/summit/comet/build_release_summit/genomics_metric
ar_opts='PAMI_IBV_ENABLE_DCT=1 PAMI_ENABLE_STRIPING=1 PAMI_IBV_ADAPTER_AFFINITY=0 PAMI_IBV_QP_SERVICE_LEVEL=8 PAMI_IBV_ENABLE_OOO_AR=1'
launch_command="env OMP_NUM_THREADS=7 $ar_opts jsrun --nrs 600 --bind packed:7 --cpu_per_rs 7 --gpu_per_rs 1 --rs_per_host 6 --tasks_per_rs 1 -X 1"
$launch_command $executable --num_way 2 --metric_type ccc --sparse yes --num_vector 4955674 --num_field 1419 --all2all yes --compute_method GPU --num_proc_vector 600 --num_proc_field 1 --num_proc_repl 1 --tc 1 --num_tc_steps 1 --num_phase 1 --num_stage 1 --verbosity 1 --checksum no --phase_min 0 --phase_max 0 --stage_min 0 --stage_max 0 --input_file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/perm/temp.gen.tped.bin --output_file_stub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/perm/poplar.v4.ccc.perm
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/scripts/poplar.v4.ccc.run.perm.sh
#!/bin/bash
#SBATCH -A syb105
#SBATCH -N 1
#SBATCH -t 10
#SBATCH -J perm.ccc.post
#SBATCH -o perm.ccc.post.'%A'.txt
# do this somewhere before here:
#pushd $TOOLS_DIR
#./make.sh
#popd
# Location of CoMet postprocessing tools.
TOOLS_DIR=/gpfs/alpine/syb105/proj-shared/Personal/mcashman/Tools/andes/genomics_gpu/tools
export PATH="${PATH}:$TOOLS_DIR"
NUM_WAY=2
# Extract the file to be processed by this node from the file list.
BASE_PATH_PRE=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4
FILE_IN=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/perm/poplar.v4.ccc.perm_0.bin
STUB=ccc_data
mkdir -p out_slurm_$STUB
echo "Postprocessing $FILE_IN"
(
time postprocess $NUM_WAY \
$BASE_PATH_PRE/${STUB}_allele_labels.txt \
$BASE_PATH_PRE/${STUB}_line_labels.txt \
$FILE_IN \
$STUB.txt
) 2>&1 | tee out_slurm_$STUB/output-postprocess-$STUB.txt
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/CCC/poplar.v4/poplar.v4.ccc.post.perm.sh