# git clone https://github.com/TerminatorJ/CRISPR-TRAP-seq.git
# git clone https://github.com/TerminatorJ/GNL_Scorer.git
# cd /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/SEED/ExploratoryDataForModelGeneration
# cut -f 3 SupTable1.txt | sed '1d' | awk '{ print ">"NR"\n"$0 }' > Seq.fasta
# Activate the shared conda environment, then run the three GNL_Scorer test
# scripts: two take a pair of 30-nt sequences, the third takes a FASTA file.
andes_home=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm
gnl_scorer_dir="${andes_home}/Libraries/andes/GNL_Scorer"
example_30mers=(GAGGAAAGCAGCCAGGACAGCAGTGGGCAG TCAGAAATAATACCAACAACTGGAGGGAGA)
module load python
source "${andes_home}/scripts/loadcondaandes.sh"
conda activate "${andes_home}/Libraries/andes/envs/test"
# export PATH=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/GNL_Scorer/cal_deltaG/oligoarrayaux-3.8/bin:$PATH
# pip install: python3 package: os sys pandas=0.23.4 numpy=1.15.3 time sklearn=0.19.2 Bio=1.72 pickle itertools
cd "${gnl_scorer_dir}/"
"${gnl_scorer_dir}/test.py" "${example_30mers[@]}"
"${gnl_scorer_dir}/test.single.py" "${example_30mers[@]}"
"${gnl_scorer_dir}/test.fasta.py" "${andes_home}/projects/seed/Seq.fasta"
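# Error from the run above: "sh: hybrid-ss-min: command not found"
### hybrid-ss-min is provided by the UNAFold / OligoArrayAux suite, which is required here but could not be downloaded (still unresolved).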
# Score sequences with CRISPR-TRAP-seq. The lines following each bare `python`
# are Python statements typed at the interpreter prompt, not shell commands.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/CRISPR-TRAP-seq
#cp /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/Seq.fasta .
python
# (Python prompt) score the records in Seq.fasta with the Cas9 setting.
import prediction
prediction.get_score("Seq.fasta","Cas9",full_length=True,site=None)
##### Updated with CRISPR-TRAP-seq
# Second run, on test.fasta, after the featurization.py import change noted below.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/CRISPR-TRAP-seq
# change sklearn.grid_search to sklearn.model_selection in import for featurization.py
python
# (Python prompt) same call as above, different input file.
import prediction
prediction.get_score("test.fasta","Cas9",full_length=True,site=None)
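# Feature tracks listed for consideration (presumably candidate model inputs — confirm):
#   - gene annotation
#   - TE annotation
#   - GC content
#   - Nucleosome occupancy
#   - ATAC-seq
#   - WGBS
#   - ChIP-seq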
# Yeast: index the genome, cut it into 1 kb windows, convert the ATAC-seq
# bigWig to bedGraph and intersect it with the windows.
lab_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm
yeast_fasta=Saccharomyces_cerevisiae.R64-1-1.dna_sm.toplevel.fa
atac_track=GSM3756697_ATAC-seq_Nrd1-AA_-Rap_0-120bp
module load python
source "${lab_root}/scripts/loadcondaandes.sh"
conda activate "${lab_root}/Libraries/andes/envs/test"
cd "${lab_root}/projects/seed/yeast"
# make 1kb windows of genome
# wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/086/655/GCA_003086655.1_ASM308665v1/GCA_003086655.1_ASM308665v1_genomic.fna.gz
# wget ftp://ftp.ensemblgenomes.org/pub/fungi/release-50/fasta/saccharomyces_cerevisiae/dna/*
"${lab_root}/Libraries/andes/samtools/samtools" faidx "${yeast_fasta}"
# Columns 1-2 of the .fai index are sequence name and length.
cut -f1,2 "${yeast_fasta}.fai" \
  | sort -k 1,1 -k 2,2n > yeast.sizes.genome
bedtools makewindows -g yeast.sizes.genome -w 1000 > yeast.1kb.windows.bed
# bigwig to bedgraph
# conda install -c bioconda ucsc-bigwigtobedgraph
# scp /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/SEED/GSE130946/* noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/yeast/.
bigWigToBedGraph "${atac_track}.bigwig" "${atac_track}.bedGraph"
# bedtools genomecov [OPTIONS] [-i|-ibam] -g (iff. -i)
#bedtools genomecov -i GSM3756697_ATAC-seq_Nrd1-AA_-Rap_0-120bp.bedGraph -g GCA_003086655.1_ASM308665v1_genomic.fna > atac.genomecov.bed
# bedtools intersect
# Every "chr" substring is removed (presumably so names match the window file — confirm).
sed 's/chr//g' "${atac_track}.bedGraph" > "${atac_track}.bed"
bedtools intersect -wo -a yeast.1kb.windows.bed -b "${atac_track}.bed" > yeast.atac.windows.bed
module load python
source "${lab_root}/scripts/loadcondaandes.sh"
conda activate "${lab_root}/Libraries/andes/envs/test"
# R
# Count the ATAC-seq bedGraph intervals overlapping each 1 kb window, fill
# windows without any overlap with 0, and plot a DWT of the per-window counts.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/yeast")
atac <- read.delim("yeast.atac.windows.bed", header = FALSE, sep = "\t",
                   stringsAsFactors = FALSE)
window <- read.delim("yeast.1kb.windows.bed", header = FALSE, sep = "\t",
                     stringsAsFactors = FALSE)
# `count` is appended as column 9 (8 input columns + 1).
atac.bin <- mutate(group_by(atac, V1, V2, V3), count = n())
atac.count <- unique(atac.bin[, c(1:3, 9)])
atac.window <- left_join(window, atac.count, by = c("V1", "V2", "V3"))
atac.window[is.na(atac.window)] <- 0
library(wavelets)
atac.mat <- as.matrix(atac.window$count)
atac.dwt <- dwt(atac.mat, filter = "la8", boundary = "periodic", fast = TRUE)
pdf("wavelet.yeast.atac.pdf")
plot.dwt(atac.dwt)
dev.off()
# Zymoseptoria tritici: fetch the Ensembl variation VCFs, index the genome and
# cut it into 1 kb windows.
lab_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm
tritici_fasta=Zymoseptoria_tritici.MG2.dna.toplevel.fa
module load python
source "${lab_root}/scripts/loadcondaandes.sh"
conda activate "${lab_root}/Libraries/andes/envs/test"
cd "${lab_root}/projects/seed/septoria"
wget ftp://ftp.ensemblgenomes.org/pub/fungi/release-50/variation/vcf/zymoseptoria_tritici/*
# make 1kb windows of genome
# bedtools genomecov [OPTIONS] [-i|-ibam] -g (iff. -i)
"${lab_root}/Libraries/andes/samtools/samtools" faidx "${tritici_fasta}"
# Unlike the yeast run, the sizes file is not sorted here.
cut -f1,2 "${tritici_fasta}.fai" > Zymoseptoria_tritici.sizes.genome
# conda install -c bioconda bedtools
bedtools makewindows -g Zymoseptoria_tritici.sizes.genome -w 1000 > Zymoseptoria_tritici.1kb.windows.bed
#######################################
# Print a VCF with deletion / indel / insertion / sequence_alteration records
# removed. Header lines (starting with '#') are always kept: the previous
# `grep -v` chain was applied to every line, so it also discarded any header
# line that merely mentions one of these words (e.g. in an INFO description).
# Record filtering is unchanged: a record is dropped if the word appears
# anywhere on its line.
# Arguments: $1 - path to the input VCF
# Outputs:   filtered VCF on stdout
#######################################
drop_non_snv_records() {
  awk '/^#/ || !/deletion|indel|insertion|sequence_alteration/' "$1"
}

# All variants per 1 kb window.
bedtools intersect -wo -a Zymoseptoria_tritici.1kb.windows.bed -b zymoseptoria_tritici.vcf > vcf.windows.bed
# SNV-only VCF, then SNVs per 1 kb window.
drop_non_snv_records zymoseptoria_tritici.vcf > zymoseptoria_tritici.snv.vcf
bedtools intersect -wo -a Zymoseptoria_tritici.1kb.windows.bed -b zymoseptoria_tritici.snv.vcf > vcf.snv.windows.bed
# Gene features per 1 kb window.
bedtools intersect -wo -a Zymoseptoria_tritici.1kb.windows.bed -b Zymoseptoria_tritici.MG2.50.gene.gff > gene.windows.bed
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# Zymoseptoria tritici: per-window counts of variants, SNVs and genes, followed
# by DWT plots of the SNV and gene count series.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/septoria")

# Read a headerless, tab-separated bedtools output file.
load.bed <- function(path) {
  read.delim(path, header = FALSE, sep = "\t", stringsAsFactors = FALSE)
}
# Append `count` = number of rows sharing the same window (V1, V2, V3).
tally.window <- function(hits) {
  mutate(group_by(hits, V1, V2, V3), count = n())
}
window.key <- c("V1", "V2", "V3")

vcf <- load.bed("vcf.windows.bed")
snv <- load.bed("vcf.snv.windows.bed")
gene <- load.bed("gene.windows.bed")
window <- load.bed("Zymoseptoria_tritici.1kb.windows.bed")

# All variants; column 13 is the appended count.
vcf.bin <- tally.window(vcf)
vcf.count <- unique(vcf.bin[, c(1:3, 13)])
# Same tally, additionally keeping column 11.
vcf.type.bin <- tally.window(vcf)
vcf.type.count <- unique(vcf.type.bin[, c(1:3, 11, 13)])

snv.bin <- tally.window(snv)
snv.count <- unique(snv.bin[, c(1:3, 13)])
snv.window <- left_join(window, snv.count, by = window.key)
snv.window[is.na(snv.window)] <- 0

# Gene intersect rows have one more column, so the count is column 14.
gene.bin <- tally.window(gene)
gene.count <- unique(gene.bin[, c(1:3, 14)])
gene.window <- left_join(window, gene.count, by = window.key)
gene.window[is.na(gene.window)] <- 0

library(wavelets)
snv.mat <- as.matrix(snv.window$count)
snv.dwt <- dwt(snv.mat, filter = "la8", boundary = "periodic", fast = TRUE)
pdf("wavelet.snv.pdf")
plot.dwt(snv.dwt)
dev.off()

gene.mat <- as.matrix(gene.window$count)
gene.dwt <- dwt(gene.mat, filter = "la8", boundary = "periodic", fast = TRUE)
pdf("wavelet.gene.pdf")
plot.dwt(gene.dwt)
dev.off()

# Both transforms in one figure.
pdf("wavelet.gene.snv.pdf")
list.dwt <- c(gene.dwt, snv.dwt)
plot.dwt.multiple(list.dwt)
dev.off()
# library(WaveletComp)
# colnames(snv.mat) <- "count"
# analyze.wavelet(snv.mat)
# Septoria musiva: index the genome, build 1 kb windows, intersect them with
# the gene GFF and compute per-window nucleotide content.
lab_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm
musiva_prefix=GCF_000320565.1_Septoria_musiva_SO2202_v1.0
module load python
source "${lab_root}/scripts/loadcondaandes.sh"
conda activate "${lab_root}/Libraries/andes/envs/test"
cd "${lab_root}/projects/seed/septoria"
"${lab_root}/Libraries/andes/samtools/samtools" faidx "${musiva_prefix}_genomic.fna"
cut -f1,2 "${musiva_prefix}_genomic.fna.fai" > Septoria_musiva.sizes.genome
# conda install -c bioconda bedtools
bedtools makewindows -g Septoria_musiva.sizes.genome -w 1000 > Septoria_musiva.1kb.windows.bed
bedtools intersect -wo -a Septoria_musiva.1kb.windows.bed -b "${musiva_prefix}_gene.gff" > Septoria_musiva.gene.windows.bed
# sed '1d' drops the first line of the bedtools nuc output.
bedtools nuc -fi "${musiva_prefix}_genomic.fna" -bed Septoria_musiva.1kb.windows.bed \
  | sed '1d' > Septoria_musiva.GC.windows.bed
module load python
source "${lab_root}/scripts/loadcondaandes.sh"
conda activate "${lab_root}/Libraries/andes/envs/test"
# R
# Septoria musiva: DWT plots of per-window gene counts and of column 5 of the
# bedtools nuc table.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/septoria")

# Read a headerless, tab-separated bedtools output file.
load.bed <- function(path) {
  read.delim(path, header = FALSE, sep = "\t", stringsAsFactors = FALSE)
}
gene <- load.bed("Septoria_musiva.gene.windows.bed")
gc <- load.bed("Septoria_musiva.GC.windows.bed")
window <- load.bed("Septoria_musiva.1kb.windows.bed")

# Rows per window; the appended count is column 14.
gene.bin <- mutate(group_by(gene, V1, V2, V3), count = n())
gene.count <- unique(gene.bin[, c(1:3, 14)])
gene.window <- left_join(window, gene.count, by = c("V1", "V2", "V3"))
gene.window[is.na(gene.window)] <- 0

library(wavelets)
gene.mat <- as.matrix(gene.window$count)
gene.dwt <- dwt(gene.mat, filter = "la8", boundary = "periodic", fast = TRUE)
pdf("wavelet.musiva.gene.pdf")
plot.dwt(gene.dwt)
dev.off()

# V5 is presumably the GC fraction column of `bedtools nuc` — confirm.
gc.mat <- as.matrix(gc$V5)
gc.dwt <- dwt(gc.mat, filter = "la8", boundary = "periodic", fast = TRUE)
pdf("wavelet.musiva.gc.pdf")
plot.dwt(gc.dwt)
dev.off()

# NOTE(review): the file name mentions "snv" but only gene and GC are plotted.
pdf("wavelet.gene.gc.snv.pdf")
list.dwt <- c(gene.dwt, gc.dwt)
plot.dwt.multiple(list.dwt)
dev.off()
# wavethresh
# Activate the shared conda environment and install the r-wavethresh package
# into it, after adding conda-forge to the configured channels.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Persistently adds conda-forge to the channel list in the conda configuration.
conda config --add channels conda-forge
# No -y flag: conda will ask for confirmation interactively.
conda install r-wavethresh
##Function to perform a 1D wavelet transform on an input FAIMS data matrix
##
## Row-wise 1D discrete wavelet transform.
##
## Each row of `dataMatrix` is zero-padded on the right to the next power of
## two and passed to wavethresh::wd(filter.number=10); the detail coefficients
## ($D) become the corresponding row of the result. Columns whose standard
## deviation across rows is not > 0 are then removed.
##
## Args:
##   dataMatrix: numeric matrix, one item per row, at least 2 columns.
## Returns:
##   Always a matrix with nrow(dataMatrix) rows (row names preserved) and up
##   to 2^ceiling(log2(ncol)) - 1 columns. With a single row every column has
##   sd = NA, so a 1 x 0 matrix is returned.
WaveletTransform <- function(dataMatrix){
  library(wavethresh)
  ##----------------------------------------------------------------------
  ## FIND USEFUL VALUES --------------------------------------------------
  ##----------------------------------------------------------------------
  nDataItems = nrow(dataMatrix)
  nFeatures  = ncol(dataMatrix)
  if (nFeatures < 2) {
    ## wd() needs a power-of-two length of at least 2.
    stop("WaveletTransform needs at least 2 feature columns")
  }
  nWavelets  = 2^ceiling(log2(nFeatures)) - 1
  data.wd    = matrix(0, nDataItems, nWavelets)
  row.names(data.wd) = row.names(dataMatrix)
  if (nDataItems == 0) {
    ## Nothing to transform; `1:0` would otherwise iterate over c(1, 0).
    return(data.wd)
  }
  ##----------------------------------------------------------------------
  ## GENERATE THE WAVELET TRANSFORMED DATA -------------------------------
  ##----------------------------------------------------------------------
  for (i in seq_len(nDataItems)){
    working = numeric(nWavelets + 1)       ## zero-padded to a power of two
    working[1:nFeatures] = dataMatrix[i,]
    data.wd[i, ] = wd(working, filter.number=10)$D
  }
  ##----------------------------------------------------------------------
  ## REMOVE ANY ZERO-VARIANCE FEATURES -----------------------------------
  ##----------------------------------------------------------------------
  sigmaValues = apply(data.wd, 2, sd)
  keep        = which(sigmaValues>0)
  ## drop=FALSE keeps the matrix shape when only one column (or row) remains.
  data.wd     = data.wd[, keep, drop=FALSE]
  return(data.wd)
}
##*****************************************************************************
##*****************************************************************************
##----------------------------------------------------------------------
## ----------------------------------------
##----------------------------------------------------------------------
##Function to perform a 2D wavelet transform on an input FAIMS data matrix
## input = all data
## Row-wise 2D wavelet transform.
## Each row of dataMatrix is reshaped to 512 x 102, zero-padded with extra
## columns to a square matrix, transformed with wavethresh::imwd(), and the
## resulting coefficient blocks are concatenated into one row of the output.
##
## Args:
##   m, targetValues: only referenced by the commented-out plotting code in
##                    this version.
##   dataMatrix:      one item per row; each row needs 512*102 values for the
##                    `dim<-` assignment below to succeed.
##   dimensions:      only read when cropped=TRUE (saved as DIMS); it is then
##                    unconditionally reset to c(512, 102).
##   cropped:         if TRUE (and DIMS[1] <= 512) keep only the DIMS[1] rows
##                    centred on row 256 and pad to DIMS[1] x DIMS[1].
## Returns:
##   Matrix with nrow(dataMatrix) rows (row names copied from dataMatrix) and
##   one column per concatenated coefficient.
WaveletTransform2D <- function(m,dataMatrix,targetValues,dimensions=c(512, 102),cropped=FALSE){
library(wavethresh)
library(RColorBrewer)
##----------------------------------------------------------------------
## FIND USEFUL VALUES --------------------------------------------------
##----------------------------------------------------------------------
dataMatrix = as.matrix(dataMatrix)
nDataItems = nrow(dataMatrix)
## nFeatures is computed but not used further in this function.
nFeatures = ncol(dataMatrix)
if (cropped==TRUE){
## Remember the requested crop size before `dimensions` is overwritten below.
DIMS = dimensions
## A crop taller than the 512-row image is not possible: fall back to no crop.
if(DIMS[1]>512) cropped = FALSE
}
dimensions=c(512, 102) ## override input for cropping to work properly (one less consideration)
##----------------------------------------------------------------------
## GENERATE THE WAVELET TRANSFORMED DATA -------------------------------
##----------------------------------------------------------------------
for (i in 1:nDataItems){
currentData = dataMatrix[i,]
## Reshape the flat row into a 512 x 102 matrix (column-major fill).
dim(currentData) = dimensions
if (!cropped){
## Pad with 512 - 102 zero columns to obtain a 512 x 512 square.
add_zeros = matrix(0,dimensions[1],(dimensions[1]-dimensions[2]))
} else if (cropped){
## Keep the DIMS[1] rows centred on row 256, then pad columns to DIMS[1].
currentData = currentData[(256-DIMS[1]/2+1):(256+DIMS[1]/2),]
add_zeros = matrix(0,DIMS[1],DIMS[1]-ncol(currentData))
# currentData = currentData[193:320,]
# add_zeros = matrix(0,128,26)
}
working = cbind(currentData,add_zeros)
# dim(working)
### PLOT DATA
# if (m==1) {
# if (i==1) {
# colourTable = heat.colors(100)
#
# png(file="Cropped_Data_disease.png",pointsize =16)
# image(working,main=names(targetValues)[i],col=colourTable)
# dev.off()
# }
# if (i==57) {
# colourTable = heat.colors(100)
# png(file="Cropped_Data_control.png",pointsize =16)
# image(working,main=names(targetValues)[i],col=colourTable)
# dev.off()
# }
# }
#### WAVELET TRANSFORM
current.wd = imwd(working, filter.number=10, family="DaubLeAsymm", type="wavelet")
nLevels = current.wd$nlevels-1
## Start with the w0Lconstant component, then append four named components
## per level (names produced by lt.to.name(j, k) for k = 1..4).
working.data = current.wd$w0Lconstant
for (j in 0:nLevels) {
for (k in 1:4){
working.data = c(working.data,current.wd[[ lt.to.name(j, k) ]])
}
}
## The output width is only known after the first transform, so the result
## matrix is allocated on the first iteration.
if (i==1){
data.wd = matrix(0, nDataItems, length(working.data))
row.names(data.wd) = row.names(dataMatrix)
}
data.wd[i, ] = working.data
}
# length(current.wd2$w4L4) ## 8 levels, level 7 has length 16384 (4 components)
## level 6 has length 4096
## level 5 has length 1024 ....
# Number of columns for wavelet transform according to matrix size:
# 512x512--> 349525
# 256x256--> 87381
# 128x128--> 21845 ## additional benefit of dimensionality reduction
return(data.wd)
}
##*****************************************************************************
##*****************************************************************************
##----------------------------------------------------------------------
## ----------------------------------------
##----------------------------------------------------------------------
# Yeast: convert all seven bigWig tracks to bedGraph, strip "chr" from each and
# intersect every track with the 1 kb windows.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/yeast
# Input bigWig files and, at the same index, the label used in the output
# file name yeast.<label>.windows.bed.
# NOTE(review): the H4ac track is written under the "h3k4" label, exactly as
# in the original commands — confirm that this naming is intended.
bigwig_files=(
  GSM3756697_ATAC-seq_Nrd1-AA_-Rap_0-120bp.bigwig
  GSM3756673_RNA-seq_Nrd1-AA_-Rap_Crick.bw
  GSM3756681_TBP-ChEC_Nrd1-AA_-Rap_0-120bp.bigwig
  GSM3756701_MNase-seq_Nrd1-AA_-Rap_120-200bp.bigwig
  GSM3756705_H3K36me3_Nrd1-AA_-Rap_120-200bp.bigWig
  GSM3756709_H3K18ac_Nrd1-AA_-Rap_120-200bp.bigWig
  GSM3756713_H4ac_Nrd1-AA_-Rap_120-200bp.bigWig
)
window_labels=(atac rna tbp mnase h3k36 h3k18 h3k4)
# bedgraph
for bigwig in "${bigwig_files[@]}"; do
  # ${bigwig%.*} removes the extension (.bigwig / .bw / .bigWig).
  bigWigToBedGraph "${bigwig}" "${bigwig%.*}.bedGraph"
done
# bedtools intersect
for idx in "${!bigwig_files[@]}"; do
  stem=${bigwig_files[idx]%.*}
  sed 's/chr//g' "${stem}.bedGraph" > "${stem}.bed"
  bedtools intersect -wo -a yeast.1kb.windows.bed -b "${stem}.bed" > "yeast.${window_labels[idx]}.windows.bed"
done
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# Build one table with, for every 1 kb yeast window, the number of bedGraph
# intervals overlapping it in each of the seven tracks (0 where none overlap).
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/yeast")

# Read a headerless, tab-separated bedtools output file.
load.bed <- function(path) {
  read.delim(path, header = FALSE, sep = "\t", stringsAsFactors = FALSE)
}
# Append `count` = number of rows sharing the same window (V1, V2, V3).
tally.window <- function(hits) {
  mutate(group_by(hits, V1, V2, V3), count = n())
}
window.key <- c("V1", "V2", "V3")

atac <- load.bed("yeast.atac.windows.bed")
rna <- load.bed("yeast.rna.windows.bed")
tbp <- load.bed("yeast.tbp.windows.bed")
mnase <- load.bed("yeast.mnase.windows.bed")
h3k36 <- load.bed("yeast.h3k36.windows.bed")
h3k18 <- load.bed("yeast.h3k18.windows.bed")
h3k4 <- load.bed("yeast.h3k4.windows.bed")
window <- load.bed("yeast.1kb.windows.bed")

# For every track: tally, keep window coordinates + count (column 9), and
# join onto the table accumulated so far.
atac.bin <- tally.window(atac)
atac.count <- unique(atac.bin[, c(1:3, 9)])
atac.window <- left_join(window, atac.count, by = window.key)

rna.bin <- tally.window(rna)
rna.count <- unique(rna.bin[, c(1:3, 9)])
atac.rna.window <- left_join(atac.window, rna.count, by = window.key)

tbp.bin <- tally.window(tbp)
tbp.count <- unique(tbp.bin[, c(1:3, 9)])
atac.rna.tbp.window <- left_join(atac.rna.window, tbp.count, by = window.key)

mnase.bin <- tally.window(mnase)
mnase.count <- unique(mnase.bin[, c(1:3, 9)])
atac.rna.tbp.mnase.window <- left_join(atac.rna.tbp.window, mnase.count, by = window.key)

h3k36.bin <- tally.window(h3k36)
h3k36.count <- unique(h3k36.bin[, c(1:3, 9)])
atac.rna.tbp.mnase.h3k36.window <- left_join(atac.rna.tbp.mnase.window, h3k36.count, by = window.key)

h3k18.bin <- tally.window(h3k18)
h3k18.count <- unique(h3k18.bin[, c(1:3, 9)])
atac.rna.tbp.mnase.h3k36.h3k18.window <- left_join(atac.rna.tbp.mnase.h3k36.window, h3k18.count, by = window.key)

h3k4.bin <- tally.window(h3k4)
h3k4.count <- unique(h3k4.bin[, c(1:3, 9)])
atac.rna.tbp.mnase.h3k36.h3k18.h3k4.window <- left_join(atac.rna.tbp.mnase.h3k36.h3k18.window, h3k4.count, by = window.key)

# Windows without any overlap get 0; columns are then renamed by position.
atac.rna.tbp.mnase.h3k36.h3k18.h3k4.window[is.na(atac.rna.tbp.mnase.h3k36.h3k18.h3k4.window)] <- 0
colnames(atac.rna.tbp.mnase.h3k36.h3k18.h3k4.window) <- c("chr", "start", "end", "atac", "rna", "tbp", "mnase", "h3k36", "h3k18", "h3k4")
## Row-wise 1D discrete wavelet transform.
##
## Each row of `dataMatrix` is zero-padded on the right to the next power of
## two and passed to wavethresh::wd(filter.number=10); the detail coefficients
## ($D) become the corresponding row of the result. Columns whose standard
## deviation across rows is not > 0 are then removed.
##
## Args:
##   dataMatrix: numeric matrix, one item per row, at least 2 columns.
## Returns:
##   Always a matrix with nrow(dataMatrix) rows (row names preserved) and up
##   to 2^ceiling(log2(ncol)) - 1 columns. With a single row every column has
##   sd = NA, so a 1 x 0 matrix is returned.
WaveletTransform <- function(dataMatrix){
  library(wavethresh)
  ##----------------------------------------------------------------------
  ## FIND USEFUL VALUES --------------------------------------------------
  ##----------------------------------------------------------------------
  nDataItems = nrow(dataMatrix)
  nFeatures  = ncol(dataMatrix)
  if (nFeatures < 2) {
    ## wd() needs a power-of-two length of at least 2.
    stop("WaveletTransform needs at least 2 feature columns")
  }
  nWavelets  = 2^ceiling(log2(nFeatures)) - 1
  data.wd    = matrix(0, nDataItems, nWavelets)
  row.names(data.wd) = row.names(dataMatrix)
  if (nDataItems == 0) {
    ## Nothing to transform; `1:0` would otherwise iterate over c(1, 0).
    return(data.wd)
  }
  ##----------------------------------------------------------------------
  ## GENERATE THE WAVELET TRANSFORMED DATA -------------------------------
  ##----------------------------------------------------------------------
  for (i in seq_len(nDataItems)){
    working = numeric(nWavelets + 1)       ## zero-padded to a power of two
    working[1:nFeatures] = dataMatrix[i,]
    data.wd[i, ] = wd(working, filter.number=10)$D
  }
  ##----------------------------------------------------------------------
  ## REMOVE ANY ZERO-VARIANCE FEATURES -----------------------------------
  ##----------------------------------------------------------------------
  sigmaValues = apply(data.wd, 2, sd)
  keep        = which(sigmaValues>0)
  ## drop=FALSE keeps the matrix shape when only one column (or row) remains.
  data.wd     = data.wd[, keep, drop=FALSE]
  return(data.wd)
}
# Take rows 1..12164 and the seven track columns (4:10) of the joined window
# table as the input matrix. NOTE(review): 12164 is hard-coded; presumably the
# number of windows in the table — confirm.
dataMatrix <- as.matrix(atac.rna.tbp.mnase.h3k36.h3k18.h3k4.window[1:12164,4:10])
# One row per window, so the transform runs across the seven columns of each row.
data.wd1 <- WaveletTransform(dataMatrix)
##Function to perform a 2D wavelet transform on an input FAIMS data matrix
## input = all data
## Row-wise 2D wavelet transform.
## Each row of dataMatrix is reshaped to 512 x 102, zero-padded with extra
## columns to a square matrix, transformed with wavethresh::imwd(), and the
## resulting coefficient blocks are concatenated into one row of the output.
##
## Args:
##   m:            when m == 1, the padded input of items 1 and 57 is written
##                 to PNG files in the working directory.
##   dataMatrix:   one item per row; each row needs 512*102 values for the
##                 `dim<-` assignment below to succeed.
##   targetValues: its names are used as titles of the diagnostic images.
##   dimensions:   only read when cropped=TRUE (saved as DIMS); it is then
##                 unconditionally reset to c(512, 102).
##   cropped:      if TRUE (and DIMS[1] <= 512) keep only the DIMS[1] rows
##                 centred on row 256 and pad to DIMS[1] x DIMS[1].
## Returns:
##   Matrix with nrow(dataMatrix) rows (row names copied from dataMatrix) and
##   one column per concatenated coefficient.
WaveletTransform2D <- function(m,dataMatrix,targetValues,dimensions=c(512, 102),cropped=FALSE){
library(wavethresh)
library(RColorBrewer)
##----------------------------------------------------------------------
## FIND USEFUL VALUES --------------------------------------------------
##----------------------------------------------------------------------
dataMatrix = as.matrix(dataMatrix)
nDataItems = nrow(dataMatrix)
## nFeatures is computed but not used further in this function.
nFeatures = ncol(dataMatrix)
if (cropped==TRUE){
## Remember the requested crop size before `dimensions` is overwritten below.
DIMS = dimensions
## A crop taller than the 512-row image is not possible: fall back to no crop.
if(DIMS[1]>512) cropped = FALSE
}
dimensions=c(512, 102) ## override input for cropping to work properly (one less consideration)
##----------------------------------------------------------------------
## GENERATE THE WAVELET TRANSFORMED DATA -------------------------------
##----------------------------------------------------------------------
for (i in 1:nDataItems){
currentData = dataMatrix[i,]
## Reshape the flat row into a 512 x 102 matrix (column-major fill).
dim(currentData) = dimensions
if (!cropped){
## Pad with 512 - 102 zero columns to obtain a 512 x 512 square.
add_zeros = matrix(0,dimensions[1],(dimensions[1]-dimensions[2]))
} else if (cropped){
## Keep the DIMS[1] rows centred on row 256, then pad columns to DIMS[1].
currentData = currentData[(256-DIMS[1]/2+1):(256+DIMS[1]/2),]
add_zeros = matrix(0,DIMS[1],DIMS[1]-ncol(currentData))
# currentData = currentData[193:320,]
# add_zeros = matrix(0,128,26)
}
working = cbind(currentData,add_zeros)
# dim(working)
### PLOT DATA
## Diagnostic images: only for m == 1, and only for items 1 and 57 (the
## output file names label them "disease" and "control").
if (m==1) {
if (i==1) {
colourTable = heat.colors(100)
png(file="Cropped_Data_disease.png",pointsize =16)
image(working,main=names(targetValues)[i],col=colourTable)
dev.off()
}
if (i==57) {
colourTable = heat.colors(100)
png(file="Cropped_Data_control.png",pointsize =16)
image(working,main=names(targetValues)[i],col=colourTable)
dev.off()
}
}
#### WAVELET TRANSFORM
current.wd = imwd(working, filter.number=10, family="DaubLeAsymm", type="wavelet")
nLevels = current.wd$nlevels-1
## Start with the w0Lconstant component, then append four named components
## per level (names produced by lt.to.name(j, k) for k = 1..4).
working.data = current.wd$w0Lconstant
for (j in 0:nLevels) {
for (k in 1:4){
working.data = c(working.data,current.wd[[ lt.to.name(j, k) ]])
}
}
## The output width is only known after the first transform, so the result
## matrix is allocated on the first iteration.
if (i==1){
data.wd = matrix(0, nDataItems, length(working.data))
row.names(data.wd) = row.names(dataMatrix)
}
data.wd[i, ] = working.data
}
# length(current.wd2$w4L4) ## 8 levels, level 7 has length 16384 (4 components)
## level 6 has length 4096
## level 5 has length 1024 ....
# Number of columns for wavelet transform according to matrix size:
# 512x512--> 349525
# 256x256--> 87381
# 128x128--> 21845 ## additional benefit of dimensionality reduction
return(data.wd)
}
##*****************************************************************************
##*****************************************************************************
##----------------------------------------------------------------------
## ----------------------------------------
##----------------------------------------------------------------------
# Wavelet plots for the yeast track table.
# The seven track columns (4:10) of the joined window table.
data <- atac.rna.tbp.mnase.h3k36.h3k18.h3k4.window[,4:10]
# NOTE(review): with a single positional argument, `data` binds to the first
# parameter of WaveletTransform2D (m) and dataMatrix is left unsupplied, so
# this call cannot work as written — confirm what was intended.
data.wd2 <- WaveletTransform2D(data)
# Error in wd(df.vec) : Data length is not power of two
# Restrict to the first 8192 (= 2^13) rows so that wd() below receives a
# power-of-two length.
dataMatrix <- as.matrix(atac.rna.tbp.mnase.h3k36.h3k18.h3k4.window[1:8192,4:10])
data.wd1 <- WaveletTransform(dataMatrix)
# data.frame() on an unnamed matrix yields columns X1, X2, ...
# NOTE(review): these columns are the coefficients returned by
# WaveletTransform, which transforms across the columns of each row; whether
# X1, X2, X4, X5, X7 correspond to the atac / rna / mnase / h3k36 / h3k4
# tracks used in the variable and file names below should be confirmed.
df.wd1 <- data.frame(data.wd1)
df.vec <- df.wd1$X1
# Decimated transform (wd default) and stationary transform of the same vector.
wds <- wd(df.vec)
wdS <- wd(df.vec, type="station")
pdf("wds.plot.pdf")
plot(wds)
dev.off()
pdf("wds.image.pdf")
image.wd(wdS)
dev.off()
# The remaining blocks repeat the stationary transform + image plot for one
# column each.
df.atac <- df.wd1$X1
wds.atac <- wd(df.atac, type="station")
pdf("wds.image.atac.pdf")
image.wd(wds.atac)
dev.off()
df.rna <- df.wd1$X2
wds.rna <- wd(df.rna, type="station")
pdf("wds.image.rna.pdf")
image.wd(wds.rna)
dev.off()
df.mnase <- df.wd1$X4
wds.mnase <- wd(df.mnase, type="station")
pdf("wds.image.mnase.pdf")
image.wd(wds.mnase)
dev.off()
df.36 <- df.wd1$X5
wds.36 <- wd(df.36, type="station")
pdf("wds.image.h3k36.pdf")
image.wd(wds.36)
dev.off()
df.4 <- df.wd1$X7
wds.4 <- wd(df.4, type="station")
pdf("wds.image.h3k4.pdf")
image.wd(wds.4)
dev.off()
# bwa mem ref.fa reads.fq > aln-se.sam
## generate fastq file of sequences
## align to Y.lipolytica & S.cerevisiae genome assemblies
# Local machine: copy the supplementary CSV to the cluster under the name
# gkt135.sup.fastq.
# The directory name contains spaces and parentheses, so it must be quoted;
# unquoted, the shell reports a syntax error at "(".
cd "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/SEED/ExploratoryDataForModelGeneration"
scp /Users/27n/Downloads/gkt135_supplementary_data/nar-00060-h-2013-File006.csv noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/gkt135.sup.fastq
# Placeholder: no reference or reads are given yet (usage: bwa mem ref.fa reads.fq > aln-se.sam).
bwa mem
# Load randomForest; the example model fit below is kept commented out.
#install.packages("randomForest")
library(randomForest)
# The second line of the commented-out call used to be left uncommented, which
# made R fail to parse it; the whole statement is commented now.
#ozone.rf <- randomForest(Ozone ~ ., data = airquality, mtry = 3,
#                         importance = TRUE, na.action = na.omit)
#print(ozone.rf)
#plot(ozone.rf)
# E. coli: index the genome, build 1 kb and 500 bp windows, intersect them with
# the gene GFF and compute per-window nucleotide content.
lab_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm
ecoli_prefix=genome/GCF_000005845.2_ASM584v2_genomic
module load python
source "${lab_root}/scripts/loadcondaandes.sh"
conda activate "${lab_root}/Libraries/andes/envs/test"
cd "${lab_root}/projects/seed/e.coli"
# make 1kb windows of genome
# bedtools intersect to calculate density of feature per 1kb window
"${lab_root}/Libraries/andes/samtools/samtools" faidx "${ecoli_prefix}.fna"
cut -f1,2 "${ecoli_prefix}.fna.fai" \
  | sort -k 1,1 -k 2,2n > ecoli.sizes.genome
bedtools makewindows -g ecoli.sizes.genome -w 1000 > ecoli.1kb.windows.bed
bedtools makewindows -g ecoli.sizes.genome -w 500 > ecoli.500bp.windows.bed
## genes
bedtools intersect -wo -a ecoli.1kb.windows.bed -b "${ecoli_prefix}.gene.gff" > ecoli.gene.windows.bed
bedtools intersect -wo -a ecoli.500bp.windows.bed -b "${ecoli_prefix}.gene.gff" > ecoli.gene.windows500.bed
## GC content
# sed '1d' drops the first line of each bedtools nuc output.
bedtools nuc -fi "${ecoli_prefix}.fna" -bed ecoli.1kb.windows.bed \
  | sed '1d' > ecoli.GC.windows.bed
bedtools nuc -fi "${ecoli_prefix}.fna" -bed ecoli.500bp.windows.bed \
  | sed '1d' > ecoli.GC.windows500.bed
# Reference: Biopython Bio.SeqUtils.MeltingTemp API documentation — https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Function signature copied for reference (not a command to run):
# Bio.SeqUtils.MeltingTemp.Tm_NN(seq, check=True, strict=True, c_seq=None, shift=0, nn_table=None, tmm_table=None, imm_table=None, de_table=None, dnac1=25, dnac2=25, selfcomp=False, Na=50, K=0, Tris=0, Mg=0, dNTPs=0, saltcorr=5)
# Reference (the URL looks truncated — verify): https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/fasta_n
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# make fasta file of 1kb windows
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
bedtools getfasta -fi genome/GCF_000005845.2_ASM584v2_genomic.fna -bed ecoli.1kb.windows.bed -fo ecoli.1kb.fa
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
# Tabulate A/C/G/T counts, sequence length and GC fraction for every record of
# ecoli.1kb.fa into nucleotide_counts.tsv.
# Note: the "CG%" column holds a fraction in [0, 1], not a percentage.
# When pasting into an interactive Python prompt, press Enter once more after
# the last indented line to close the block.
from Bio import SeqIO

# The with-statement closes both files even if parsing fails part-way.
with open('ecoli.1kb.fa', 'r') as input_file, \
        open('nucleotide_counts.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        # An empty record would otherwise raise ZeroDivisionError.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)
exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
R
# Add a melting-temperature column to the per-window nucleotide counts using
# Tm = 64.9 + 41 * (nG + nC - 16.4) / (nA + nT + nG + nC), then save the table.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("nucleotide_counts.tsv", header = TRUE, sep = "\t")
df.melt <- mutate(df, MeltingTemp = 64.9 + 41 * (G+C-16.4) / (A+T+G+C))
write.table(df.melt, "nucleotide_counts_temp.txt",
            quote = FALSE, row.names = FALSE, sep = "\t")
q()
### 500bp windows
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
bedtools getfasta -fi genome/GCF_000005845.2_ASM584v2_genomic.fna -bed ecoli.500bp.windows.bed -fo ecoli.500bp.fa
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
# Tabulate A/C/G/T counts, sequence length and GC fraction for every record of
# ecoli.500bp.fa into nucleotide_counts_500bp.tsv.
# Note: the "CG%" column holds a fraction in [0, 1], not a percentage.
# When pasting into an interactive Python prompt, press Enter once more after
# the last indented line to close the block.
from Bio import SeqIO

# The with-statement closes both files even if parsing fails part-way.
with open('ecoli.500bp.fa', 'r') as input_file, \
        open('nucleotide_counts_500bp.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        # An empty record would otherwise raise ZeroDivisionError.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)
exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
R
# Add a melting-temperature column to the 500 bp nucleotide counts using
# Tm = 64.9 + 41 * (nG + nC - 16.4) / (nA + nT + nG + nC), then save the table.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("nucleotide_counts_500bp.tsv", header = TRUE, sep = "\t")
df.melt <- mutate(df, MeltingTemp = 64.9 + 41 * (G+C-16.4) / (A+T+G+C))
write.table(df.melt, "nucleotide_counts_500bp_temp.txt",
            quote = FALSE, row.names = FALSE, sep = "\t")
q()
# References for the tools used below:
#   https://academic.oup.com/bioinformatics/article/34/14/2499/4924718
#   https://github.com/Superzchen/iFeature/
#   https://github.com/feliixx/gotranseq
# Run iFeature (--type AAC) on the window FASTA files, and translate the 1 kb
# windows with gotranseq's transeq.
andes_libs=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
ecoli_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
ifeature_py="${andes_libs}/iFeature/iFeature.py"
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
#git clone https://github.com/Superzchen/iFeature
cd "${ecoli_dir}"
# NOTE(review): this run takes the nucleotide FASTA directly, while the comment
# below says the sequences should be converted to protein first — confirm.
python "${ifeature_py}" --file ecoli.1kb.fa --type AAC --out ecoli.structure.txt
# convert from nucleotide to protein sequence first
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
#git clone https://github.com/feliixx/gotranseq.git
cd "${ecoli_dir}"
"${andes_libs}/gotranseq/transeq" ecoli.1kb.fa ecoli.1kb.protein.fa
# file.fna / test.protein.fa look like placeholder names.
gotranseq --sequence file.fna --outseq test.protein.fa
python "${ifeature_py}" --file test.protein.fa --type AAC --out test.txt
python "${ifeature_py}" --file ecoli.500bp.fa --type AAC --out 500bp.protein.structure.fa
# Attach genome coordinates to the RNA-seq gene table via the EcoGene id, then
# average FPKM per 1 kb and per 500 bp window.
library(tidyr)
library(dplyr)

ecoli.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli"
# Append avg.fpkm = mean of column V12 over all rows of the same window.
mean.fpkm.by.window <- function(hits) {
  mutate(group_by(hits, V1, V2, V3), avg.fpkm = mean(V12))
}

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/genome")
# sed '1d' GCF_000005845.2_ASM584v2_genomic.gff | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' > GCF_000005845.2_ASM584v2_genomic.txt
annotation <- read.delim("GCF_000005845.2_ASM584v2_genomic.txt", header = FALSE, sep = "\t")
gene <- subset(annotation, V3 == "gene")
# Split the attribute column at "EcoGene:"; the 7 characters that follow are
# used as the gene id.
gene.id <- separate(gene, V9, c("id1", "id2"), sep = "EcoGene:")
gene.id$gene_id <- substr(gene.id$id2, 1, 7)

setwd(ecoli.dir)
rna <- read.delim("GSM2267479_Sample-1.genes.results.txt", header = TRUE, sep = "\t")
rna.id <- left_join(rna, gene.id, by = "gene_id")
# Reorder columns by position and drop rows containing any NA.
rna.id.idf <- na.omit(rna.id[, c(8, 11, 12, 1, 3:7)])
write.table(rna.id.idf, "GSM2267479.fpkm.coord.txt",
            quote = FALSE, row.names = FALSE, sep = "\t")

# calculate density (avg fpkm per 1kb window)
#sed '1d' GSM2267479.fpkm.coord.txt > GSM2267479.fpkm.coord.bed
#bedtools intersect -wo -a ecoli.1kb.windows.bed -b GSM2267479.fpkm.coord.bed > ecoli.rnaseq.windows.bed
setwd(ecoli.dir)
window <- read.delim("ecoli.rnaseq.windows.bed", header = FALSE, sep = "\t")
window.df <- mean.fpkm.by.window(window)
# Column 14 is the appended avg.fpkm.
window.uniq <- unique(window.df[, c(1:3, 14)])
write.table(window.uniq, "ecoli.rnaseq.average.windows.bed",
            quote = FALSE, row.names = FALSE, sep = "\t")

#bedtools intersect -wo -a ecoli.500bp.windows.bed -b GSM2267479.fpkm.coord.bed > ecoli.rnaseq.windows500.bed
setwd(ecoli.dir)
window <- read.delim("ecoli.rnaseq.windows500.bed", header = FALSE, sep = "\t")
window.df <- mean.fpkm.by.window(window)
window.uniq <- unique(window.df[, c(1:3, 14)])
write.table(window.uniq, "ecoli.rnaseq.average.windows500.bed",
            quote = FALSE, row.names = FALSE, sep = "\t")
# All of this is in R
# Heat map of the CWT coefficient matrix stored in peakInfo547$wCoefs
# (peakInfo547 must already exist in the session; it is not created here).
colVec <- hcl.colors(256, "PuBu", rev = TRUE)
xTickInterval <- 250
scales <- seq(1, 48, 2)
wCoefs <- peakInfo547$wCoefs
xMax <- dim(wCoefs)[1]
image(1:xMax, scales, wCoefs,
      col = colVec, axes = FALSE,
      xlab = 'windows (1kb)', ylab = 'CWT coefficient scale')
# Axes are drawn by hand because image() was called with axes = FALSE.
axis(1, at = seq(1, xMax, by = xTickInterval))
axis(2, at = c(1, seq(10, 48, by = 10)))
box()
# One-time installation of MassSpecWavelet:
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("MassSpecWavelet")
# (shell) environment for the R session below
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# Load the per-window tables and reduce each one to a plain vector
# (gene.df, structure.df, gc.df, temp.df, rna.df).
library(dplyr)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gene <- read.delim("ecoli.gene.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
structure <- read.delim("ecoli.structure.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
nuc <- read.delim("nucleotide_counts_temp.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
rnaseq <- read.delim("ecoli.rnaseq.average.windows.bed", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
window <- read.delim("ecoli.1kb.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
# Genes per window: count the rows sharing (V1, V2, V3), keep one row per window.
gene.bin <- gene %>%
  group_by(V1, V2, V3) %>%
  mutate(gene.count = n())
gene.count <- unique(gene.bin[, c(1:3, 14)])
gene.df <- gene.count$gene.count
# Remaining tracks are picked by column position.
# NOTE(review): columns 7 and 8 of nuc are presumably GC content and melting
# temperature (going by the variable names) - confirm against the file header.
structure.df <- structure[, 2]
gc.df <- nuc[, 7]
temp.df <- nuc[, 8]
rna.df <- rnaseq[, 4]
## Plot the 2-D CWT coefficients as image (It may take a while!)
# Melting Temperature
# CWT of the melting-temperature track (temp.df) with the Mexican-hat wavelet,
# followed by ridge-based peak calling. Writes three PDFs: coefficient heat map,
# local maxima, and the called peaks on the raw signal.
scales <- seq(1, 64, 3)
wCoefs <- cwt(temp.df, scales=scales, wavelet='mexh')
pdf("ecoli.temp.wavelet.pdf")
image(1:length(temp.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, xlab='m/z index', ylab='CWT coefficient scale', main='CWT Temp coefficients')
box()
dev.off()
# Prepend the raw signal as a "scale 0" column before searching for maxima.
wCoefs <- cbind(as.vector(temp.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
# Fix: the result used to be assigned to `h`, so getRidge()/plotLocalMax()
# below ran on an undefined (or stale, from an earlier section) `localMax`.
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
SNR.Th <- 3
nearbyPeak <- TRUE
majorPeakInfo <- identifyMajorPeaks(temp.df, ridgeList, wCoefs, SNR.Th = SNR.Th, nearbyPeak=nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex
pdf("ecoli.temp.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()
plotRange <- c(1,length(temp.df))
pdf("ecoli.temp.peak.pdf")
# Title fixed: it was copy-pasted as "Expression peaks" for every track.
plotPeak(temp.df, peakIndex, range=plotRange, main=paste('Identified Melting Temperature peaks with SNR >', SNR.Th))
dev.off()
# GC content
# CWT of the GC-content track (gc.df) with the Mexican-hat wavelet, followed by
# ridge-based peak calling. Writes three PDFs: coefficient heat map, local
# maxima, and the called peaks on the raw signal.
scales <- seq(1, 64, 3)
wCoefs <- cwt(gc.df, scales=scales, wavelet='mexh')
pdf("ecoli.gc.wavelet.pdf")
image(1:length(gc.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, xlab='m/z index', ylab='CWT coefficient scale', main='CWT GC coefficients')
box()
dev.off()
# Prepend the raw signal as a "scale 0" column before searching for maxima.
wCoefs <- cbind(as.vector(gc.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
SNR.Th <- 3
nearbyPeak <- TRUE
majorPeakInfo <- identifyMajorPeaks(gc.df, ridgeList, wCoefs, SNR.Th = SNR.Th, nearbyPeak=nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex
pdf("ecoli.gc.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()
plotRange <- c(1,length(gc.df))
pdf("ecoli.gc.peak.pdf")
# Title fixed: it was copy-pasted as "Expression peaks" on the GC plot.
plotPeak(gc.df, peakIndex, range=plotRange, main=paste('Identified GC content peaks with SNR >', SNR.Th))
dev.off()
# Structure
# CWT of the structure track (structure.df) with the Mexican-hat wavelet,
# followed by ridge-based peak calling. Writes three PDFs: coefficient heat
# map, local maxima, and the called peaks on the raw signal.
scales <- seq(1, 64, 3)
wCoefs <- cwt(structure.df, scales=scales, wavelet='mexh')
pdf("ecoli.structure.wavelet.pdf")
image(1:length(structure.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, xlab='m/z index', ylab='CWT coefficient scale', main='CWT Structure coefficients')
box()
dev.off()
# Prepend the raw signal as a "scale 0" column before searching for maxima.
wCoefs <- cbind(as.vector(structure.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
SNR.Th <- 3
nearbyPeak <- TRUE
majorPeakInfo <- identifyMajorPeaks(structure.df, ridgeList, wCoefs, SNR.Th = SNR.Th, nearbyPeak=nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex
pdf("ecoli.structure.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()
plotRange <- c(1,length(structure.df))
pdf("ecoli.structure.peak.pdf")
# Title fixed: it was copy-pasted as "Expression peaks" on the structure plot.
plotPeak(structure.df, peakIndex, range=plotRange, main=paste('Identified Structure peaks with SNR >', SNR.Th))
dev.off()
# Expression
# CWT of the expression track (rna.df) with the Mexican-hat wavelet, then
# ridge-based peak calling; three PDFs are written along the way.
scales <- seq(from = 1, to = 64, by = 3)
n.rna <- length(rna.df)
wCoefs <- cwt(rna.df, scales = scales, wavelet = 'mexh')

# 1) coefficient heat map
pdf("ecoli.expression.wavelet.pdf")
image(1:n.rna, scales, wCoefs,
      col = terrain.colors(256), axes = FALSE,
      xlab = 'm/z index', ylab = 'CWT coefficient scale',
      main = 'CWT Expression coefficients')
box()
dev.off()

# 2) local maxima / ridges; the raw signal is prepended as a "scale 0" column
wCoefs <- cbind(as.vector(rna.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
nearbyPeak <- TRUE
SNR.Th <- 3
majorPeakInfo <- identifyMajorPeaks(rna.df, ridgeList, wCoefs,
                                    SNR.Th = SNR.Th, nearbyPeak = nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex
pdf("ecoli.expression.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()

# 3) called peaks on the raw signal
plotRange <- c(1, n.rna)
pdf("ecoli.expression.peak.pdf")
plotPeak(rna.df, peakIndex, range = plotRange,
         main = paste('Identified Expression peaks with SNR >', SNR.Th))
dev.off()
# Gene Density
# CWT of the genes-per-window track (gene.df) with the Mexican-hat wavelet,
# followed by ridge-based peak calling. Writes three PDFs: coefficient heat
# map, local maxima, and the called peaks on the raw signal.
scales <- seq(1, 64, 3)
wCoefs <- cwt(gene.df, scales=scales, wavelet='mexh')
pdf("ecoli.gene.wavelet.pdf")
image(1:length(gene.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, xlab='m/z index', ylab='CWT coefficient scale', main='CWT Gene Density coefficients')
box()
dev.off()
# Prepend the raw signal as a "scale 0" column before searching for maxima.
wCoefs <- cbind(as.vector(gene.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
SNR.Th <- 3
nearbyPeak <- TRUE
majorPeakInfo <- identifyMajorPeaks(gene.df, ridgeList, wCoefs, SNR.Th = SNR.Th, nearbyPeak=nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex
pdf("ecoli.gene.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()
plotRange <- c(1,length(gene.df))
pdf("ecoli.gene.peak.pdf")
# Title fixed: it was copy-pasted as "Expression peaks" on the gene-density plot.
plotPeak(gene.df, peakIndex, range=plotRange, main=paste('Identified Gene Density peaks with SNR >', SNR.Th))
dev.off()
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# generate fastq file of sequences and blast to reference
# The supplemental tables were copied over from the laptop with:
#cd /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Poplar/SEED/ExploratoryDataForModelGeneration/e.coli/gky572_supplemental_files
#scp *.txt noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Skip the header row (NR > 1) and emit one FASTA record per table row:
# column 1 becomes the record name, column 2 the sequence.
awk 'NR > 1 {print ">"$1"\n"$2}' DataS1.txt > ecoli.gRNA.fasta
awk 'NR > 1 {print ">"$1"\n"$2}' DataS1.rbs.txt > ecoli.gRNA.rbs.fasta
## blast
#conda install blast
# Install NCBI BLAST+ 2.11.0 next to the other tools, build a nucleotide
# database from the E. coli reference and search the gRNA FASTA files against it.
blast_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
blast_home="$blast_root/ncbi-blast-2.11.0+"
cd "$blast_root"
# Fetch from the versioned release directory: LATEST/ presumably only carries
# the newest release, so a 2.11.0 file name there breaks once NCBI moves on.
wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.11.0/ncbi-blast-2.11.0+-x64-linux.tar.gz
tar zxvpf ncbi-blast-2.11.0+-x64-linux.tar.gz
# PATH now points at the tree unpacked above. It used to name
# $HOME/ncbi-blast-2.10.1+/bin, a different version in a directory that is
# never created here.
export PATH="$PATH:$blast_home/bin"
echo "$PATH"
# -p: do not fail when the directory is left over from an earlier run.
mkdir -p "$HOME/blastdb"
export BLASTDB="$HOME/blastdb"
# (the former `set BLASTDB=...` line was csh syntax; in bash it only replaced
# the positional parameter $1 and is dropped)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
"$blast_home/bin/makeblastdb" -in genome/GCF_000005845.2_ASM584v2_genomic.fna -dbtype nucl
"$blast_home/bin/blastn" -query ecoli.gRNA.fasta -db genome/GCF_000005845.2_ASM584v2_genomic.fna -out ecoli.gRNA.blast.tab -outfmt 6 -evalue 0.0001 -task blastn -num_threads 10
# NOTE(review): ecoli.gRNA.all.fasta is not produced by any command visible in
# this part of the notes - confirm where it comes from.
"$blast_home/bin/blastn" -query ecoli.gRNA.all.fasta -db genome/GCF_000005845.2_ASM584v2_genomic.fna -out ecoli.gRNA.all.blast.tab -outfmt 6 -evalue 0.0001 -task blastn -num_threads 10
# ## bwa
# git clone https://github.com/bwa-mem2/bwa-mem2
# cd bwa-mem2
# git submodule init
# git submodule update
# # Compile and run
# make
# ./bwa-mem2
# # Indexing the reference sequence (Requires 28N GB memory where N is the size of the reference sequence).
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# bwa-mem2/bwa-mem2 index -p genome/GCF_000005845.2_ASM584v2 genome/GCF_000005845.2_ASM584v2_genomic.fna
# # Mapping ... Run "./bwa-mem2 mem" to get all options
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# bwa-mem2/bwa-mem2 mem -t 2 genome/GCF_000005845.2_ASM584v2 ecoli.gRNA.fasta > out.sam
#
#
# # biopython
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/genome
#
# python
# from Bio import SeqIO
#
# for long_sequence_record in SeqIO.parse(open('GCF_000005845.2_ASM584v2_genomic.fna'), 'fasta'):
# long_sequence = str(long_sequence_record.seq)
#
# for short_sequence_record in SeqIO.parse(open('ecoli.gRNA.fasta'), 'fasta'):
# short_sequence = str(short_sequence_record.seq)
#
# if short_sequence in long_sequence:
# chr =
# start = long_sequence.index(short_sequence) + 1
# stop = start + len(short_sequence) - 1
# print (short_sequence_record.id, chr, start, stop)
# sequences = (short_sequence_record.id, chr, start, stop)
# with open("example.fasta", "w") as output_handle:
# SeqIO.write(sequences, output_handle, "fasta")
#######################################
# Convert BLAST tabular output (-outfmt 6) into 4-column BED:
#   subject id, start, end, query id.
# Minus-strand hits (sstart > send) are printed first, then plus-strand hits,
# which is the order the former tmp1.bed/tmp2.bed concatenation produced.
# Hits with sstart == send are skipped, as before.
# BLAST coordinates are 1-based inclusive while BED starts are 0-based, so the
# lower coordinate is shifted by -1 (it used to be copied unchanged, which cut
# the first base off every interval).
# NOTE: the row counts recorded in the R notes were obtained with the
# unshifted starts and may change slightly after a re-run.
# Arguments: $1 - BLAST outfmt 6 file
# Outputs:   BED lines on stdout
#######################################
blast6_to_bed() {
  awk -v OFS='\t' '$9 > $10 { print $2, $10 - 1, $9, $1 }' "$1"
  awk -v OFS='\t' '$10 > $9 { print $2, $9 - 1, $10, $1 }' "$1"
}

# gRNA hits -> BED -> overlap with the 1kb and 500bp windows
blast6_to_bed ecoli.gRNA.blast.tab > ecoli.gRNA.blast.bed
bedtools intersect -wo -a ecoli.1kb.windows.bed -b ecoli.gRNA.blast.bed > ecoli.gRNA.windows.bed
bedtools intersect -wo -a ecoli.500bp.windows.bed -b ecoli.gRNA.blast.bed > ecoli.gRNA.windows500.bed

# Same for the "all" gRNA set (500bp windows only)
blast6_to_bed ecoli.gRNA.all.blast.tab > ecoli.gRNA.all.blast.bed
bedtools intersect -wo -a ecoli.500bp.windows.bed -b ecoli.gRNA.all.blast.bed > ecoli.gRNA.all.windows500.bed
# R
# Per 1kb window: attach the sgRNA score (DataS4) and quality table (DataS6)
# to every gRNA/window overlap, then summarise score and quality per window.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.gRNA.windows.bed", header = FALSE, sep = "\t")
window <- mutate(window, sgRNA = V7)   # V7 holds the gRNA name from the BED file
score <- read.delim("DataS4.txt", header = TRUE, sep = "\t")
names(score) <- c("sgRNA", "score")
quality <- read.delim("DataS6.txt", header = TRUE, sep = "\t")
window.score <- window %>%
  left_join(score, by = "sgRNA")
window.score.quality <- window.score %>%
  left_join(quality, by = "sgRNA")
write.table(window.score.quality, "window.score.quality.txt", quote = FALSE, row.names = FALSE, sep = "\t")
nrow(window.score.quality)
# 57283
length(unique(window.score.quality$sgRNA))
# 55593

# Mean score per window, first letting NA propagate ...
window.score.quality.avg <- window.score.quality %>%
  group_by(V1, V2, V3) %>%
  mutate(score.avg = mean(score))
write.table(unique(window.score.quality.avg[, c(1:3, 14)]), "window.score.avg.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# ... then ignoring NA scores.
window.score.quality.avg <- window.score.quality %>%
  group_by(V1, V2, V3) %>%
  mutate(score.avg = mean(score, na.rm = TRUE))
write.table(unique(window.score.quality.avg[, c(1:3, 14)]), "window.score.avg.rmna.txt", quote = FALSE, row.names = FALSE, sep = "\t")

# Most frequent Quality label per window (ties keep every tied label).
count <- window.score.quality %>%
  group_by(V1, V2, V3, Quality) %>%
  mutate(quality.count = n())
count.uniq <- unique(count[, c(1:3, 13, 14)])
count.max <- count.uniq %>%
  group_by(V1, V2, V3) %>%
  filter(quality.count == max(quality.count))
write.table(count.max[, 1:4], "window.quality.max.txt", quote = FALSE, row.names = FALSE, sep = "\t")
## 500bp
# Same per-window score/quality summary as for the 1kb windows, on the 500bp
# overlaps. Only the NA-ignoring mean is written here.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.gRNA.windows500.bed", header = FALSE, sep = "\t")
window <- mutate(window, sgRNA = V7)
score <- read.delim("DataS4.txt", header = TRUE, sep = "\t")
names(score) <- c("sgRNA", "score")
quality <- read.delim("DataS6.txt", header = TRUE, sep = "\t")
window.score <- window %>%
  left_join(score, by = "sgRNA")
window.score.quality <- window.score %>%
  left_join(quality, by = "sgRNA")
write.table(window.score.quality, "window500.score.quality.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# NOTE(review): the two counts below are identical to the ones noted for the
# 1kb windows - possibly carried over; re-check on the next run.
nrow(window.score.quality)
# 57283
length(unique(window.score.quality$sgRNA))
# 55593
window.score.quality.avg <- window.score.quality %>%
  group_by(V1, V2, V3) %>%
  mutate(score.avg = mean(score, na.rm = TRUE))
write.table(unique(window.score.quality.avg[, c(1:3, 14)]), "window500.score.avg.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# Most frequent Quality label per window (ties keep every tied label).
count <- window.score.quality %>%
  group_by(V1, V2, V3, Quality) %>%
  mutate(quality.count = n())
count.uniq <- unique(count[, c(1:3, 13, 14)])
count.max <- count.uniq %>%
  group_by(V1, V2, V3) %>%
  filter(quality.count == max(quality.count))
write.table(count.max[, 1:4], "window500.quality.max.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# One-time installation of the randomForest package (left commented out):
#install.packages("randomForest")
# Attach randomForest for the model fits further down in these notes.
library(randomForest)
# Commented-out example fit on the built-in airquality data, kept as a
# reference for the call syntax:
#ozone.rf <- randomForest(Ozone ~ ., data = airquality, mtry = 3, importance = TRUE, na.action = na.omit)
#print(ozone.rf)
#plot(ozone.rf)
# (shell) environment for the R session below
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# Build the random-forest input: CWT coefficients of five tracks at every
# scale, keyed by window index, next to the per-window average gRNA score.
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gene <- read.delim("ecoli.gene.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
structure <- read.delim("ecoli.structure.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
nuc <- read.delim("nucleotide_counts_temp.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
rnaseq <- read.delim("ecoli.rnaseq.average.windows.bed", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
window <- read.delim("ecoli.1kb.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
score <- read.delim("window.score.avg.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# Genes per window, then one plain vector per track.
gene.bin <- gene %>%
  group_by(V1, V2, V3) %>%
  mutate(gene.count = n())
gene.count <- unique(gene.bin[, c(1:3, 14)])
gene.df <- gene.count$gene.count
structure.df <- structure[, 2]
gc.df <- nuc[, 7]
temp.df <- nuc[, 8]
rna.df <- rnaseq[, 4]
## Plot the 2-D CWT coefficients as image (It may take a while!)
scales <- seq(1, 64, 3)
wCoefs.temp <- cwt(temp.df, scales = scales, wavelet = 'mexh')
wCoefs.gc <- cwt(gc.df, scales = scales, wavelet = 'mexh')
wCoefs.structure <- cwt(structure.df, scales = scales, wavelet = 'mexh')
wCoefs.rna <- cwt(rna.df, scales = scales, wavelet = 'mexh')
wCoefs.gene <- cwt(gene.df, scales = scales, wavelet = 'mexh')
# Long format: Var1 = row (position in the track), Var2 = column (scale), value.
wCoefs.temp.melt <- melt(wCoefs.temp)
wCoefs.gc.melt <- melt(wCoefs.gc)
wCoefs.structure.melt <- melt(wCoefs.structure)
wCoefs.rna.melt <- melt(wCoefs.rna)
wCoefs.gene.melt <- melt(wCoefs.gene)
names(window) <- c("chr", "start", "end")
window$Var1 <- seq.int(nrow(window))
names(score) <- c("chr", "start", "end", "score")
# NOTE(review): the joins below match tracks to windows by row position (Var1).
# gene.df and rna.df come from intersect outputs, so presumably they only hold
# windows with at least one overlap - confirm their rows line up with
# ecoli.1kb.windows.bed.
window.score <- window %>%
  left_join(score, by = c("chr", "start", "end"))
window.score.temp <- window.score %>%
  left_join(wCoefs.temp.melt, by = "Var1")
window.temp.gc <- window.score.temp %>%
  left_join(wCoefs.gc.melt, by = c("Var1", "Var2"))
window.temp.gc.structure <- window.temp.gc %>%
  left_join(wCoefs.structure.melt, by = c("Var1", "Var2"))
window.temp.gc.structure.rna <- window.temp.gc.structure %>%
  left_join(wCoefs.rna.melt, by = c("Var1", "Var2"))
window.temp.gc.structure.rna.gene <- window.temp.gc.structure.rna %>%
  left_join(wCoefs.gene.melt, by = c("Var1", "Var2"))
# Rename positionally: the joined "value" columns are in join order.
names(window.temp.gc.structure.rna.gene) <- c("chr", "start", "end", "window", "score", "scale", "melting.temp", "gc.content", "structure", "fpkm", "gene.density")
write.table(window.temp.gc.structure.rna.gene, "window.temp.gc.structure.rna.gene.df.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# Long table (window, score, scale, variable, value) used as model input.
df.melt <- melt(window.temp.gc.structure.rna.gene[, 4:11], id.vars = c("window", "score", "scale"))
write.table(df.melt, "window.temp.gc.structure.rna.gene.input.txt", quote = FALSE, row.names = FALSE, sep = "\t")
https://www.listendata.com/2014/11/random-forest-with-r.html - Two parameters are important in the random forest algorithm: - the number of trees used in the forest (ntree) and - the number of variables randomly tried at each split (mtry). - First set mtry to its default value (the square root of the number of predictors for classification; one third of the predictors for regression) and search for the optimal ntree value. To find the number of trees that corresponds to a stable classifier, build random forests with different ntree values (100, 200, 300, ..., 1,000): build 10 RF classifiers for each ntree value, record the OOB error rate, and look for the number of trees at which the out-of-bag error rate stabilizes and reaches its minimum.
# (shell) environment for the R session below
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# First random-forest regressions of score on the long-format input table:
# once on the scale == 1 rows only, once on all rows.
library(randomForest)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("window.temp.gc.structure.rna.gene.input.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df <- na.omit(df)
df1 <- na.omit(subset(df, scale == 1))
## Regression:
set.seed(131)
rf <- randomForest(score ~ ., data = df1, mtry = 3, importance = TRUE, na.action = na.omit)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df1, mtry = 3, importance = TRUE, na.action = na.omit)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 3
#
# Mean of squared residuals: 15.68781
# % Var explained: 46.77
## Show "importance" of variables: higher value mean more important:
round(importance(rf), 2)
# %IncMSE IncNodePurity
# window 78.55 44961.20
# scale 0.00 0.00
# variable -9.05 1786.66
# value -16.74 16877.36
## "x" can be a matrix instead of a data frame:
# (toy classification example on random data, unrelated to the E. coli table)
set.seed(17)
x <- matrix(runif(500), nrow = 100)
y <- gl(2, 50)
myrf <- randomForest(x, y)
print(myrf)
print(predict(myrf, x))
# Same regression as above, now on every scale.
set.seed(131)
rf <- randomForest(score ~ ., data = df, mtry = 3, importance = TRUE, na.action = na.omit)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df, mtry = 3, importance = TRUE, na.action = na.omit)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 3
#
# Mean of squared residuals: 0.3390503
# % Var explained: 98.85
# One-time installation of reprtree and its dependencies (left commented out):
# options(repos='http://cran.rstudio.org')
# have.packages <- installed.packages()
# cran.packages <- c('devtools','plotrix','randomForest','tree')
# to.install <- setdiff(cran.packages, have.packages[,1])
# if(length(to.install)>0) install.packages(to.install)
# library(devtools)
# if(!('reprtree' %in% installed.packages())){
# install_github('araastat/reprtree')
# }
# for(p in c(cran.packages, 'reprtree')) eval(substitute(library(pkg), list(pkg=p)))
library(randomForest)
library(reprtree)
# Refit on the full table, printing the OOB error every 100 trees.
model <- randomForest(score ~ ., data = df,
                      ntree = 500, mtry = 3,
                      importance = TRUE, do.trace = 100,
                      na.action = na.omit)
# | Out-of-bag |
# Tree | MSE %Var(y) |
# 100 | 0.3703 1.26 |
# 200 | 0.3555 1.21 |
# 300 | 0.3558 1.21 |
# 400 | 0.3517 1.19 |
# 500 | 0.347 1.18 |
# Pull the first tree out of the forest and convert it with reprtree.
tree <- getTree(model, k = 1, labelVar = TRUE)
realtree <- reprtree:::as.tree(tree, model)
# determine best parameters to use... mtry
# Fix: tuneRF() takes the predictors and the response separately. df[-1] only
# dropped `window`, so `score` itself was still among the predictors and the
# tuning error was artificially low. The predictor frame now excludes both
# `window` and `score`, matching what the formula fit below uses.
rf.predictors <- df[, setdiff(names(df), c("window", "score"))]
mtry <- tuneRF(rf.predictors, df$score, ntreeTry=500, stepFactor=1.5, improve=0.01, trace=TRUE, plot=TRUE)
# which.min() keeps best.m a single value even if two mtry values tie.
best.m <- mtry[which.min(mtry[, 2]), 1]
print(mtry)
# Output recorded before the fix (score still among the predictors):
# mtry OOBError
# 1 1 1.999363
print(best.m)
# 1
# re-run model with new mtry
set.seed(71)
rf <-randomForest(score~.,data=df[-1], mtry=1, importance=TRUE,ntree=500)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df, mtry = best.m, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 1
#
# Mean of squared residuals: 23.01933
# % Var explained: 21.89
# Evaluate variable importance
importance(rf)
# %IncMSE IncNodePurity
# scale 19.72781 6303.888
# variable 36.83138 4169.115
# value 38.27873 38703.914
pdf("rf.residual.pdf")
# Fix: residuals are observed minus predicted score. The plot used df$scale
# on both axes, i.e. scale minus predicted score.
plot(rf$y, rf$y - rf$predicted)
dev.off()
# (shell) environment for the R session below
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# Random forest with one row per (window, scale) and one predictor column per
# track (fpkm, gc.content, gene.density, melting.temp, structure) plus scale.
library(randomForest)
library(reshape2)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("window.temp.gc.structure.rna.gene.input.txt", header=T, sep="\t", stringsAsFactors = F)
df <- na.omit(df)
df.dcast <- df %>% dcast(window + score + scale ~ variable, value.var = "value")
df.dcast <- na.omit(df.dcast)
# Fix: tuneRF() takes the predictors and the response separately.
# df.dcast[-1] only dropped `window`, so `score` was offered as a predictor of
# itself. That explains the recorded tuning error of about 0.01 next to a
# model MSE of about 25.9 further down. The predictor frame now excludes both
# `window` and `score`.
rf.predictors <- df.dcast[, setdiff(names(df.dcast), c("window", "score"))]
mtry <- tuneRF(rf.predictors, df.dcast$score, ntreeTry=500, stepFactor=1.5, improve=0.01, trace=TRUE, plot=TRUE)
# which.min() keeps best.m a single value even if two mtry values tie.
best.m <- mtry[which.min(mtry[, 2]), 1]
print(mtry)
# Output recorded before the fix (score still among the predictors):
# mtry OOBError
# 2 2 0.24481286
# 3 3 0.04999674
# 4 4 0.01510117
# 6 6 0.01025664
# 7 7 0.01411472
print(best.m)
# 6
set.seed(71)
rf <-randomForest(score~.,data=df.dcast[-1], mtry=best.m, importance=TRUE,ntree=500)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df.dcast[-1], mtry = best.m, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 6
#
# Mean of squared residuals: 25.8724
# % Var explained: 14.68
importance(rf)
# %IncMSE IncNodePurity
# scale 72.35430 19807.82
# fpkm 95.81782 47881.55
# gc.content 44.28466 21671.31
# gene.density 87.12267 47738.73
# melting.temp 41.96295 21588.83
# structure 70.59875 44338.57
pdf("rf.importance.pdf")
varImpPlot(rf)
dev.off()
# Residuals (observed minus OOB-predicted score) against the observed score.
pdf("rf.feature.residual.pdf")
plot(rf$y, rf$y - rf$predicted)
dev.off()
# (shell) environment for the R session below
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# Random forest with one row per window and one predictor column per
# (track, scale) combination, e.g. "fpkm1", "gc.content4", ...
library(randomForest)
library(reshape2)
library(dplyr)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("window.temp.gc.structure.rna.gene.input.txt", header=T, sep="\t", stringsAsFactors = F)
df <- na.omit(df)
# Fuse track name and scale into a single column key, then go wide.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
# Fix: tuneRF() takes the predictors and the response separately.
# df.dcast[-1] only dropped `window`, so `score` was offered as a predictor of
# itself. The recorded "best" mtry of 111 equals the number of columns
# including `score`. The predictor frame now excludes both `window` and `score`.
rf.predictors <- df.dcast[, setdiff(names(df.dcast), c("window", "score"))]
mtry <- tuneRF(rf.predictors, df.dcast$score, ntreeTry=500, stepFactor=1.5, improve=0.01, trace=TRUE, plot=TRUE)
# which.min() keeps best.m a single value even if two mtry values tie.
best.m <- mtry[which.min(mtry[, 2]), 1]
print(mtry)
# Output recorded before the fix (score still among the predictors):
# mtry OOBError
# 25 25 5.7438944
# 37 37 2.9917658
# 55 55 1.5460130
# 82 82 0.6667672
# 111 111 0.3567881
print(best.m)
# 111
set.seed(131)
# mtry is left at the manually chosen 82 rather than best.m.
# NOTE(review): revisit this value once tuneRF has been re-run.
rf <- randomForest(score ~ ., data=df.dcast[-1], mtry=82, importance=TRUE, ntree=500)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df.dcast[-1], mtry = 82, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 82
#
# Mean of squared residuals: 30.68943
# % Var explained: -1.21
importance(rf)
pdf("rf.all.importance.pdf")
varImpPlot(rf)
dev.off()
# Residuals (observed minus OOB-predicted score) against the observed score.
pdf("rf.feature.scale.residual.pdf")
plot(rf$y, rf$y - rf$predicted)
dev.off()
%IncMSE is the most robust and informative measure. It is the increase in the MSE of the predictions (estimated with out-of-bag CV) that results from variable j being permuted (its values randomly shuffled).
Procedure: grow a regression forest and compute its OOB MSE; call this mse0. Then, for each variable j: permute the values of column j, predict again, and compute the OOB MSE, mse(j). The %IncMSE of the j'th variable is (mse(j) - mse0) / mse0 * 100%. The higher the number, the more important the variable.
IncNodePurity relates to the loss function by which the best splits are chosen. The loss function is MSE for regression and Gini impurity for classification. More useful variables achieve higher increases in node purity, that is, they allow splits with a high inter-node 'variance' and a small intra-node 'variance'. IncNodePurity is biased and should only be used if the extra computation time of calculating %IncMSE is unacceptable. Since it only takes ~5-25% extra time to calculate %IncMSE, this would almost never happen.
# (shell) environment for the R session below
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# CWT and peak calling on the per-window average gRNA efficiency score
# (1kb windows, NA-ignoring average, windows without a score dropped).
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Alternative inputs that were tried are kept commented out.
#score <- read.delim("window.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("window.score.avg.rmna.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
#score <- read.delim("window.score.quality.txt", header=T, sep="\t", stringsAsFactors = F)
score.na <- na.omit(score)
#score.df <- score[,4]
#score.df[is.na(score.df)] <- 0
score.df <- score.na[, 4]
#score.df <- score.na[,10]
## Plot the 2-D CWT coefficients as image (It may take a while!)
# Efficiency score
scales <- seq(from = 1, to = 86, by = 3)
n.score <- length(score.df)
wCoefs <- cwt(score.df, scales = scales, wavelet = 'mexh')
#pdf("ecoli.score.wavelet.pdf")
pdf("ecoli.score.nona.wavelet.pdf")
#pdf("ecoli.score.raw.wavelet.pdf")
image(1:n.score, scales, wCoefs,
      col = terrain.colors(256), axes = FALSE,
      xlab = 'm/z index', ylab = 'CWT coefficient scale',
      main = 'CWT Efficiency Score coefficients')
box()
dev.off()
# Local maxima / ridges; the raw signal is prepended as a "scale 0" column.
wCoefs <- cbind(as.vector(score.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
nearbyPeak <- TRUE
SNR.Th <- 3
majorPeakInfo <- identifyMajorPeaks(score.df, ridgeList, wCoefs,
                                    SNR.Th = SNR.Th, nearbyPeak = nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex
pdf("ecoli.score.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()
# Called peaks on the raw signal.
plotRange <- c(1, n.score)
pdf("ecoli.score.peak.pdf")
plotPeak(score.df, peakIndex, range = plotRange,
         main = paste('Identified Efficiency Score peaks with SNR >', SNR.Th))
dev.off()
# (shell) environment for the R session below
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# Like the earlier feature table, but the response is the CWT coefficient of
# the score track (NA scores set to 0) instead of the raw per-window score.
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gene <- read.delim("ecoli.gene.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
structure <- read.delim("ecoli.structure.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
nuc <- read.delim("nucleotide_counts_temp.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
rnaseq <- read.delim("ecoli.rnaseq.average.windows.bed", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
window <- read.delim("ecoli.1kb.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
score <- read.delim("window.score.avg.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# Genes per window, then one plain vector per track.
gene.bin <- gene %>%
  group_by(V1, V2, V3) %>%
  mutate(gene.count = n())
gene.count <- unique(gene.bin[, c(1:3, 14)])
gene.df <- gene.count$gene.count
structure.df <- structure[, 2]
gc.df <- nuc[, 7]
temp.df <- nuc[, 8]
rna.df <- rnaseq[, 4]
score.df <- score[, 4]
score.df[is.na(score.df)] <- 0
# CWT of every track (score included) on the same scale grid.
scales <- seq(1, 64, 3)
wCoefs.temp <- cwt(temp.df, scales = scales, wavelet = 'mexh')
wCoefs.gc <- cwt(gc.df, scales = scales, wavelet = 'mexh')
wCoefs.structure <- cwt(structure.df, scales = scales, wavelet = 'mexh')
wCoefs.rna <- cwt(rna.df, scales = scales, wavelet = 'mexh')
wCoefs.gene <- cwt(gene.df, scales = scales, wavelet = 'mexh')
wCoefs.score <- cwt(score.df, scales = scales, wavelet = 'mexh')
# Long format: Var1 = row (position in the track), Var2 = column (scale), value.
wCoefs.temp.melt <- melt(wCoefs.temp)
wCoefs.gc.melt <- melt(wCoefs.gc)
wCoefs.structure.melt <- melt(wCoefs.structure)
wCoefs.rna.melt <- melt(wCoefs.rna)
wCoefs.gene.melt <- melt(wCoefs.gene)
wCoefs.score.melt <- melt(wCoefs.score)
names(window) <- c("chr", "start", "end")
window$Var1 <- seq.int(nrow(window))
# NOTE(review): all joins match tracks to windows by row position (Var1).
# score.df, gene.df and rna.df are read from per-window summaries that
# presumably skip windows without any overlap - confirm the rows line up with
# ecoli.1kb.windows.bed.
window.score <- window %>%
  left_join(wCoefs.score.melt, by = "Var1")
window.score.temp <- window.score %>%
  left_join(wCoefs.temp.melt, by = c("Var1", "Var2"))
window.temp.gc <- window.score.temp %>%
  left_join(wCoefs.gc.melt, by = c("Var1", "Var2"))
window.temp.gc.structure <- window.temp.gc %>%
  left_join(wCoefs.structure.melt, by = c("Var1", "Var2"))
window.temp.gc.structure.rna <- window.temp.gc.structure %>%
  left_join(wCoefs.rna.melt, by = c("Var1", "Var2"))
window.temp.gc.structure.rna.gene <- window.temp.gc.structure.rna %>%
  left_join(wCoefs.gene.melt, by = c("Var1", "Var2"))
# Rename positionally: here "scale" (Var2) comes before "score" because the
# score coefficients were joined first.
names(window.temp.gc.structure.rna.gene) <- c("chr", "start", "end", "window", "scale", "score", "melting.temp", "gc.content", "structure", "fpkm", "gene.density")
df.melt <- melt(window.temp.gc.structure.rna.gene[, 4:11], id.vars = c("window", "score", "scale"))
write.table(df.melt, "window.temp.gc.structure.rna.gene.input.wavelet.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# Random forests on the wavelet-transformed input, where the response `score`
# is itself a CWT coefficient. This first fit uses the long table as is.
library(randomForest)
library(reshape2)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("window.temp.gc.structure.rna.gene.input.wavelet.txt", header=T, sep="\t", stringsAsFactors = F)
df <- na.omit(df)
# features and scales dependent
# Fix: tuneRF() takes the predictors and the response separately. df[-1] only
# dropped `window`, so `score` was still among the predictors. The predictor
# frame now excludes both `window` and `score`.
rf.predictors <- df[, setdiff(names(df), c("window", "score"))]
mtry <- tuneRF(rf.predictors, df$score, ntreeTry=500, stepFactor=1.5, improve=0.01, trace=TRUE, plot=TRUE)
# which.min() keeps best.m a single value even if two mtry values tie.
best.m <- mtry[which.min(mtry[, 2]), 1]
print(mtry)
# Output recorded before the fix (score still among the predictors):
# mtry OOBError
# 1 1 12.67119
print(best.m)
set.seed(71)
rf <-randomForest(score~.,data=df[-1], mtry=1, importance=TRUE,ntree=500)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df[-1], mtry = 1, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 1
#
# Mean of squared residuals: 315.4771
# % Var explained: 12.91
importance(rf)
# %IncMSE IncNodePurity
# scale 35.23307 6014208
# variable 47.71408 1449416
# value 45.91147 7383858
# features independent
# One row per (window, scale), one predictor column per track.
df.dcast <- df %>% dcast(window + score + scale ~ variable, value.var = "value")
df.dcast <- na.omit(df.dcast)
# Fix: tuneRF() takes the predictors and the response separately.
# df.dcast[-1] only dropped `window`, so `score` was offered as a predictor of
# itself. That explains the recorded tuning error of about 0.0005 next to a
# model MSE of about 15.3 further down. The predictor frame now excludes both
# `window` and `score`.
rf.predictors <- df.dcast[, setdiff(names(df.dcast), c("window", "score"))]
mtry <- tuneRF(rf.predictors, df.dcast$score, ntreeTry=500, stepFactor=1.5, improve=0.01, trace=TRUE, plot=TRUE)
# which.min() keeps best.m a single value even if two mtry values tie.
best.m <- mtry[which.min(mtry[, 2]), 1]
print(mtry)
# Output recorded before the fix (score still among the predictors):
# mtry OOBError
# 2 2 0.0895399828
# 3 3 0.0070506725
# 4 4 0.0010140446
# 6 6 0.0004765615
# 7 7 0.0005071095
print(best.m)
# 6
set.seed(71)
rf <- randomForest(score~.,data=df.dcast[-1], mtry=best.m, importance=TRUE,ntree=500)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df.dcast[-1], mtry = best.m, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 6
#
# Mean of squared residuals: 15.29609
# % Var explained: 95.78
importance(rf)
# %IncMSE IncNodePurity
# scale 784.4232 9225361
# fpkm 316.9117 3884911
# gc.content 126.0098 1961285
# gene.density 1008.9605 4242166
# melting.temp 399.8243 3920941
# structure 121.6250 1964286
# all independent
# NOTE(review): `scale` is not part of this dcast formula, so rows are keyed
# by (window, score) only. The earlier non-wavelet analysis fused variable and
# scale with unite() before going wide - confirm that leaving scale out here
# is intended.
df.dcast <- df %>% dcast(window + score ~ variable, value.var = "value")
df.dcast <- na.omit(df.dcast)
# Fix: tuneRF() takes the predictors and the response separately.
# df.dcast[-1] only dropped `window`, so `score` was offered as a predictor of
# itself. The predictor frame now excludes both `window` and `score`.
rf.predictors <- df.dcast[, setdiff(names(df.dcast), c("window", "score"))]
mtry <- tuneRF(rf.predictors, df.dcast$score, ntreeTry=500, stepFactor=1.5, improve=0.01, trace=TRUE, plot=TRUE)
# which.min() keeps best.m a single value even if two mtry values tie.
best.m <- mtry[which.min(mtry[, 2]), 1]
print(mtry)
print(best.m)
set.seed(71)
rf <-randomForest(score~.,data=df.dcast[-1], mtry=best.m, importance=TRUE,ntree=500)
print(rf)
importance(rf)
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
# CWT (Mexican-hat wavelet) of the per-window score track, followed by ridge/peak detection.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
score <- read.delim("window500.score.avg.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
score.na <- na.omit(score)
score.df <- score.na[, 4]   # column 4 is the signal that gets transformed
## Plot the 2-D CWT coefficients as image (It may take a while!)
# NOTE(review): the 'm/z index' label looks inherited from the MassSpecWavelet
# example; the x axis here is simply the position within score.df.
scales <- seq(1, 86, 3)
wCoefs <- cwt(score.df, scales=scales, wavelet='mexh')
pdf("ecoli.score.500.wavelet.pdf")
image(seq_along(score.df), scales, wCoefs,
      col=terrain.colors(256), axes=FALSE,
      xlab='m/z index', ylab='CWT coefficient scale',
      main='CWT Efficiency Score coefficients - 500bp windows')
box()
dev.off()
# Prepend the raw signal as column "0" before the local-maximum / ridge search
wCoefs <- cbind(as.vector(score.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
# Peak-calling settings (signal-to-noise threshold, also reused in the plot title below)
SNR.Th <- 3
nearbyPeak <- TRUE
majorPeakInfo <- identifyMajorPeaks(score.df, ridgeList, wCoefs, SNR.Th = SNR.Th, nearbyPeak=nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex
pdf("ecoli.score.500.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()
# Peaks drawn over the full signal
plotRange <- c(1, length(score.df))
pdf("ecoli.score.500.peak.pdf")
plotPeak(score.df, peakIndex, range=plotRange,
         main=paste('Identified Efficiency Score peaks with SNR >', SNR.Th))
dev.off()
# subset of windows
nrow(score.na)
# 8013
score.sub <- score.na[1:1000,]
score.df <- score.sub[,4]
# play with scales: same image for odd scales up to 10, 20, 40 and 60
for (scale.max in c(10, 20, 40, 60)) {
  scales <- seq(1, scale.max, 2)
  wCoefs <- cwt(score.df, scales=scales, wavelet='mexh')
  pdf(paste0("ecoli.score.500.subset.wavelet", scale.max, ".pdf"))
  image(seq_along(score.df), scales, wCoefs,
        col=terrain.colors(256), axes=FALSE,
        xlab='m/z index', ylab='CWT coefficient scale',
        main=paste0('CWT Score - 500bp windows (first 1000) - Scale ', scale.max, 'x2'))
  box()
  dev.off()
}
# Upper limit for this 1000-point signal: going up to 80 fails (recorded below)
scales <- seq(1, 80, 2)
wCoefs <- cwt(score.df, scales=scales, wavelet='mexh')
# Error in cwt(score.df, scales = scales, wavelet = "mexh") :
# scale 65 is too large!
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
# Stacked CWT images: score track on top, feature tracks underneath (two PDFs).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")

# --- score track (scales 1..59, step 2) ---
score <- read.delim("window500.score.avg.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
score.na <- na.omit(score)
score.df <- score.na[,4]
score.scales <- seq(1, 60, 2)
wCoefs <- cwt(score.df, scales=score.scales, wavelet='mexh')

# --- feature tracks ---
gene <- read.delim("ecoli.gene.windows500.bed", header=FALSE, sep="\t", stringsAsFactors=FALSE)
structure <- read.delim("500bp.protein.structure.fa", header=TRUE, sep="\t", stringsAsFactors=FALSE)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=TRUE, sep="\t", stringsAsFactors=FALSE)
window <- read.delim("ecoli.500bp.windows.bed", header=FALSE, sep="\t", stringsAsFactors=FALSE)
# gene.count = number of rows sharing the same first three columns
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
gene.df <- gene.count$gene.count
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq[,4]
scales <- seq(1, 100, 2)
wCoefs.temp <- cwt(temp.df, scales=scales, wavelet='mexh')
wCoefs.gc <- cwt(gc.df, scales=scales, wavelet='mexh')
wCoefs.structure <- cwt(structure.df, scales=scales, wavelet='mexh')
wCoefs.rna <- cwt(rna.df, scales=scales, wavelet='mexh')
wCoefs.gene <- cwt(gene.df, scales=scales, wavelet='mexh')
# Error in plot.new() : figure margins too large
# par("mar")
# par(mar=c(1,1,1,1))

# Panel 1: score, melting temperature, GC content, secondary structure.
# The x expressions are kept as 1:length(...) on purpose: image() uses the
# deparsed expression as the x label when no xlab is supplied.
pdf(file='wavelets.score.features1.pdf')
par(mfrow=c(4,1))
image(1:length(score.df), score.scales, wCoefs, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='CWT Efficiency Score coefficients - 500bp windows')
image(1:length(temp.df), scales, wCoefs.temp, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='CWT Melting Temp coefficients')
image(1:length(gc.df), scales, wCoefs.gc, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='CWT GC Content coefficients')
image(1:length(structure.df), scales, wCoefs.structure, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='CWT Secondary Structure coefficients')
dev.off()

# Panel 2: score, RNA-seq, gene density
pdf(file='wavelets.score.features2.pdf')
par(mfrow=c(4,1))
image(1:length(score.df), score.scales, wCoefs, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='CWT Efficiency Score coefficients - 500bp windows')
image(1:length(rna.df), scales, wCoefs.rna, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='CWT RNA-seq FPKM coefficients')
image(1:length(gene.df), scales, wCoefs.gene, col=terrain.colors(256), axes=FALSE, xlab='m/z index', ylab='CWT coefficient scale', main='CWT Gene Density coefficients')
dev.off()
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
# Build the wide modelling table: one row per window, one column per
# feature x wavelet-scale, plus the window score. Written to
# ecoli.features.500bp.wavelet.dcast.txt at the end.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Raw per-window inputs
gene <- read.delim("ecoli.gene.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.delim("500bp.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.delim("ecoli.500bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
# gene.count = number of rows in `gene` sharing the same V1/V2/V3;
# column 14 is the gene.count column appended by mutate()
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
# One numeric vector per feature; columns are picked by position
# NOTE(review): presumably nuc[,7] is GC content and nuc[,8] melting temperature
# (matches the names assigned further down) - confirm against the file header
gene.df <- gene.count$gene.count
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq[,4]
## try using a larger scale?
# scales <- seq(1, 64, 3)
scales <- seq(1, 100, 2)
# Mexican-hat CWT of every feature vector at the same set of scales
wCoefs.temp <- cwt(temp.df, scales=scales, wavelet='mexh')
wCoefs.gc <- cwt(gc.df, scales=scales, wavelet='mexh')
wCoefs.structure <- cwt(structure.df, scales=scales, wavelet='mexh')
wCoefs.rna <- cwt(rna.df, scales=scales, wavelet='mexh')
wCoefs.gene <- cwt(gene.df, scales=scales, wavelet='mexh')
# Matrix -> long format; the joins below key on the resulting Var1 / Var2 columns
wCoefs.temp.melt <- melt(wCoefs.temp)
wCoefs.gc.melt <- melt(wCoefs.gc)
wCoefs.structure.melt <- melt(wCoefs.structure)
wCoefs.rna.melt <- melt(wCoefs.rna)
wCoefs.gene.melt <- melt(wCoefs.gene)
colnames(window) <- c("chr", "start", "end")
# Var1 = row number of the window table, used as the join key to the coefficient rows
# NOTE(review): this assumes element i of every feature vector belongs to row i of
# `window` (same length, same order) - verify, especially for gene.df, which is
# built from the distinct windows present in the gene file
window$Var1 <- seq.int(nrow(window))
colnames(score) <- c("chr", "start", "end", "score")
window.score <- left_join(window, score, by=c("chr", "start", "end"))
# First join fans each window out to one row per scale (Var2); later joins match on both keys
window.score.temp <- left_join(window.score, wCoefs.temp.melt, by="Var1")
window.temp.gc <- left_join(window.score.temp, wCoefs.gc.melt, by=c("Var1", "Var2"))
window.temp.gc.structure <- left_join(window.temp.gc, wCoefs.structure.melt, by=c("Var1", "Var2"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, wCoefs.rna.melt, by=c("Var1", "Var2"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, wCoefs.gene.melt, by=c("Var1", "Var2"))
# Positional rename: relies on the join order above (temp, gc, structure, rna, gene)
colnames(window.temp.gc.structure.rna.gene) <- c("chr", "start", "end", "window", "score", "scale", "melting.temp", "gc.content", "structure", "fpkm", "gene.density")
# Long format over the five feature columns, keeping window/score/scale as ids
df.melt <- melt(window.temp.gc.structure.rna.gene[,4:11], id=c("window", "score", "scale"))
df <- na.omit(df.melt)
library(tidyr)
# Column names become <feature><scale>, e.g. "gc.content7"
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
write.table(df.dcast, "ecoli.features.500bp.wavelet.dcast.txt", quote=F, row.names=F, sep="\t")
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
library(randomForest)
# Random forest on the full wavelet feature table; every predictor is a split candidate.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.features.500bp.wavelet.dcast.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
# use largest possible mtry including all features... ncol(df) - 2
# (all columns minus `window` and `score`); computed rather than hard-coded so it
# stays valid when the table is regenerated with a different set of scales
n.pred <- ncol(df) - 2
set.seed(131)
rf <- randomForest(score ~ ., data=df[-1], mtry=n.pred, importance=TRUE, ntree=500)
print(rf)
# Recorded with the 250-column table and a literal mtry=250:
# Call:
# randomForest(formula = score ~ ., data = df[-1], mtry = 250, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 250
#
# Mean of squared residuals: 37.09652
# % Var explained: 7.93
imp <- data.frame(importance(rf))
# Sort on one named column. order(imp) on the whole data frame ranks all cells of
# both columns together, producing out-of-range row indices (NA rows) that then
# had to be stripped with na.omit().
imp.order <- imp[order(imp$X.IncMSE), ]
tail(imp.order)
pdf("rf.all.500bp.importance.pdf")
varImpPlot(rf)
dev.off()
# Residual (observed - OOB predicted) against observed score
pdf("rf.all.500bp.residual.pdf")
plot(rf$y, rf$y - rf$predicted)
dev.off()
pdf("rf.all.500bp.predicted.pdf")
plot(rf$y, rf$predicted)
dev.off()
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
# Can the wavelet transform of the 500bp-window score track predict the window score itself?
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.500bp.windows.bed", header=FALSE, sep="\t", stringsAsFactors=FALSE)
score <- read.delim("window500.score.avg.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
colnames(window) <- c("chr", "start", "end")
colnames(score) <- c("chr", "start", "end", "score")
window$window <- seq_len(nrow(window))      # genome-order window id
score.na <- na.omit(score)
score.df <- score.na$score
scales <- seq(1, 86, 3)
wCoefs <- cwt(score.df, scales=scales, wavelet='mexh')
wCoefs.melt <- melt(wCoefs)
colnames(wCoefs.melt) <- c("Var1", "scale", "scorewave")
# Row i of wCoefs was computed from row i of score.na (the NA-free table), so the
# join key has to be numbered on score.na. Numbering the rows of `window` instead
# pairs each window with the coefficients of a different window whenever the two
# tables do not have identical rows.
score.na$Var1 <- seq_len(nrow(score.na))
window.score <- left_join(window, score.na, by=c("chr", "start", "end"))
window.score.wave <- left_join(window.score, wCoefs.melt, by="Var1")
df.melt <- melt(window.score.wave[, c("window", "score", "scale", "scorewave")], id=c("window", "score", "scale"))
df <- na.omit(df.melt)                      # drops windows that have no score
library(tidyr)
# Column names become scorewave<scale>
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
library(randomForest)
set.seed(131)
rf <- randomForest(score ~ ., data=df.dcast[-1], mtry=ncol(df.dcast)-2, importance=TRUE, ntree=500)
print(rf)
# Recorded BEFORE the join-key fix above (coefficients keyed on the row number of `window`):
# Call:
# randomForest(formula = score ~ ., data = df.dcast[-1], mtry = ncol(df.dcast) - 2, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 29
#
# Mean of squared residuals: 38.85093
# % Var explained: 6.04
######### the wavelet transform of the average score in a 500bp window only explains 6% of the variance in the actual score for that window???
# (possibly a consequence of the window/coefficient mismatch described above - re-run to check)
imp <- data.frame(importance(rf))
# Sort on a single named column; order(imp) on the whole data frame only worked
# through the NA rows it generated being removed afterwards
imp.order <- imp[order(imp$X.IncMSE), ]
tail(imp.order)
# X.IncMSE IncNodePurity
# scorewave31 28.13412 9020.583
# scorewave13 29.99599 14053.696
# scorewave19 30.99481 13536.830
# scorewave10 33.43378 15471.722
# scorewave4 35.94473 21140.822
# scorewave7 39.08630 17281.466
##### Try with actual score (not 500bp average)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
score <- read.delim("sgRNA.coord.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.coord <- score[,c(1:3,8)]
score.na <- na.omit(score.coord)
score.df <- score.na[,4]
scales <- seq(1, 86, 3)
wCoefs <- cwt(score.df, scales=scales, wavelet='mexh')
wCoefs.melt <- melt(wCoefs)
# The CWT was run on score.na, so its row numbers index score.na. Numbering
# score.coord instead shifts every coefficient after the first dropped NA row.
score.na$Var1 <- seq_len(nrow(score.na))
score.wave <- left_join(score.na, wCoefs.melt, by="Var1")
# Positional rename of: chr, start, end, cut.score, Var1, Var2, value
colnames(score.wave) <- c("chr", "start", "end", "score", "window", "scale", "scorewave")
df.melt <- melt(score.wave[,4:7], id=c("window", "score", "scale"))
df <- na.omit(df.melt)
library(tidyr)
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
library(randomForest)
set.seed(131)
rf <- randomForest(score ~ ., data=df.dcast[-1], mtry=ncol(df.dcast)-2, importance=TRUE, ntree=500)
print(rf)
# Recorded with the join key numbered on score.coord:
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 29
#
# Mean of squared residuals: 113.5857
# % Var explained: -2.04
######## negative % variance explained????? <-- "A negative R^2 (OOB) is a clear warning sign that your model might be overfitting noise."
imp <- data.frame(importance(rf))
# Sort on a single named column instead of order() over the whole data frame
imp.order <- imp[order(imp$X.IncMSE), ]
tail(imp.order)
# X.IncMSE IncNodePurity
# scorewave22 31.37119 114903.6
# scorewave16 33.21437 137896.0
# scorewave13 33.88340 158200.7
# scorewave19 34.63017 124402.2
# scorewave85 35.30701 115063.3
# scorewave7 36.79408 205576.1
library(ranger)
# Iterative random forest built on ranger: each round re-fits the forest using
# the previous round's (corrected impurity) importances as split-selection weights.
#
# Arguments:
#   xmat           predictors (columns = features); bound to y via cbind()
#   y              response, one value per row of xmat
#   ntree          trees per forest
#   iter           maximum number of rounds
#   classification passed through to ranger(classification = )
#   threads        passed through to ranger(num.threads = )
#   alwayssplits   passed through to ranger(always.split.variables = )
#   saveall        TRUE: return a list with the forest of every round;
#                  FALSE: return only the forest of the last round that ran
# Returns: see `saveall`; an empty list when iter < 1.
# Note: the progress text says "SNPs", but any feature columns work.
iRF <- function(xmat, y, ntree=500, iter=5, classification=FALSE, threads=1, alwayssplits=NULL, saveall=TRUE)
{
  tmp <- cbind(xmat, Y = y)
  n.feat <- ncol(xmat)
  wt <- rep(1/n.feat, n.feat) # start with equal weight for every feature
  rfs <- list()
  rf <- NULL
  for (i in seq_len(iter))     # seq_len: iter = 0 runs zero rounds (1:0 would run two)
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # half of the features that still carry weight; whole number, at least 1
    mtry <- max(1, floor(0.5*sum(wt>0)))
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = TRUE,
                         always.split.variables = alwayssplits)
    # scale importances so their absolute values sum to 1; if every importance
    # is zero the division would give NaN weights, so fall back to all-zero
    imp.total <- sum(abs(rf$variable.importance))
    if (is.finite(imp.total) && imp.total > 0) {
      wt <- rf$variable.importance / imp.total
    } else {
      wt <- rep(0, n.feat)
    }
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if (!classification) cat("r^2: ",rf$r.squared,"\n")
    if (classification) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if (saveall) rfs[[i]] <- rf
    # stop once fewer than max(1% of the features, 10) keep a positive weight
    if (sum(wt>0) < max(0.01*(n.feat-1), 10)) break
  }
  # Previously the single forest was only handed back when the early stop fired;
  # a run that used all `iter` rounds with saveall = FALSE returned an empty list.
  if (!saveall && !is.null(rf)) rfs <- rf
  return(rfs)
}
iRF(df.dcast[,3:ncol(df.dcast)], df.dcast$score)
# mtry: 125
# prediction error: 36.86407
# r^2: 0.08523164
# cor(y,yhat): 0.2930709
# SNPs with importance > 0: 220
# ranger reference manual: https://cran.r-project.org/web/packages/ranger/ranger.pdf
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.features.500bp.wavelet.dcast.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
## Split in training and test data
# Seeded like the randomForest runs elsewhere in this file, so the 2/3 - 1/3
# split (and the forests grown from it) can be reproduced
set.seed(131)
train.idx <- sample(nrow(df), floor(2/3 * nrow(df)))
df.train <- df[train.idx, ]
df.test <- df[-train.idx, ]
## Run case-specific RF
# [-1] drops the `window` id column so it is not used as a predictor
csrf(score ~ ., training_data = df.train[-1], test_data = df.test[-1], params1 = list(num.trees = 1000, mtry = 110), params2 = list(num.trees = 1000))
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(tidypredict)
library(ranger)
# ranger forest on a 2/3 training split with permutation importance, features ranked ascending.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.features.500bp.wavelet.dcast.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
# Seeded so the split and the forest can be reproduced
set.seed(131)
train.idx <- sample(nrow(df), floor(2/3 * nrow(df)))
df.train <- df[train.idx, ]
df.test <- df[-train.idx, ]
# mtry = every predictor (all columns except `window` and `score`); this equals
# the previously hard-coded 110 for the table recorded below
model <- ranger(score ~ ., data = df.train[-1], num.trees = 1000, mtry = ncol(df.train) - 2, importance = "permutation")
# Type: Regression
# Number of trees: 1000
# Sample size: 3406
# Number of independent variables: 110
# Mtry: 110
# Target node size: 5
# Variable importance mode: permutation
# Splitrule: variance
# OOB prediction error (MSE): 38.23165
# R squared (OOB): 0.07509708
imp <- data.frame(importance(model))
imp$feature <- rownames(imp)
colnames(imp) <- c("importance", "feature")
# Sort numerically on the importance column. order(imp) on the whole data frame
# mixes the numeric and character columns, so values are compared as text
# (wrong for negative values, values >= 10 or scientific notation) and the
# surplus indices create NA rows.
imp.order <- imp[order(imp$importance), ]
imp.order$feature <- as.factor(imp.order$feature)
na.omit(imp.order)
# importance feature
# gene.density1 0.007121516 gene.density1
# gc.content43 0.260337623 gc.content43
# melting.temp43 0.262132209 melting.temp43
# gc.content40 0.285967363 gc.content40
# structure1 0.304611126 structure1
# melting.temp46 0.305165152 melting.temp46
# gc.content55 0.309757214 gc.content55
# gc.content37 0.347891040 gc.content37
# melting.temp55 0.353581030 melting.temp55
# gc.content58 0.354913667 gc.content58
# melting.temp52 0.383092676 melting.temp52
# melting.temp58 0.384150723 melting.temp58
# melting.temp31 0.389207101 melting.temp31
# melting.temp37 0.392552001 melting.temp37
# gc.content34 0.397982606 gc.content34
# gc.content46 0.408280867 gc.content46
# melting.temp34 0.408406603 melting.temp34
# melting.temp40 0.422559009 melting.temp40
# fpkm49 0.433683983 fpkm49
# gc.content61 0.437677876 gc.content61
# melting.temp61 0.439671788 melting.temp61
# fpkm52 0.454066068 fpkm52
# fpkm1 0.464554054 fpkm1
# gc.content28 0.472088644 gc.content28
# melting.temp28 0.484934585 melting.temp28
# gc.content31 0.490093648 gc.content31
# structure46 0.521850495 structure46
# gc.content52 0.523583403 gc.content52
# gene.density58 0.529030093 gene.density58
# gc.content19 0.535231905 gc.content19
# fpkm58 0.537376419 fpkm58
# gc.content49 0.538376583 gc.content49
# melting.temp19 0.549214010 melting.temp19
# melting.temp49 0.556124084 melting.temp49
# gc.content64 0.557424741 gc.content64
# melting.temp16 0.589457207 melting.temp16
# structure52 0.591994885 structure52
# structure40 0.596812110 structure40
# structure37 0.608181099 structure37
# gc.content13 0.621088221 gc.content13
# fpkm43 0.623073172 fpkm43
# structure49 0.625214923 structure49
# gene.density4 0.628599329 gene.density4
# structure61 0.631511179 structure61
# gc.content16 0.635238261 gc.content16
# gene.density61 0.643850260 gene.density61
# structure4 0.644563309 structure4
# fpkm4 0.646555289 fpkm4
# fpkm37 0.648948837 fpkm37
# melting.temp13 0.656447705 melting.temp13
# fpkm46 0.659996492 fpkm46
# fpkm61 0.669965085 fpkm61
# fpkm40 0.670644689 fpkm40
# structure31 0.672333873 structure31
# structure58 0.672661420 structure58
# melting.temp10 0.673338319 melting.temp10
# structure55 0.678603606 structure55
# structure28 0.682957088 structure28
# gc.content10 0.685086473 gc.content10
# melting.temp64 0.694412176 melting.temp64
# gene.density64 0.697673580 gene.density64
# fpkm7 0.706199750 fpkm7
# gene.density55 0.709007505 gene.density55
# structure34 0.712423305 structure34
# structure43 0.736719022 structure43
# fpkm55 0.749914227 fpkm55
# fpkm10 0.753124273 fpkm10
# gc.content25 0.763074463 gc.content25
# fpkm16 0.763483498 fpkm16
# structure16 0.765923081 structure16
# melting.temp1 0.779940063 melting.temp1
# melting.temp25 0.826445489 melting.temp25
# structure10 0.828485300 structure10
# fpkm22 0.837178271 fpkm22
# gc.content1 0.843727611 gc.content1
# fpkm13 0.854177208 fpkm13
# fpkm64 0.854893461 fpkm64
# gene.density52 0.867197933 gene.density52
# gc.content22 0.872064414 gc.content22
# fpkm19 0.887226328 fpkm19
# structure25 0.904154806 structure25
# structure19 0.909409549 structure19
# gene.density7 0.921766554 gene.density7
# gene.density16 0.923790964 gene.density16
# gene.density19 0.930372493 gene.density19
# gene.density22 0.941061911 gene.density22
# fpkm28 0.943849459 fpkm28
# melting.temp22 0.951304346 melting.temp22
# structure13 0.991907409 structure13
# gene.density13 1.023104317 gene.density13
# structure64 1.054270307 structure64
# structure22 1.062568384 structure22
# gene.density25 1.063675724 gene.density25
# gene.density49 1.096541591 gene.density49
# fpkm25 1.112054828 fpkm25
# fpkm31 1.139024811 fpkm31
# gene.density10 1.150614351 gene.density10
# fpkm34 1.291055139 fpkm34
# structure7 1.343665378 structure7
# melting.temp4 1.516449419 melting.temp4
# gene.density28 1.523513233 gene.density28
# gc.content4 1.607140751 gc.content4
# gene.density46 1.618872935 gene.density46
# gene.density43 1.803703934 gene.density43
# gene.density37 1.985888856 gene.density37
# melting.temp7 2.066527732 melting.temp7
# gc.content7 2.075917496 gc.content7
# gene.density40 2.100699406 gene.density40
# gene.density31 2.281914372 gene.density31
# gene.density34 3.071857385 gene.density34
library(ggplot2)
# Dot plot of ranger importance per feature, written to ranger.imp.pdf.
pdf("ranger.imp.pdf")
# NOTE(review): scale_y_discrete() is applied to the numeric `importance`
# aesthetic - confirm the importance axis renders with usable tick marks.
imp.plot <- ggplot(na.omit(imp.order), aes(x=feature, y=importance)) +
  geom_point(col="tomato2", size=1) +
  geom_segment(aes(x=feature, xend=feature, y=min(importance), yend=max(importance)), linetype="dashed", size=0.1) +
  scale_y_discrete(drop=FALSE) +
  labs(title="Dot Plot of Ranger RF", subtitle="Feature Vs RF Importance") +
  coord_flip() +
  theme(text = element_text(size = 5))
# Explicit print(): a ggplot object is only drawn when printed, which does not
# happen automatically when this file is run through source()
print(imp.plot)
dev.off()
tree <- treeInfo(model)
# nodeID The nodeID, 0-indexed.
# leftChild ID of the left child node, 0-indexed.
# rightChild ID of the right child node, 0-indexed.
# splitvarID ID of the splitting variable, 0-indexed. Caution, the variable order changes if the formula interface is used.
# splitvarName Name of the splitting variable.
# splitval The splitting value. For numeric or ordinal variables, all values smaller or equal go to the left, larger values to the right. For unordered factor variables see above.
# terminal Logical, TRUE for terminal nodes.
# prediction One column with the predicted class (factor) for classification and the predicted numerical value for regression.
head(tree)
# nodeID leftChild rightChild splitvarID splitvarName splitval terminal
# 1 0 1 2 87 melting.temp7 -4.50473699 FALSE
# 2 1 3 4 57 gene.density43 0.97487778 FALSE
# 3 2 5 6 88 structure1 0.06615566 FALSE
# 4 3 7 8 66 melting.temp1 -2.13392151 FALSE
# 5 4 9 10 77 melting.temp4 -3.11893738 FALSE
# 6 5 11 12 63 gene.density61 -1.30484916 FALSE
# prediction
# 1 NA
# 2 NA
# 3 NA
# 4 NA
# 5 NA
# 6 NA
# Predict the held-out windows and plot observed vs predicted score.
pred.df <- predict(model, data = df.test)
df.tab <- data.frame(score = df.test$score, pred = pred.df$predictions)
library(ggplot2)
pdf("ranger.pred.pdf")
# Explicit print(): without it the PDF stays empty when the file is run through source()
print(ggplot(df.tab) + geom_point(aes(x=score, y=pred)) + theme_classic())
dev.off()
## Regression forest with ranger defaults
## (this step was headed "Classification forest"; the recorded output reports Type: Regression)
# [-1] drops the `window` id column, as every other fit in this file does. The
# outputs recorded below still come from runs that included it (111 variables).
ranger(score ~ ., data = df[-1])
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 111
# Mtry: 10
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 37.46318
# R squared (OOB): 0.07036481
# Seeded so the split and the forest can be reproduced
set.seed(131)
train.idx <- sample(nrow(df), floor(2/3 * nrow(df)))
df.train <- df[train.idx, ]
df.test <- df[-train.idx, ]
rg.df <- ranger(score ~ ., data = df.train[-1])
# Type: Regression
# Number of trees: 500
# Sample size: 3406
# Number of independent variables: 111
# Mtry: 10
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 38.38306
# R squared (OOB): 0.07143403
pred.df <- predict(rg.df, data = df.test)
# Type: Regression
# Sample size: 1703
# Number of independent variables: 111
df.tab <- data.frame(score = df.test$score, pred = pred.df$predictions)
library(ggplot2)
pdf("ranger.test.pred.pdf")
# Explicit print(): without it the PDF stays empty when the file is run through source()
print(ggplot(df.tab) + geom_point(aes(x=score, y=pred)) + theme_classic())
dev.off()
# cor.test() runs a single method; given the three-element vector it used the
# first entry, so Pearson is now named explicitly (matches the output below)
cor.test(df.tab$score, df.tab$pred, method="pearson")
# Pearson's product-moment correlation
#
# data: df.tab$score and df.tab$pred
# t = 10.903, df = 1701, p-value < 2.2e-16
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# 0.2106409 0.2994489
# sample estimates:
# cor
# 0.255584
## Quantile regression forest
# Quantile regression forest: predict the 10%, 50% and 90% quantiles for the test windows.
rf <- ranger(score ~ ., df.train[-1], quantreg = TRUE)
# Type: Regression
# Number of trees: 500
# Sample size: 3406
# Number of independent variables: 110
# Mtry: 10
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 38.58052
# R squared (OOB): 0.0666571
pred <- predict(rf, df.test[-1], type = "quantiles", quantiles = c(0.1, 0.5, 0.9))
pred$predictions
# data.frame() rewrites the quantile column names into the quantile..0.x form used below
df.tab <- data.frame(score = df.test$score, pred$predictions)
# One scatter per quantile. print() is explicit because a ggplot object is only
# drawn when printed, which does not happen automatically under source().
pdf("ranger.test2.quant.1.pdf")
print(ggplot(df.tab) + geom_point(aes(x=score, y=quantile..0.1)) + theme_classic())
dev.off()
pdf("ranger.test2.quant.5.pdf")
print(ggplot(df.tab) + geom_point(aes(x=score, y=quantile..0.5)) + theme_classic())
dev.off()
pdf("ranger.test2.quant.9.pdf")
print(ggplot(df.tab) + geom_point(aes(x=score, y=quantile..0.9)) + theme_classic())
dev.off()
# Only one method is ever used by cor.test(); Pearson (the first entry of the
# former vector, and what the recorded output shows) is named explicitly
cor.test(df.tab$score, df.tab$quantile..0.1, method="pearson")
# Pearson's product-moment correlation
#
# data: df.tab$score and df.tab$quantile..0.1
# t = 8.3358, df = 1701, p-value < 2.2e-16
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# 0.1520390 0.2433193
# sample estimates:
# cor
# 0.1981087
cor.test(df.tab$score, df.tab$quantile..0.5, method=c("pearson", "kendall", "spearman"))
# Pearson's product-moment correlation
#
# data: df.tab$score and df.tab$quantile..0.5
# t = 9.5903, df = 1701, p-value < 2.2e-16
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# 0.1809337 0.2710716
# sample estimates:
# cor
# 0.2264876
cor.test(df.tab$score, df.tab$quantile..0.9, method=c("pearson", "kendall", "spearman"))
# Pearson's product-moment correlation
#
# data: df.tab$score and df.tab$quantile..0.9
# t = 6.5063, df = 1701, p-value = 1.01e-10
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# 0.1091355 0.2018344
# sample estimates:
# cor
# 0.1558281
## Quantile regression forest with user-specified function
# Instead of fixed quantiles, `what` draws 10 values (with replacement) per test row.
rf <- ranger(formula = score ~ ., data = df.train[-1], quantreg = TRUE)
draw10 <- function(x) sample(x, 10, replace = TRUE)
pred <- predict(rf, data = df.test[-1], type = "quantiles", what = draw10)
pred$predictions
df.tab <- data.frame(score = df.test$score, pred$predictions)
# Pearson is what the three-element method vector resolved to (its first entry)
cor.test(df.tab$score, df.tab[,2], method="pearson")
# data: df.tab$score and df.tab[, 2]
# t = 3.7462, df = 1701, p-value = 0.0001855
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# 0.04314433 0.13736936
# sample estimates:
# cor
# 0.09045928
cor.test(df.tab$score, df.tab[,3], method=c("pearson", "kendall", "spearman"))
# t = 4.1876, df = 1701, p-value = 2.964e-05
# cor=0.1010146
# Pearson correlation of the observed score with each further sampled column.
# NOTE(review): with 10 draws per row df.tab presumably has columns 2..11, so the
# last draw is not tested here - confirm.
cor.test(df.tab$score, df.tab[,4], method="pearson")
cor.test(df.tab$score, df.tab[,5], method="pearson")
cor.test(df.tab$score, df.tab[,6], method="pearson")
cor.test(df.tab$score, df.tab[,7], method="pearson")
cor.test(df.tab$score, df.tab[,8], method="pearson")
cor.test(df.tab$score, df.tab[,9], method="pearson")
cor.test(df.tab$score, df.tab[,10], method="pearson")
# t = 3.8904, df = 1701, p-value = 0.0001039
# cor=0.09391143
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.features.500bp.wavelet.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
library(ranger)
# Iterative random forest built on ranger: each round re-fits the forest using
# the previous round's (corrected impurity) importances as split-selection weights.
#
# Arguments:
#   xmat           predictors (columns = features); bound to y via cbind()
#   y              response, one value per row of xmat
#   ntree          trees per forest
#   iter           maximum number of rounds
#   classification passed through to ranger(classification = )
#   threads        passed through to ranger(num.threads = )
#   alwayssplits   passed through to ranger(always.split.variables = )
#   saveall        TRUE: return a list with the forest of every round;
#                  FALSE: return only the forest of the last round that ran
# Returns: see `saveall`; an empty list when iter < 1.
# Note: the progress text says "SNPs", but any feature columns work.
iRF <- function(xmat, y, ntree=500, iter=5, classification=FALSE, threads=1, alwayssplits=NULL, saveall=TRUE)
{
  tmp <- cbind(xmat, Y = y)
  n.feat <- ncol(xmat)
  wt <- rep(1/n.feat, n.feat) # start with equal weight for every feature
  rfs <- list()
  rf <- NULL
  for (i in seq_len(iter))     # seq_len: iter = 0 runs zero rounds (1:0 would run two)
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # half of the features that still carry weight; whole number, at least 1
    mtry <- max(1, floor(0.5*sum(wt>0)))
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = TRUE,
                         always.split.variables = alwayssplits)
    # scale importances so their absolute values sum to 1; if every importance
    # is zero the division would give NaN weights, so fall back to all-zero
    imp.total <- sum(abs(rf$variable.importance))
    if (is.finite(imp.total) && imp.total > 0) {
      wt <- rf$variable.importance / imp.total
    } else {
      wt <- rep(0, n.feat)
    }
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if (!classification) cat("r^2: ",rf$r.squared,"\n")
    if (classification) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if (saveall) rfs[[i]] <- rf
    # stop once fewer than max(1% of the features, 10) keep a positive weight
    if (sum(wt>0) < max(0.01*(n.feat-1), 10)) break
  }
  # Previously the single forest was only handed back when the early stop fired;
  # a run that used all `iter` rounds with saveall = FALSE returned an empty list.
  if (!saveall && !is.null(rf)) rfs <- rf
  return(rfs)
}
iRF(df[,3:ncol(df)], df$score)
# Shell: environment setup. The `R` launch itself is left commented out below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Input tables. `gene` and `window` are read without a header row (columns V1, V2, ...);
# the other four are read with one.
gene <- read.delim("ecoli.gene.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.delim("500bp.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.delim("ecoli.500bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
# Count the rows of `gene` sharing each (V1, V2, V3) combination, then keep one row per combination.
# NOTE(review): column 14 is taken to be the new gene.count, i.e. `gene` is assumed to have 13 columns - TODO confirm.
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
# One vector per feature, selected by column position. The rename further down labels them
# gene.density, structure, gc.content, melting.temp and fpkm respectively.
# NOTE(review): the positions depend on the input file layouts - TODO confirm.
gene.df <- gene.count$gene.count
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq[,4]
# Continuous wavelet transform of each feature vector with the 'mexh' wavelet at scales 1, 3, ..., 99.
# (Nothing is plotted here; the earlier "plot as image" note did not match the code.)
scales <- seq(1, 100, 2)
wCoefs.temp <- cwt(temp.df, scales=scales, wavelet='mexh')
wCoefs.gc <- cwt(gc.df, scales=scales, wavelet='mexh')
wCoefs.structure <- cwt(structure.df, scales=scales, wavelet='mexh')
wCoefs.rna <- cwt(rna.df, scales=scales, wavelet='mexh')
wCoefs.gene <- cwt(gene.df, scales=scales, wavelet='mexh')
# Long format: the joins below key on Var1 (matched to the window row number) and Var2
# (renamed "scale" further down).
wCoefs.temp.melt <- melt(wCoefs.temp)
wCoefs.gc.melt <- melt(wCoefs.gc)
wCoefs.structure.melt <- melt(wCoefs.structure)
wCoefs.rna.melt <- melt(wCoefs.rna)
wCoefs.gene.melt <- melt(wCoefs.gene)
colnames(window) <- c("chr", "start", "end")
# Row number of each window; used as the join key against the melted coefficients.
window$Var1 <- seq.int(nrow(window))
colnames(score) <- c("chr", "start", "end", "score")
# Attach the score to every window (windows without a score get NA), then add one feature at a time.
window.score <- left_join(window, score, by=c("chr", "start", "end"))
window.score.temp <- left_join(window.score, wCoefs.temp.melt, by="Var1")
window.temp.gc <- left_join(window.score.temp, wCoefs.gc.melt, by=c("Var1", "Var2"))
window.temp.gc.structure <- left_join(window.temp.gc, wCoefs.structure.melt, by=c("Var1", "Var2"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, wCoefs.rna.melt, by=c("Var1", "Var2"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, wCoefs.gene.melt, by=c("Var1", "Var2"))
# Rename positionally: Var1 -> window, Var2 -> scale, then the five value columns in join order.
colnames(window.temp.gc.structure.rna.gene) <- c("chr", "start", "end", "window", "score", "scale", "melting.temp", "gc.content", "structure", "fpkm", "gene.density")
# Long table with one row per window/score/scale/feature; rows containing NA are dropped.
df.melt <- melt(window.temp.gc.structure.rna.gene[,4:11], id=c("window", "score", "scale"))
df <- na.omit(df.melt)
library(tidyr)
# Build a "<feature><scale>" label (no separator, e.g. gc.content3) and spread to one column per label.
# df.id and df.dcast are assigned again further down, after the raw values are appended.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
# Add the untransformed feature values as pseudo-scale 0: every table gets a row index (Var1)
# and Var2 = 0 so the same join keys as for the wavelet coefficients can be used.
# NOTE(review): Var1 is assigned independently per table, so the joins assume each table has
# one row per window in the same order as `window` - TODO confirm (gene.count has one row per
# distinct V1/V2/V3 combination present in `gene`).
structure$Var1 <- seq.int(nrow(structure))
gene.count$Var1 <- seq.int(nrow(gene.count))
nuc$Var1 <- seq.int(nrow(nuc))
rnaseq$Var1 <- seq.int(nrow(rnaseq))
structure$Var2 <- 0
gene.count$Var2 <- 0
nuc$Var2 <- 0
rnaseq$Var2 <- 0
# Columns are picked by position so that each slice is (value, Var1, Var2).
# NOTE(review): e.g. nuc[,8:10] presumes nuc had 8 columns before Var1/Var2 were added - TODO confirm.
window.score.temp <- left_join(window.score, nuc[,8:10], by="Var1")
window.temp.gc <- left_join(window.score.temp, nuc[,c(7,9:10)], by=c("Var1", "Var2"))
window.temp.gc.gene <- left_join(window.temp.gc, gene.count[,c(4:6)], by=c("Var1", "Var2"))
window.temp.gc.gene.rna <- left_join(window.temp.gc.gene, rnaseq[,c(4:6)], by=c("Var1", "Var2"))
window.temp.gc.gene.rna.structure <- left_join(window.temp.gc.gene.rna, structure[,c(2,22,23)], by=c("Var1", "Var2"))
# Positional rename; note the column order differs from the wavelet table (melting.temp precedes scale).
colnames(window.temp.gc.gene.rna.structure) <- c("chr", "start", "end", "window", "score", "melting.temp", "scale", "gc.content", "gene.density", "fpkm", "structure")
df2.melt <- melt(window.temp.gc.gene.rna.structure[,4:11], id=c("window", "score", "scale"))
df2 <- na.omit(df2.melt)
# Stack the wavelet rows (df, built earlier in this session) on top of the raw scale-0 rows.
df3 <- rbind(df, df2)
library(tidyr)
# Same "<feature><scale>" labelling and spreading as before, now including the "...0" raw columns.
df.id <- df3 %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
write.table(df.dcast, "ecoli.features.500bp.wavelet.raw.dcast.txt", quote=F, row.names=F, sep="\t")
###### regression
# Random-forest regression of score on every column of the wavelet + raw feature matrix.
library(randomForest)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.features.500bp.wavelet.raw.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
set.seed(131)
# df[-1] drops the first column (the window id) so it is not used as a predictor.
rf <- randomForest(score ~ ., data=df[-1], mtry=255, importance=TRUE, ntree=500)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df[-1], mtry = 255, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 255
#
# Mean of squared residuals: 36.99016
# % Var explained: 8.19
imp <- data.frame(importance(rf))
# Rank features by X.IncMSE (ascending, so tail() shows the most important ones).
# The previous imp[order(imp),] ordered the whole data frame at once: the permutation
# ran over both columns, produced row indices past nrow(imp) (NA rows that na.omit()
# had to strip), and is not a sort on any single importance measure.
imp.order <- imp[order(imp$X.IncMSE), ]
tail(na.omit(imp.order))
# X.IncMSE IncNodePurity
# gene.density7 13.58831 2355.832
# structure13 13.81274 1311.481
# gene.density5 14.88046 3149.443
# melting.temp0 16.11066 6414.466
# structure99 17.73517 1975.141
# gc.content0 19.37491 7285.205
# Diagnostic plots: variable importance, residual vs observed, predicted vs observed.
pdf("rf.all.raw.500bp.importance.pdf")
varImpPlot(rf)
dev.off()
pdf("rf.all.raw.500bp.residual.pdf")
plot(rf$y, rf$y - rf$predicted)
dev.off()
pdf("rf.all.raw.500bp.predicted.pdf")
plot(rf$y, rf$predicted)
dev.off()
# Rebuild the feature matrix from the raw per-window values only (no wavelet coefficients).
# MassSpecWavelet is attached but cwt() is not called anywhere in this section.
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# NOTE(review): duplicate of the setwd() call directly above.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# `gene` and `window` are read without a header row; the other tables with one.
gene <- read.delim("ecoli.gene.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.delim("500bp.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.delim("ecoli.500bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
# Rows of `gene` per (V1, V2, V3) combination, one row kept per combination.
# NOTE(review): column 14 is taken to be gene.count (13 input columns assumed) - TODO confirm.
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
# Row index (Var1) per table and a constant Var2 = 0, used as join keys below.
# NOTE(review): joining on independently assigned row numbers assumes all tables list the
# same windows in the same order - TODO confirm.
structure$Var1 <- seq.int(nrow(structure))
gene.count$Var1 <- seq.int(nrow(gene.count))
nuc$Var1 <- seq.int(nrow(nuc))
rnaseq$Var1 <- seq.int(nrow(rnaseq))
structure$Var2 <- 0
gene.count$Var2 <- 0
nuc$Var2 <- 0
rnaseq$Var2 <- 0
colnames(window) <- c("chr", "start", "end")
window$Var1 <- seq.int(nrow(window))
colnames(score) <- c("chr", "start", "end", "score")
window.score <- left_join(window, score, by=c("chr", "start", "end"))
# Each slice is (value column, Var1, Var2), selected by position.
window.score.temp <- left_join(window.score, nuc[,8:10], by="Var1")
window.temp.gc <- left_join(window.score.temp, nuc[,c(7,9:10)], by=c("Var1", "Var2"))
window.temp.gc.gene <- left_join(window.temp.gc, gene.count[,c(4:6)], by=c("Var1", "Var2"))
window.temp.gc.gene.rna <- left_join(window.temp.gc.gene, rnaseq[,c(4:6)], by=c("Var1", "Var2"))
window.temp.gc.gene.rna.structure <- left_join(window.temp.gc.gene.rna, structure[,c(2,22,23)], by=c("Var1", "Var2"))
# Positional rename: Var1 -> window, Var2 -> scale (always 0 here).
colnames(window.temp.gc.gene.rna.structure) <- c("chr", "start", "end", "window", "score", "melting.temp", "scale", "gc.content", "gene.density", "fpkm", "structure")
df2.melt <- melt(window.temp.gc.gene.rna.structure[,4:11], id=c("window", "score", "scale"))
df2 <- na.omit(df2.melt)
library(tidyr)
# "<feature><scale>" labels (all end in 0), then one column per label keyed by window + score.
df.id <- df2 %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
# Tune mtry and fit a random forest on the raw (scale-0) features only.
library(randomForest)
# Predictors for tuneRF(): everything except the window id and the response.
# The previous call used df.dcast[-1], which removed only `window` and therefore left
# `score` among the predictors, so the forest was tuned on its own target.
predictors <- df.dcast[, !(names(df.dcast) %in% c("window", "score")), drop = FALSE]
mtry <- tuneRF(predictors, df.dcast$score, ntreeTry=500, stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
print(mtry)
# Output recorded with the ORIGINAL call (x still contained `score`, hence six candidate
# predictors and an OOB error that collapses towards zero); re-run to refresh:
# mtry OOBError
# 2 2 0.35827938
# 3 3 0.06911110
# 4 4 0.01751164
# 6 6 0.00107009
print(best.m)
set.seed(131)
# The formula interface excludes the response by construction. mtry stays fixed at 5
# (all five features) to match the recorded run below; best.m is only printed.
rf <- randomForest(score ~ ., data=df.dcast[-1], mtry=5, importance=TRUE, ntree=500)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df.dcast[-1], mtry = 5, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 5
#
# Mean of squared residuals: 41.95043
# % Var explained: -4.12
imp <- data.frame(importance(rf))
# Sort by X.IncMSE; the previous imp[order(imp),] ordered the whole data frame and
# relied on na.omit() to discard the out-of-range rows it produced.
imp.order <- imp[order(imp$X.IncMSE), ]
tail(na.omit(imp.order))
# X.IncMSE IncNodePurity
# gene.density0 2.946551 7251.867
# fpkm0 12.727303 78364.490
# structure0 21.238520 44787.742
# gc.content0 35.183095 27790.830
# melting.temp0 37.730071 27929.362
library(ranger)
# Iterative random forest (iRF) built on ranger.
#
# Fits up to `iter` forests in sequence. The first forest samples split
# candidates with equal weight for every column of `xmat`; each later forest
# uses the previous forest's "impurity_corrected" importances (scaled by the
# sum of their absolute values, negatives set to zero) as split.select.weights.
# Iteration stops early once fewer than max(0.01 * (ncol(xmat) - 1), 10)
# columns keep a positive weight, so with 10 or fewer predictors only one
# forest is ever fitted.
#
# Arguments:
#   xmat            predictor columns; bound to y with cbind().
#   y               response; stored in a column named "Y".
#   ntree           trees per forest (ranger num.trees).
#   iter            maximum number of forests to fit.
#   classification  passed to ranger(classification = ).
#   threads         passed to ranger(num.threads = ).
#   alwayssplits    passed to ranger(always.split.variables = ).
#   saveall         TRUE: return a list with one forest per iteration.
#                   FALSE: return only the last forest fitted.
#
# Returns: a list of ranger objects (saveall = TRUE) or a single ranger object
#   (saveall = FALSE); an empty list when iter < 1.
#
# Changes from the previous version:
#   * saveall = FALSE used to return an empty list unless the early-stop
#     branch was taken; the last forest is now always returned.
#   * seq_len(iter) replaces 1:iter, which ran iterations 1 and 0 for iter = 0.
#   * if every importance is zero (or not finite) the weights used to become
#     NaN and the stop test failed on an NA condition; the weights are now set
#     to zero so the normal stop rule ends the loop.
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weight for every predictor column
  rfs <- list()
  rf <- NULL
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # Half of the predictors that still carry weight. May be fractional: the run
    # recorded after the call below prints "mtry: 2.5" while ranger reports Mtry 2.
    mtry = 0.5*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    imp.total <- sum(abs(rf$variable.importance))
    if(isTRUE(imp.total > 0))
    {
      wt <- rf$variable.importance / imp.total # scale importances by their absolute sum
      wt[wt<0] <- 0 # set negative weights to zero
    } else {
      # Nothing informative left to re-weight by; zero weights trigger the stop rule below.
      wt <- rep(0, length(rf$variable.importance))
    }
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if(saveall) rfs[[i]] <- rf
    # Stop when too few predictors remain to make another weighted forest worthwhile.
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  if(!saveall && !is.null(rf)) rfs <- rf
  return(rfs)
}
# Run iRF on the raw-only matrix: columns 3..ncol(df.dcast) are the predictors,
# df.dcast$score the response; all other arguments keep their defaults.
iRF(df.dcast[,3:ncol(df.dcast)], df.dcast$score)
# Recorded output. Only one iteration is shown: 4 predictors kept a positive weight,
# which is below the stop threshold max(0.01*(5-1), 10) = 10 for 5 predictors.
# mtry: 2.5
# prediction error: 39.78074
# r^2: 0.01285541
# cor(y,yhat): 0.1666902
# SNPs with importance > 0: 4
#
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 5
# Mtry: 2
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 39.78074
# R squared (OOB): 0.01285541
# References for the caret / randomForest workflow below: https://shiring.github.io/machine_learning/2017/03/16/rf_plot_ggraph ; https://www.datacareer.ch/blog/random-forest-in-r-an-example/
# Shell: environment setup. The `R` launch itself is left commented out below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
library(randomForest)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Long-format input; the dcast() below expects columns window, score, scale, variable and value.
data <- read.delim("window.temp.gc.structure.rna.gene.input.txt", header=T, sep="\t", stringsAsFactors = F)
df <- na.omit(data)
library(reshape2)
library(dplyr)
# One row per window/score/scale, one column per feature named in `variable`.
# Here `scale` stays a column instead of being folded into the feature names.
df.dcast <- df %>% dcast(window + score + scale ~ variable, value.var = "value")
df.dcast <- na.omit(df.dcast)
# separate into training and test data
library(caret)
set.seed(42)
# 70 % of the rows go to training; the partition is drawn on df.dcast$score.
index <- createDataPartition(df.dcast$score, p = 0.7, list = FALSE)
train_data <- df.dcast[index, ]
test_data <- df.dcast[-index, ]
# run model
set.seed(42)
# Random forest through caret with 10-fold cross-validation repeated 10 times; predictors are
# centred and scaled. train_data[-1] drops the first column (window).
model_rf <- caret::train(score ~ ., data = train_data[-1], method = "rf", preProcess = c("scale", "center"), trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, savePredictions = TRUE, verboseIter = FALSE))
model_rf$results
# mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
# 1 2 5.197843 0.1176861 3.976653 0.1568225 0.02746348 0.1076914
# 2 4 5.175104 0.1274827 3.946541 0.1562126 0.02990854 0.1081773
# 3 6 5.165462 0.1315344 3.932913 0.1586892 0.03081919 0.1100221
# prediction and performance metrics
pred <- predict(model_rf, test_data)
library(ROCR)
perf = prediction(data.frame(pred), data.frame(test_data$score))
# Message recorded from the call above:
## ROCR currently supports only evaluation of binary classification tasks.
# NOTE(review): given that message, the AUC/ROC steps below are not expected to produce
# a result for this regression target.
# 1. Area under curve
auc = performance(perf, "auc")
auc
# 2. True Positive and Negative Rate
pred3 = performance(perf, "tpr","fpr")
# 3. Plot the ROC curve
# NOTE(review): the same file name rf.roc.pdf is reused by later sections of this notebook.
pdf("rf.roc.pdf")
plot(pred3,main="ROC Curve for Random Forest",col=2,lwd=2)
abline(a=0,b=1,lwd=2,lty=2,col="gray")
dev.off()
# alternative approach to look at each feature contribution --> individual features play a small role (especially in a poorly predictive model)
# Create features and target
# X keeps six columns (five features plus scale); y is the score.
X <- df.dcast %>% select(fpkm, gene.density, scale, structure, gc.content, melting.temp)
y <- df.dcast$score
# Split data into training and test sets
# 75 % training rows. NOTE(review): no set.seed() is called directly before this split.
index <- createDataPartition(y, p=0.75, list=FALSE)
X_train <- X[ index, ]
X_test <- X[-index, ]
y_train <- y[index]
y_test<-y[-index]
# Train the model
# Deliberately small forest: 10 trees, each limited to 10 terminal nodes.
regr <- randomForest(x = X_train, y = y_train , maxnodes = 10, ntree = 10)
# Make prediction
predictions <- predict(regr, X_test)
# Test-set predictors with the observed score and the prediction side by side.
result <- X_test
result['score'] <- y_test
result['prediction']<- predictions
head(result)
# fpkm gene.density scale structure gc.content melting.temp score
# 1 -445.6984 -0.06047305 1 0.012161122 -0.065808777 -2.6981599 17.72125
# 9 2580.6988 0.82450852 25 0.035880115 -0.031645828 -1.2974790 17.72125
# 11 2178.4112 0.58351224 31 0.049097433 -0.067301541 -2.7593632 17.72125
# 14 1455.3243 0.44398115 40 0.043869933 -0.070270784 -2.8811021 17.72125
# 17 817.3342 0.46428202 49 0.014476622 -0.022961904 -0.9414381 17.72125
# 18 672.7136 0.46254925 52 0.002600081 -0.002414944 -0.0990127 17.72125
# prediction
# 1 24.60309
# 9 24.30413
# 11 24.51526
# 14 24.51526
# 17 24.38288
# 18 24.38288
# Build scatterplots: observed and predicted score against each test-set feature.
library(ggplot2)

# Write rf.scatter.<feature>.pdf for one column of the test predictors.
#
# Replaces six copy-pasted plot blocks. Compared with those:
#   * alpha = 0.5 is a fixed setting outside aes(); inside aes() it was mapped as a
#     variable and produced an extra one-entry legend (titled 'Transperency').
#   * the points come from one data frame instead of `$` lookups inside aes().
#   * print() is explicit, because ggplot objects are not auto-printed inside a
#     function or loop.
#
# Arguments:
#   feature    name of a column of X.
#   X          test-set predictors (default: X_test).
#   observed   observed scores (default: y_test).
#   predicted  model predictions (default: predictions).
# Returns the ggplot object invisibly; the PDF is written in the working directory.
plot_feature_scatter <- function(feature, X = X_test, observed = y_test, predicted = predictions) {
  # Observed rows first so that predicted points are drawn on top, as before.
  pts <- data.frame(
    x = rep(X[[feature]], times = 2),
    score = c(observed, predicted),
    series = rep(c("Real", "Predicted"), each = nrow(X))
  )
  p <- ggplot(pts, aes(x = x, y = score, color = series)) +
    geom_point(alpha = 0.5) +
    labs(x = feature, y = "score", color = "") +
    scale_color_manual(values = c(Predicted = "blue", Real = "red"))
  pdf(paste0("rf.scatter.", feature, ".pdf"))
  on.exit(dev.off())
  print(p)
  invisible(p)
}

for (feature in c("fpkm", "gene.density", "scale", "structure", "gc.content", "melting.temp")) {
  plot_feature_scatter(feature)
}
# Hold-out error metrics for the small forest (mae() comes from the Metrics package).
library(Metrics)
# postResample() is evaluated once and its RMSE / Rsquared entries reused below.
holdout.stats <- caret::postResample(predictions , y_test)
print(paste0('MAE: ' , mae(y_test,predictions) ))
# MAE: 4.21631985178738
print(paste0('MSE: ' , holdout.stats['RMSE']^2 ))
# MSE: 28.8711109032785
print(paste0('R2: ' , holdout.stats['Rsquared'] ))
# R2: 0.0272489286481885
# Tuning parameters for the grid search below.
# Only the first N training rows are used; lower N if training takes too long.
# NOTE(review): if X_train has fewer than N rows, 1:N indexing pads with NA rows.
N=500 #length(X_train)
X_train_ = X_train[1:N , ]
y_train_ = y_train[1:N]
# Seed and selection metric handed to set.seed() / train() further down.
seed <-7
metric<-'RMSE'
# Custom caret model specification: a randomForest regression whose tuning
# parameters are maxnodes and ntree. Written as a single list literal; the
# element order matches the original step-by-step construction.
customRF <- list(
  type = "Regression",
  library = "randomForest",
  loop = NULL,
  # One row per tunable parameter.
  parameters = data.frame(parameter = c("maxnodes", "ntree"), class = rep("numeric", 2), label = c("maxnodes", "ntree")),
  # Empty body (returns NULL); the grid is supplied through tuneGrid in the train() call.
  grid = function(x, y, len = NULL, search = "grid") {},
  # Fit one forest for a single row of the tuning grid.
  fit = function(x, y, wts, param, lev, last, weights, classProbs, ...) {
    randomForest(x, y, maxnodes = param$maxnodes, ntree=param$ntree, ...)
  },
  predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
    predict(modelFit, newdata)
  },
  prob = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
    predict(modelFit, newdata, type = "prob")
  },
  # Order candidate parameter sets by their first column.
  sort = function(x) x[order(x[,1]),],
  levels = function(x) x$classes
)
# Set grid search parameters
# 10-fold cross-validation repeated 3 times.
control <- trainControl(method="repeatedcv", number=10, repeats=3, search='grid')
# Outline the grid of parameters
# 4 maxnodes values x 3 ntree values = 12 candidate models.
tunegrid <- expand.grid(.maxnodes=c(70,80,90,100), .ntree=c(900, 1000, 1100))
set.seed(seed)
# Train the model
# train() is called unqualified, so caret must already be attached in this session.
rf_gridsearch <- train(x=X_train_, y=y_train_, method=customRF, metric=metric, tuneGrid=tunegrid, trControl=control)
# Plot of the tuning results.
pdf("rf.gridsearch.pdf")
plot(rf_gridsearch)
dev.off()
# Parameter combination selected by train().
rf_gridsearch$bestTune
# Variable importance of the final model refit by train().
pdf("rf.gridsearch.importance.pdf")
varImpPlot(rf_gridsearch$finalModel, main ='Feature importance')
dev.off()
```
## Categorical
```{r eval=F}
# Shell: environment setup. The `R` launch itself is left commented out below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
library(dplyr)
library(reshape2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("window.temp.gc.structure.rna.gene.df.txt", header=T, sep="\t", stringsAsFactors = F)
quality <- read.delim("window.quality.max.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(quality) <- c("chr", "start", "end", "quality")
# Attach the per-window quality label by genomic coordinates.
df.qual <- left_join(df, quality, by=c("chr", "start", "end"))
# Long format keyed by window/scale/quality.
# NOTE(review): columns are chosen by position (4, 6:12); this presumes column 4 is `window`
# and that 6:12 contain `scale`, the feature columns and `quality` - TODO confirm against the file.
df.melt <- melt(df.qual[,c(4,6:12)], id=c("window", "scale", "quality"))
write.table(df.melt, "window.temp.gc.structure.rna.gene.input.category.txt", quote=F, row.names=F, sep="\t")
# Random forest predicting the per-window `quality` label from the wavelet features.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.melt <- read.delim("window.temp.gc.structure.rna.gene.input.category.txt", header=T, sep="\t", stringsAsFactors = F)
library(randomForest)
library(reshape2)
library(dplyr)
library(caret)
df.na <- na.omit(df.melt)
# One row per window/quality/scale, one column per feature named in `variable`.
df.dcast <- df.na %>% dcast(window + quality + scale ~ variable, value.var = "value")
df.dcast <- na.omit(df.dcast)
# Force the predictor columns to numeric. Columns that are absent are skipped: the
# previous unconditional `df.dcast$binary <- as.numeric(df.dcast$binary)` raises
# "replacement has 0 rows" whenever the input has no `binary` variable.
for (feature.col in c("scale", "fpkm", "gc.content", "gene.density", "melting.temp", "structure", "binary")) {
  if (feature.col %in% names(df.dcast)) {
    df.dcast[[feature.col]] <- as.numeric(df.dcast[[feature.col]])
  }
}
set.seed(42)
# 70 % of the rows go to training; the partition is drawn on `quality`.
index <- createDataPartition(df.dcast$quality, p = 0.7, list = FALSE)
train_data <- df.dcast[index, ]
test_data <- df.dcast[-index, ]
dim(train_data)
dim(test_data)
# train_data[-1] drops the first column (window) so it is not used as a predictor.
rf <- randomForest(quality ~ ., data=train_data[-1])
rf_classifier = randomForest(quality ~ ., data=train_data[-1], ntree=100, mtry=2, importance=TRUE)
# NOTE(review): predictions come from `rf` (default settings); rf_classifier is fitted
# but not used here - confirm which model was intended.
pred = predict(rf, newdata=test_data)
# Observed vs predicted. The previous table(test_data, pred) passed the whole data
# frame as the first factor instead of the quality column.
cm = table(observed = test_data$quality, predicted = pred)
# Same split / caret workflow, here applied to the long (melted) table df.na rather than df.dcast.
set.seed(42)
index <- createDataPartition(df.na$quality, p = 0.7, list = FALSE)
train_data <- df.na[index, ]
test_data <- df.na[-index, ]
# run model
set.seed(42)
# 10-fold cross-validation repeated 10 times with centred/scaled predictors; [-1] drops the first column.
model_rf <- caret::train(quality ~ ., data = train_data[-1], method = "rf", preProcess = c("scale", "center"), trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, savePredictions = TRUE, verboseIter = FALSE))
model_rf$results
# NOTE(review): `rf` is not assigned in this section; these two lines report whatever `rf`
# was fitted earlier in the session, not model_rf.
print(rf)
importance(rf)
# prediction and performance metrics
pred <- predict(model_rf, test_data)
library(ROCR)
perf = prediction(data.frame(pred), data.frame(test_data$quality))
# 1. Area under curve
auc = performance(perf, "auc")
auc
# 2. True Positive and Negative Rate
pred3 = performance(perf, "tpr","fpr")
# 3. Plot the ROC curve
# NOTE(review): rf.roc.pdf is also written by other sections of this notebook and gets overwritten.
pdf("rf.roc.pdf")
plot(pred3,main="ROC Curve for Random Forest",col=2,lwd=2)
abline(a=0,b=1,lwd=2,lty=2,col="gray")
dev.off()
# Repeat the caret workflow on the wide table: one row per window/quality/scale,
# one column per feature. No na.omit() or numeric coercion is applied this time.
df.dcast <- df.na %>% dcast(window + quality + scale ~ variable, value.var = "value")
set.seed(42)
index <- createDataPartition(df.dcast$quality, p = 0.7, list = FALSE)
train_data <- df.dcast[index, ]
test_data <- df.dcast[-index, ]
# run model
set.seed(42)
# 10-fold cross-validation repeated 10 times with centred/scaled predictors; [-1] drops `window`.
model_rf <- caret::train(quality ~ ., data = train_data[-1], method = "rf", preProcess = c("scale", "center"), trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, savePredictions = TRUE, verboseIter = FALSE))
model_rf$results
# NOTE(review): `rf` is not assigned in this section; these two lines report whatever `rf`
# was fitted earlier in the session, not model_rf.
print(rf)
importance(rf)
# prediction and performance metrics
pred <- predict(model_rf, test_data)
library(ROCR)
perf = prediction(data.frame(pred), data.frame(test_data$quality))
# 1. Area under curve
auc = performance(perf, "auc")
auc
# 2. True Positive and Negative Rate
pred3 = performance(perf, "tpr","fpr")
# 3. Plot the ROC curve
# NOTE(review): rf.roc.pdf is also written by other sections of this notebook and gets overwritten.
pdf("rf.roc.pdf")
plot(pred3,main="ROC Curve for Random Forest",col=2,lwd=2)
abline(a=0,b=1,lwd=2,lty=2,col="gray")
dev.off()
```
## Use gRNA sequence regions
- add gRNA sequence region information (raw or wavelet) to the region-based matrix?
```{r eval=F}
# Shell: environment setup. The `R` launch itself is left commented out below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
d1 <- read.delim("DataS1.txt", header=T, sep="\t")
d4 <- read.delim("DataS4.txt", header=T, sep="\t")
d6 <- read.delim("DataS6.txt", header=T, sep="\t")
# sgRNA coordinates (BED-like, no header): chr, start, end, sgRNA id.
coord <- read.delim("ecoli.gRNA.blast.bed", header=F, sep="\t")
colnames(coord) <- c("chr", "start", "end", "sgRNA")
# Copy sgRNAID into a column named sgRNA so the tables share a join key with `coord`.
d1$sgRNA <- d1$sgRNAID
d4$sgRNA <- d4$sgRNAID
library(dplyr)
# Left joins keep every coordinate row and add the columns of each data table in turn.
# NOTE(review): unlike d1 and d4, d6 is not given an sgRNA column here, so the last join
# assumes DataS6.txt already has one - TODO confirm.
df <- left_join(coord, d1, by="sgRNA")
df2 <- left_join(df, d4, by="sgRNA")
df3 <- left_join(df2, d6, by="sgRNA")
write.table(df3, "sgRNA.coord.txt", quote=F, row.names=F, sep="\t")
# Same join as for sgRNA.coord.txt, but with the DataS1.rbs.txt rows appended to DataS1.txt
# and coordinates taken from ecoli.gRNA.all.blast.bed.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
d1 <- read.delim("DataS1.txt", header=T, sep="\t")
d1rbs <- read.delim("DataS1.rbs.txt", header=T, sep="\t")
d4 <- read.delim("DataS4.txt", header=T, sep="\t")
d6 <- read.delim("DataS6.txt", header=T, sep="\t")
coord <- read.delim("ecoli.gRNA.all.blast.bed", header=F, sep="\t")
colnames(coord) <- c("chr", "start", "end", "sgRNA")
# Copy sgRNAID into a column named sgRNA so the tables share a join key with `coord`.
d1$sgRNA <- d1$sgRNAID
d1rbs$sgRNA <- d1rbs$sgRNAID
# rbind() requires d1 and d1rbs to have the same columns.
d1all <- rbind(d1, d1rbs)
d4$sgRNA <- d4$sgRNAID
library(dplyr)
# NOTE(review): d6 is joined on sgRNA without that column being created here - TODO confirm
# DataS6.txt already has it.
df <- left_join(coord, d1all, by="sgRNA")
df2 <- left_join(df, d4, by="sgRNA")
df3 <- left_join(df2, d6, by="sgRNA")
write.table(df3, "sgRNA.rbs.coord.txt", quote=F, row.names=F, sep="\t")
```
### melting temp
https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html
```{r eval=F}
# Shell: environment setup, then start an interactive Python session in the project directory.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# The Python statements that follow are typed into this interpreter.
python
# Count A/C/G/T in every record of ecoli.gRNA.fasta and write one tab-separated row
# per record to gRNA_nuc_counts.tsv.
# The 'CG%' column holds the fraction (C+G)/length, i.e. a value in 0-1, not a percentage.
# Counting is case-sensitive: lower-case bases are not counted.
from Bio import SeqIO
# The with-statement closes both files even if parsing stops with an error
# (the earlier open()/close() pairs leaked the handles in that case).
# No blank lines inside the block: in the interactive interpreter a blank line ends it.
with open('ecoli.gRNA.fasta', 'r') as input_file, open('gRNA_nuc_counts.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        # An empty record used to raise ZeroDivisionError; report 0 for it instead.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)

exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
# Start R, add a MeltingTemp column computed from the per-record base counts, write the table, quit.
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("gRNA_nuc_counts.tsv", header=T, sep="\t")
# Copy of the count table with the new column appended at the end.
df.melt <- df
df.melt$MeltingTemp <- with(df, 64.9 + 41 * (G+C-16.4) / (A+T+G+C))
write.table(df.melt, "gRNA_nuc_counts_temp.txt", quote=F, row.names=F, sep="\t")
q()
# count nucleotides - all sgRNA (RBS)
# Shell: change to the project directory and start an interactive Python session;
# the Python statements that follow are typed into it.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
# Count A/C/G/T in every record of ecoli.gRNA.rbs.fasta and write one tab-separated row
# per record to gRNA_rbs_nuc_counts.tsv.
# The 'CG%' column holds the fraction (C+G)/length, i.e. a value in 0-1, not a percentage.
# Counting is case-sensitive: lower-case bases are not counted.
from Bio import SeqIO
# The with-statement closes both files even if parsing stops with an error
# (the earlier open()/close() pairs leaked the handles in that case).
# No blank lines inside the block: in the interactive interpreter a blank line ends it.
with open('ecoli.gRNA.rbs.fasta', 'r') as input_file, open('gRNA_rbs_nuc_counts.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        # An empty record used to raise ZeroDivisionError; report 0 for it instead.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)

exit()
# Start R, add a MeltingTemp column (64.9 + 41 * (G+C-16.4) / (A+T+G+C)) to the RBS
# sgRNA count table, write it out, quit.
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("gRNA_rbs_nuc_counts.tsv", header=T, sep="\t")
# Copy of the count table with the new column appended at the end.
df.melt <- df
df.melt$MeltingTemp <- with(df, 64.9 + 41 * (G+C-16.4) / (A+T+G+C))
write.table(df.melt, "gRNA_rbs_nuc_counts_temp.txt", quote=F, row.names=F, sep="\t")
q()
```
### structure
```{r eval=F}
# Run iFeature.py with --type AAC on the three gRNA FASTA files in the project directory.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
ifeature=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py
# $1 = input FASTA, $2 = output table
run_aac() {
  python "$ifeature" --file "$1" --type AAC --out "$2"
}
run_aac ecoli.gRNA.fasta ecoli.gRNA.structure.txt
run_aac ecoli.gRNA.rbs.fasta ecoli.gRNA.rbs.structure.txt
run_aac ecoli_sgRNA_fullseq.fasta ecoli.gRNA.rbs.fullseq.structure.txt
```
### wavelets
```{r eval=F}
# Shell: load the python module, the conda helper and the "test" environment.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
# Purpose: wavelet-transform five per-window tracks (melting temp, GC, structure,
# RNA-seq, gene count) and join the coefficients onto the window/score table.
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Tab-delimited inputs; the files read with header=F get default V1, V2, ... names.
gene <- read.delim("ecoli.gene.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.delim("500bp.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.delim("ecoli.500bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
# Number of `gene` rows sharing each (V1, V2, V3) key, reduced to one row per key.
# presumably V1-V3 are the window chr/start/end and column 14 is gene.count — TODO confirm
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
gene.df <- gene.count$gene.count
# Signals are picked by column position.
# NOTE(review): the variable names suggest 2 = structure value, 7 = GC fraction,
# 8 = melting temperature, 4 = expression — confirm against the file headers.
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq[,4]
## Plot the 2-D CWT coefficients as image (It may take a while!)
# Continuous wavelet transform ('mexh') of each signal at scales 1, 4, ..., 64.
scales <- seq(1, 64, 3)
wCoefs.temp <- cwt(temp.df, scales=scales, wavelet='mexh')
wCoefs.gc <- cwt(gc.df, scales=scales, wavelet='mexh')
wCoefs.structure <- cwt(structure.df, scales=scales, wavelet='mexh')
wCoefs.rna <- cwt(rna.df, scales=scales, wavelet='mexh')
wCoefs.gene <- cwt(gene.df, scales=scales, wavelet='mexh')
# Coefficient matrices to long format; the joins below use the Var1/Var2 key columns.
wCoefs.temp.melt <- melt(wCoefs.temp)
wCoefs.gc.melt <- melt(wCoefs.gc)
wCoefs.structure.melt <- melt(wCoefs.structure)
wCoefs.rna.melt <- melt(wCoefs.rna)
wCoefs.gene.melt <- melt(wCoefs.gene)
# Var1 = window row number, used as the join key to the melted coefficients.
colnames(window) <- c("chr", "start", "end")
window$Var1 <- seq.int(nrow(window))
colnames(score) <- c("chr", "start", "end", "score")
window.score <- left_join(window, score, by=c("chr", "start", "end"))
# The first join (Var1 only) brings in Var2; later joins match on both keys.
window.score.temp <- left_join(window.score, wCoefs.temp.melt, by="Var1")
window.temp.gc <- left_join(window.score.temp, wCoefs.gc.melt, by=c("Var1", "Var2"))
window.temp.gc.structure <- left_join(window.temp.gc, wCoefs.structure.melt, by=c("Var1", "Var2"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, wCoefs.rna.melt, by=c("Var1", "Var2"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, wCoefs.gene.melt, by=c("Var1", "Var2"))
# Final names follow the join order above (Var1 -> window, Var2 -> scale, then one value column per track).
colnames(window.temp.gc.structure.rna.gene) <- c("chr", "start", "end", "window", "score", "scale", "melting.temp", "gc.content", "structure", "fpkm", "gene.density")
# add gRNA sequence attributes
# Wavelet-transforms the per-sgRNA structure/GC/melting-temperature tracks and
# tries to attach them to the 500 bp window table (window.temp.gc.structure.rna.gene).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
structure <- read.delim("ecoli.gRNA.structure.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("gRNA_nuc_counts_temp.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Signals are picked by column position (2, 7, 8).
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
## Plot the 2-D CWT coefficients as image (It may take a while!)
# 'mexh' CWT at scales 1, 4, ..., 64, then long format.
scales <- seq(1, 64, 3)
wCoefs.temp <- cwt(temp.df, scales=scales, wavelet='mexh')
wCoefs.gc <- cwt(gc.df, scales=scales, wavelet='mexh')
wCoefs.structure <- cwt(structure.df, scales=scales, wavelet='mexh')
wCoefs.temp.melt <- melt(wCoefs.temp)
wCoefs.gc.melt <- melt(wCoefs.gc)
wCoefs.structure.melt <- melt(wCoefs.structure)
# Adds the sgRNA name to window rows whose chr/start/end equal an sgRNA's coordinates.
features.score <- left_join(window.temp.gc.structure.rna.gene, score[,1:4], by=c("chr", "start", "end"))
# NOTE(review): elsewhere in this file the melted coefficient tables are joined on
# Var1/Var2, and nothing here adds an "sgRNA" column to them, so the three
# by="sgRNA" joins below look like they cannot match — confirm.
score.temp <- left_join(features.score, wCoefs.temp.melt, by="sgRNA")
window.temp.gc <- left_join(score.temp, wCoefs.gc.melt, by=c("sgRNA", "Var2"))
window.temp.gc.structure <- left_join(window.temp.gc, wCoefs.structure.melt, by=c("sgRNA", "Var2"))
# NOTE(review): nine names are assigned, but the table starts from the 11-column
# window table plus sgRNA — confirm the intended column set.
colnames(window.temp.gc.structure) <- c("chr", "start", "end", "window", "score", "scale", "melting.temp", "gc.content", "structure")
write.table(window.temp.gc.structure, "gRNA.wavelets.df.txt", quote=F, row.names=F, sep="\t")
# Long format of columns 4-9, keyed by window/score/scale.
df.melt <- melt(window.temp.gc.structure[,4:9], id=c("window", "score", "scale"))
write.table(df.melt, "gRNA.wavelets.input.txt", quote=F, row.names=F, sep="\t")
## Plot the 2-D CWT coefficients as image (It may take a while!)
# Melting Temperature
# Heat map of the CWT coefficients of temp.df, followed by MassSpecWavelet
# ridge-based peak detection; each of the three plots goes to its own PDF.
scales <- seq(1, 64, 3)
wCoefs <- cwt(temp.df, scales=scales, wavelet='mexh')
pdf("ecoli.gRNA.temp.wavelet.pdf")
image(1:length(temp.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, xlab='m/z index', ylab='CWT coefficient scale', main='CWT Temp coefficients')
box()
dev.off()
# Prepend the raw signal as a column labelled 0 ahead of the per-scale columns.
wCoefs <- cbind(as.vector(temp.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
# Fixed: the local maxima used to be stored in `h`, so `localMax` (needed by
# getRidge() and plotLocalMax() below) was undefined or left over from an
# earlier run. The GC and Structure sections already assign `localMax`.
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
SNR.Th <- 3
nearbyPeak <- TRUE
majorPeakInfo <- identifyMajorPeaks(temp.df, ridgeList, wCoefs, SNR.Th = SNR.Th, nearbyPeak=nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex
pdf("ecoli.gRNA.temp.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()
plotRange <- c(1,length(temp.df))
pdf("ecoli.gRNA.temp.peak.pdf")
plotPeak(temp.df, peakIndex, range=plotRange, main=paste('Identified Expression peaks with SNR >', SNR.Th))
dev.off()
# GC content
# CWT heat map, ridge-based peak detection and peak plots for gc.df (three PDFs).
SNR.Th <- 3
nearbyPeak <- TRUE
scales <- seq(from = 1, to = 64, by = 3)
plotRange <- c(1, length(gc.df))

# 1) coefficient heat map
wCoefs <- cwt(gc.df, scales = scales, wavelet = 'mexh')
pdf("ecoli.gRNA.gc.wavelet.pdf")
image(1:length(gc.df), scales, wCoefs,
      col = terrain.colors(256), axes = FALSE,
      xlab = 'm/z index', ylab = 'CWT coefficient scale',
      main = 'CWT GC coefficients')
box()
dev.off()

# 2) raw signal becomes column "0", then local maxima -> ridges -> major peaks
wCoefs <- cbind(as.vector(gc.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
majorPeakInfo <- identifyMajorPeaks(gc.df, ridgeList, wCoefs,
                                    SNR.Th = SNR.Th, nearbyPeak = nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex

# 3) diagnostic plots
pdf("ecoli.gRNA.gc.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()
pdf("ecoli.gRNA.gc.peak.pdf")
plotPeak(gc.df, peakIndex, range = plotRange,
         main = paste('Identified Expression peaks with SNR >', SNR.Th))
dev.off()
# Structure
# CWT heat map, ridge-based peak detection and peak plots for structure.df (three PDFs).
SNR.Th <- 3
nearbyPeak <- TRUE
scales <- seq(from = 1, to = 64, by = 3)
plotRange <- c(1, length(structure.df))

# 1) coefficient heat map
wCoefs <- cwt(structure.df, scales = scales, wavelet = 'mexh')
pdf("ecoli.gRNA.structure.wavelet.pdf")
image(1:length(structure.df), scales, wCoefs,
      col = terrain.colors(256), axes = FALSE,
      xlab = 'm/z index', ylab = 'CWT coefficient scale',
      main = 'CWT Structure coefficients')
box()
dev.off()

# 2) raw signal becomes column "0", then local maxima -> ridges -> major peaks
wCoefs <- cbind(as.vector(structure.df), wCoefs)
colnames(wCoefs) <- c(0, scales)
localMax <- getLocalMaximumCWT(wCoefs)
ridgeList <- getRidge(localMax)
majorPeakInfo <- identifyMajorPeaks(structure.df, ridgeList, wCoefs,
                                    SNR.Th = SNR.Th, nearbyPeak = nearbyPeak)
peakIndex <- majorPeakInfo$peakIndex

# 3) diagnostic plots
pdf("ecoli.gRNA.structure.wavelet.peak.pdf")
plotLocalMax(localMax, wCoefs)
dev.off()
pdf("ecoli.gRNA.structure.peak.pdf")
plotPeak(structure.df, peakIndex, range = plotRange,
         main = paste('Identified Expression peaks with SNR >', SNR.Th))
dev.off()
```
### regression
- create a data table with just the gRNA and gRNA sequence specific features (raw and wavelet) to run through iRF --> predict specific gRNA score
- add the 500bp window data (raw and wavelet) to run through iRF --> predict specific gRNA score and average 500bp window score
```{r eval=F}
# Purpose: build one table per sgRNA holding raw and wavelet-transformed
# melting temperature / GC / structure values, save it in long form, then
# reshape to one row per sgRNA with one column per <feature><scale>.
library(dplyr)
library(reshape2)
library(tidyr)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
score <- read.delim("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Keep chr, start, end, sgRNA and cut.score.
score.df <- score[,c(1:4,8)]
structure <- read.delim("ecoli.gRNA.structure.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("gRNA_nuc_counts_temp.txt", header=T, sep="\t", stringsAsFactors = F)
# Signals picked by column position (2, 7, 8).
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
# 'mexh' CWT at scales 1, 4, ..., 64, then long format (Var1, Var2, value).
scales <- seq(1, 64, 3)
wCoefs.temp <- cwt(temp.df, scales=scales, wavelet='mexh')
wCoefs.gc <- cwt(gc.df, scales=scales, wavelet='mexh')
wCoefs.structure <- cwt(structure.df, scales=scales, wavelet='mexh')
wCoefs.temp.melt <- melt(wCoefs.temp)
wCoefs.gc.melt <- melt(wCoefs.gc)
wCoefs.structure.melt <- melt(wCoefs.structure)
# Var1 = row number; tables are matched by row position rather than by name.
# NOTE(review): this assumes score, structure and nuc list the sgRNAs in the same order — confirm.
score.df$Var1 <- seq.int(nrow(score.df))
# presumably "X." is the first (id) column of the structure file after read.delim renamed it — TODO confirm
structure$sgRNA <- structure$X.
nuc$sgRNA <- nuc$Window
# Raw value plus the sgRNA column just added (positions 22 and 9 assume 21 and 8 original columns — TODO confirm).
structure.val <- structure[,c(2,22)]
gc.val <- nuc[,c(7,9)]
temp.val <- nuc[,c(8,9)]
structure.val$Var1 <- seq.int(nrow(structure.val))
gc.val$Var1 <- seq.int(nrow(gc.val))
temp.val$Var1 <- seq.int(nrow(temp.val))
# Var2 = 0 for the raw values; Var2 is later renamed "scale".
structure.val$Var2 <- 0
gc.val$Var2 <- 0
temp.val$Var2 <- 0
# Raw values first, then the wavelet coefficients; full_join keeps rows found on either side.
score.coord.tempval <- full_join(score.df, temp.val, by=c("Var1"))
score.coord.tempval.gcval <- left_join(score.coord.tempval, gc.val, by=c("Var1", "Var2"))
score.coord.tempval.gcval.structure.val <- left_join(score.coord.tempval.gcval, structure.val, by=c("Var1", "Var2"))
score.temp <- full_join(score.coord.tempval.gcval.structure.val, wCoefs.temp.melt, by=c("Var1", "Var2"))
# score.df is joined a second time (by Var1 only); columns 6 onward are kept below.
score.temp2 <- full_join(score.temp, score.df, by="Var1")
score.temp.gc <- left_join(score.temp2, wCoefs.gc.melt, by=c("Var1", "Var2"))
score.temp.gc.structure <- left_join(score.temp.gc, wCoefs.structure.melt, by=c("Var1", "Var2"))
# Drop the first five columns and name the remaining 16 in join order.
df <- score.temp.gc.structure[,c(6:ncol(score.temp.gc.structure))]
colnames(df) <- c("window", "temp.raw", "sgRNA2", "scale", "gc.raw", "sgRNA3", "structure.raw", "sgRNA4", "temp", "chr", "start", "end", "sgRNA", "score", "gc", "structure")
# Written columns: window, temp.raw, sgRNA2, scale, gc.raw, structure.raw, temp,
# chr, start, end, sgRNA, score, gc, structure (sgRNA3 and sgRNA4 are left out).
write.table(df[,c(1:5,7,9:16)], "gRNA.score.temp.gc.structure.wavelet.txt", quote=F, row.names=F, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
data <- read.delim("gRNA.score.temp.gc.structure.wavelet.txt", header=T, sep="\t", stringsAsFactors = F)
# Selected columns: window, temp.raw, scale, gc.raw, structure.raw, temp, score, gc, structure.
df.melt <- melt(data[,c(1,2,4,5,6,7,12,13,14)], id=c("window", "score", "scale"))
df <- na.omit(df.melt)
# Feature name and scale are pasted together, then spread to one column each.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
write.table(df.dcast, "gRNA.score.temp.gc.structure.wavelet.dcast.txt", quote=F, row.names=F, sep="\t")
library(ranger)
# iRF: iteratively re-weighted random forest built on ranger::ranger().
#
# Round 1 gives every feature the same split-selection weight. After each
# round the weights are replaced by that forest's corrected-impurity
# importances, scaled by the sum of their absolute values, with negative
# values set to 0.
#
# Arguments:
#   xmat            feature table, one column per feature.
#   y               response; attached to xmat as column "Y".
#   ntree           number of trees per forest.
#   iter            maximum number of rounds.
#   classification  passed to ranger (FALSE = regression).
#   threads         passed to ranger as num.threads.
#   alwayssplits    passed to ranger as always.split.variables.
#   saveall         TRUE: return a list with the forest of every round run;
#                   FALSE: return only the forest of the last round run.
#
# Stops early once fewer than max(0.01 * (ncol(xmat) - 1), 10) features keep
# a positive weight. Progress is printed with cat()/print().
#
# Changes from the earlier version of this function:
#   * saveall = FALSE returned an empty list unless the early stop fired;
#     it now always returns the last forest.
#   * seq_len(iter) replaces 1:iter, so iter = 0 runs no rounds.
#   * all-zero importances no longer give NaN weights and an NA stop test.
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weighting per feature
  rfs <- list()
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # Half of the features that still carry weight. The value may be
    # fractional; the recorded runs in this notebook show ranger reporting
    # the truncated value (e.g. 64.5 -> "Mtry: 64").
    mtry = 0.5*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    imp <- rf$variable.importance
    imp.total <- sum(abs(imp))
    # Without any importance signal the division would yield NaN weights;
    # use all-zero weights instead so the stop test below ends the loop.
    wt <- if(isTRUE(imp.total > 0)) imp / imp.total else rep(0, length(imp))
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    # Keep every forest, or only the most recent one.
    if(saveall) rfs[[i]] <- rf else rfs <- rf
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  return(rfs)
}
# include sgRNA raw and wavelet values
# iRF(df.dcast[,3:ncol(df.dcast)], df.dcast$score)
# mtry: 34.5
# prediction error: 111.6837
# r^2: -0.009525993
# cor(y,yhat): 0.04323492
# SNPs with importance > 0: 67
# include sgRNA raw values only
# Long form keyed by window/score/scale -> one column per <feature><scale> -> iRF on score.
df.melt <- melt(data[, c(1, 2, 4, 5, 6, 12)],
                id = c("window", "score", "scale"))
df <- na.omit(df.melt)
df.id <- unite(df, feature.scale, c(variable, scale), sep = "")
df.dcast <- dcast(df.id, window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
iRF(xmat = df.dcast[, 3:ncol(df.dcast)], y = df.dcast[["score"]])
# mtry: 1.5
# prediction error: 111.1393
# r^2: -0.004604956
# cor(y,yhat): -0.01132501
# SNPs with importance > 0: 3
#
# Type: Regression
# Number of trees: 500
# Sample size: 40216
# Number of independent variables: 3
# Mtry: 1
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 111.1393
# R squared (OOB): -0.004604956
# include sgRNA wavelet values only
# Column layout of `data`, as written to gRNA.score.temp.gc.structure.wavelet.txt:
#   1 window, 2 temp.raw, 3 sgRNA2, 4 scale, 5 gc.raw, 6 structure.raw, 7 temp,
#   8 chr, 9 start, 10 end, 11 sgRNA, 12 score, 13 gc, 14 structure
# Fixed: the earlier selection c(5,6,8,11:13) picked gc.raw/structure.raw/chr/
# sgRNA/score/gc, i.e. it lacked the "window" and "scale" id columns that melt()
# is given and mixed raw and coordinate columns into a "wavelet only" run.
# The wavelet columns are temp (7), gc (13) and structure (14).
# NOTE(review): the recorded output below this block predates the fix — re-run.
df.melt <- melt(data[,c(1,4,7,12,13,14)], id=c("window", "score", "scale"))
df <- na.omit(df.melt)
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
iRF(df.dcast[,3:ncol(df.dcast)], df.dcast$score)
# mtry: 1.5
# prediction error: 111.2963
# r^2: -0.006024316
# cor(y,yhat): -0.002429047
# SNPs with importance > 0: 1
#
# Type: Regression
# Number of trees: 500
# Sample size: 40216
# Number of independent variables: 3
# Mtry: 1
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 111.2963
# R squared (OOB): -0.006024316
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Shell: load the python module, the conda helper and the "test" environment, then start R.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
###### add overlapping 500bp window patterns... gene density, fpkm, etc...
# Purpose: attach raw and wavelet-transformed 500 bp window features to the
# per-sgRNA table, via an interval intersection of sgRNAs with windows.
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
data <- read.delim("gRNA.score.temp.gc.structure.wavelet.txt", header=T, sep="\t", stringsAsFactors = F)
# Drop column 3 of the per-sgRNA table.
data <- data[,c(1,2,4:14)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gene <- read.delim("ecoli.gene.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.delim("500bp.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.delim("ecoli.500bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
# Rows of `gene` per (V1, V2, V3) key, one row per key.
# presumably column 14 is the gene.count column added by mutate() — TODO confirm
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
gene.df <- gene.count$gene.count
# Signals picked by column position.
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq[,4]
# 'mexh' CWT at scales 1, 3, ..., 99 (a different grid from the seq(1, 64, 3) used for the sgRNA tracks).
scales <- seq(1, 100, 2)
wCoefs.temp <- cwt(temp.df, scales=scales, wavelet='mexh')
wCoefs.gc <- cwt(gc.df, scales=scales, wavelet='mexh')
wCoefs.structure <- cwt(structure.df, scales=scales, wavelet='mexh')
wCoefs.rna <- cwt(rna.df, scales=scales, wavelet='mexh')
wCoefs.gene <- cwt(gene.df, scales=scales, wavelet='mexh')
wCoefs.temp.melt <- melt(wCoefs.temp)
wCoefs.gc.melt <- melt(wCoefs.gc)
wCoefs.structure.melt <- melt(wCoefs.structure)
wCoefs.rna.melt <- melt(wCoefs.rna)
wCoefs.gene.melt <- melt(wCoefs.gene)
colnames(window) <- c("chr", "start", "end")
window$Var1 <- seq.int(nrow(window))
colnames(score) <- c("chr", "start", "end", "score")
# Row counts printed for inspection.
nrow(data)
nrow(window)
# Var1 = row number in each window-level table (positional join key); Var2 = 0 for raw values.
# NOTE(review): this assumes every window-level table lists windows in the same order — confirm.
structure$Var1 <- seq.int(nrow(structure))
gene.count$Var1 <- seq.int(nrow(gene.count))
nuc$Var1 <- seq.int(nrow(nuc))
rnaseq$Var1 <- seq.int(nrow(rnaseq))
structure$Var2 <- 0
gene.count$Var2 <- 0
nuc$Var2 <- 0
rnaseq$Var2 <- 0
# The sgRNA table uses its own scale as Var2.
data$Var2 <- data$scale
library(tidygenomics)
window$Var1 <- seq.int(nrow(window))
score$Var1 <- seq.int(nrow(score))
# Interval intersection of sgRNA rows with windows on chr/start/end.
gRNA.window <- genome_intersect(data, window, by=c("chr", "start", "end"))
# Window score by window index, then raw window features matched on (Var1, Var2).
window.score <- left_join(gRNA.window[,c(1:6,8:13)], score, by=c("Var1"))
window.score1 <- left_join(window.score, nuc[,8:10], by=c("Var1", "Var2"))
window.score2 <- left_join(window.score1, nuc[,c(7,9:10)], by=c("Var1", "Var2"))
window.score3 <- left_join(window.score2, gene.count[,c(4:6)], by=c("Var1", "Var2"))
window.score4 <- left_join(window.score3, rnaseq[,c(4:6)], by=c("Var1", "Var2"))
window.score5 <- left_join(window.score4, structure[,c(2,22,23)], by=c("Var1", "Var2"))
# Wavelet coefficients of the window features; full_join keeps coefficient rows without an sgRNA match.
window.score.temp <- full_join(window.score5, wCoefs.temp.melt, by=c("Var1", "Var2"))
window.temp.gc <- left_join(window.score.temp, wCoefs.gc.melt, by=c("Var1", "Var2"))
window.temp.gc.structure <- left_join(window.temp.gc, wCoefs.structure.melt, by=c("Var1", "Var2"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, wCoefs.rna.melt, by=c("Var1", "Var2"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, wCoefs.gene.melt, by=c("Var1", "Var2"))
# 26 names assigned by position.
# NOTE(review): "window" is used for both the 1st and the 12th column — confirm
# which one the later steps (which refer to "window" by name) are meant to use.
colnames(window.temp.gc.structure.rna.gene) <- c("window", "sgRNA.temp.raw", "none", "sgRNA.gc.raw", "sgRNA.structure.raw", "sgRNA.temp.wave", "sgRNA", "score", "sgRNA.gc.wave", "sgRNA.structure.wave", "scale", "window", "chr", "start", "end", "score500", "temp500.raw", "gc500.raw", "genedensity500.raw", "fpkm500.raw", "structure500.raw", "temp500.wave", "gc500.wave", "genedensity500.wave", "fpkm500.wave", "structure500.wave")
write.table(window.temp.gc.structure.rna.gene, "gRNA.500bp.score.temp.gc.structure.wavelet.txt", quote=F, row.names=F, sep="\t")
data <- window.temp.gc.structure.rna.gene
library(reshape2)
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Re-reading the file replaces the in-memory `data` assigned just above.
data <- read.delim("gRNA.500bp.score.temp.gc.structure.wavelet.txt", header=T, sep="\t", stringsAsFactors = F)
library(ranger)
# iRF: iteratively re-weighted random forest built on ranger::ranger().
#
# Round 1 gives every feature the same split-selection weight. After each
# round the weights are replaced by that forest's corrected-impurity
# importances, scaled by the sum of their absolute values, with negative
# values set to 0.
#
# Arguments:
#   xmat            feature table, one column per feature.
#   y               response; attached to xmat as column "Y".
#   ntree           number of trees per forest.
#   iter            maximum number of rounds.
#   classification  passed to ranger (FALSE = regression).
#   threads         passed to ranger as num.threads.
#   alwayssplits    passed to ranger as always.split.variables.
#   saveall         TRUE: return a list with the forest of every round run;
#                   FALSE: return only the forest of the last round run.
#
# Stops early once fewer than max(0.01 * (ncol(xmat) - 1), 10) features keep
# a positive weight. Progress is printed with cat()/print().
#
# Changes from the earlier version of this function:
#   * saveall = FALSE returned an empty list unless the early stop fired;
#     it now always returns the last forest.
#   * seq_len(iter) replaces 1:iter, so iter = 0 runs no rounds.
#   * all-zero importances no longer give NaN weights and an NA stop test.
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weighting per feature
  rfs <- list()
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # Half of the features that still carry weight. The value may be
    # fractional; the recorded runs in this notebook show ranger reporting
    # the truncated value (e.g. 64.5 -> "Mtry: 64").
    mtry = 0.5*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    imp <- rf$variable.importance
    imp.total <- sum(abs(imp))
    # Without any importance signal the division would yield NaN weights;
    # use all-zero weights instead so the stop test below ends the loop.
    wt <- if(isTRUE(imp.total > 0)) imp / imp.total else rep(0, length(imp))
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    # Keep every forest, or only the most recent one.
    if(saveall) rfs[[i]] <- rf else rfs <- rf
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  return(rfs)
}
# predicting 500bp window score average
# Long form keyed by window/score500/scale -> one column per <feature><scale> -> iRF on score500.
df.melt <- melt(data[, c(2, 4:6, 9:12, 16:ncol(data))],
                id = c("window", "score500", "scale"))
df <- na.omit(df.melt)
df.id <- unite(df, feature.scale, c(variable, scale), sep = "")
df.dcast <- dcast(df.id, window + score500 ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
iRF(xmat = df.dcast[, 3:ncol(df.dcast)], y = df.dcast[["score500"]])
# iRF iteration 1
# =================
# mtry: 64.5
# prediction error: 44.0659
# r^2: -0.07877568
# cor(y,yhat): 0.0622196
# SNPs with importance > 0: 51
# Type: Regression
# Number of trees: 500
# Sample size: 7544
# Number of independent variables: 129
# Mtry: 64
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 44.0659
# R squared (OOB): -0.07877568
# iRF iteration 2
# =================
# mtry: 25.5
# prediction error: 45.72915
# r^2: -0.1194938
# cor(y,yhat): -0.002229165
# SNPs with importance > 0: 3
# Type: Regression
# Number of trees: 500
# Sample size: 7544
# Number of independent variables: 129
# Mtry: 25
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 45.72915
# R squared (OOB): -0.1194938
# predicting sgRNA score
# Long form keyed by sgRNA/score/scale -> one column per <feature><scale> -> iRF on score.
df.melt <- melt(data[, c(2, 4:11, 17:ncol(data))],
                id = c("sgRNA", "score", "scale"))
df <- na.omit(df.melt)
df.id <- unite(df, feature.scale, c(variable, scale), sep = "")
df.dcast <- dcast(df.id, sgRNA + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
iRF(xmat = df.dcast[, 3:ncol(df.dcast)], y = df.dcast[["score"]])
# iRF iteration 1
# =================
# mtry: 64.5
# prediction error: 110.2542
# r^2: 0.003393309
# cor(y,yhat): 0.07424564
# SNPs with importance > 0: 31
# Type: Regression
# Number of trees: 500
# Sample size: 40196
# Number of independent variables: 129
# Mtry: 64
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 110.2542
# R squared (OOB): 0.003393309
# iRF iteration 2
# =================
# mtry: 15.5
# prediction error: 109.6506
# r^2: 0.008848719
# cor(y,yhat): 0.09439722
# SNPs with importance > 0: 23
# iRF iteration 3
# =================
# mtry: 11.5
# prediction error: 109.7116
# r^2: 0.008297279
# cor(y,yhat): 0.0936397
# SNPs with importance > 0: 16
#### add one-hot encoded sequence data
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(reshape2)
library(tidyr)
library(dplyr)
# Combined sgRNA + 500 bp window feature table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
data <- read.delim(file = "gRNA.500bp.score.temp.gc.structure.wavelet.txt",
                   header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# Four one-hot tables, merged pairwise on sgRNAID.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
onehot.ind1 <- read.delim(file = "DataS1_independent1.txt")
onehot.ind2 <- read.delim(file = "DataS1_independent2.txt")
onehot.dep1 <- read.delim(file = "DataS1_dependent1.txt")
onehot.dep2 <- read.delim(file = "DataS1_dependent2.txt")
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by = "sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by = "sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by = "sgRNAID")
# Keep rows 2..n, rename the five columns, and tag the rows with scale 0.
onehot <- onehot[2:nrow(onehot), ]
colnames(onehot) <- c("sgRNA", "onehot.ind1", "onehot.ind2", "onehot.dep1", "onehot.dep2")
onehot$scale <- 0
# Attach the one-hot columns on (sgRNA, scale), then reshape as for the other iRF runs.
data.df <- data[, c(2, 4:11, 17:ncol(data))]
data.onehot <- left_join(data.df, onehot, by = c("sgRNA", "scale"))
df.melt <- melt(data.onehot, id = c("sgRNA", "score", "scale"))
df <- na.omit(df.melt)
df.id <- unite(df, feature.scale, c(variable, scale), sep = "")
#df.dcast <- df.id %>% dcast(sgRNA + score ~ feature.scale, value.var = "value")
# Cells with several values are averaged.
df.dcast <- dcast(df.id, sgRNA + score ~ feature.scale,
                  value.var = "value", fun.aggregate = mean)
df.dcast <- na.omit(df.dcast)
iRF(xmat = df.dcast[, 3:ncol(df.dcast)], y = df.dcast[["score"]])
# mtry: 66.5
# prediction error: 110.5902
# r^2: 0.01852922
# cor(y,yhat): 0.143547
# SNPs with importance > 0: 65
library(randomForest)
# Single randomForest fit on the sgRNA score with importance=TRUE; the six
# variables with the largest %IncMSE are printed at the end.
set.seed(131)
# NOTE(review): data=df.dcast[-2] drops only the score column, so the sgRNA
# identifier stays in as a predictor (it appears in the recorded importance
# table below this block) — confirm this is intended.
rf <- randomForest(df.dcast$score ~ ., data=df.dcast[-2], mtry=ncol(df.dcast)-2, importance=TRUE, ntree=500)
print(rf)
# Call:
# randomForest(formula = df.dcast$score ~ ., data = df.dcast[-2], mtry = ncol(df.dcast) - 2, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 133
#
# Mean of squared residuals: 110.9958
# % Var explained: 1.49
imp <- data.frame(importance(rf))
# Fixed: order(imp) was applied to the whole two-column data frame, which is
# deprecated in current R and yields row indices beyond nrow(imp) (the NA rows
# that na.omit() then had to strip). Order explicitly by the %IncMSE column,
# which data.frame() names X.IncMSE.
imp.order <- imp[order(imp$X.IncMSE),]
tail(na.omit(imp.order))
# X.IncMSE IncNodePurity
# genedensity500.wave7 18.74117 35117.52
# sgRNA 19.36808 56148.30
# onehot.ind20 20.20094 92810.29
# gc500.raw0 23.72218 33155.23
# temp500.raw0 24.00947 31961.01
# onehot.ind10 24.56806 99529.89
# Same one-hot augmented table, this time keyed by window and predicting score500.
data.df <- data[, c(1, 2, 4:6, 7, 9:11, 16:ncol(data))]
data.onehot <- left_join(data.df, onehot, by = c("sgRNA", "scale"))
# Column 6 of the joined table is left out before melting.
df.melt <- melt(data.onehot[, c(1:5, 7:ncol(data.onehot))],
                id = c("window", "score500", "scale"))
df <- na.omit(df.melt)
df.id <- unite(df, feature.scale, c(variable, scale), sep = "")
# Cells with several values are averaged.
df.dcast <- dcast(df.id, window + score500 ~ feature.scale,
                  value.var = "value", fun.aggregate = mean)
df.dcast <- na.omit(df.dcast)
iRF(xmat = df.dcast[, 3:ncol(df.dcast)], y = df.dcast[["score500"]])
# mtry: 66.5
# prediction error: 1.685719
# r^2: 0.9566772
# cor(y,yhat): 0.9816768
# SNPs with importance > 0: 127
library(randomForest)
# Single randomForest fit on the 500 bp window score with importance=TRUE; the
# six variables with the largest %IncMSE are printed at the end.
set.seed(131)
# NOTE(review): data=df.dcast[-2] drops only score500, so the first (window)
# column stays in as a predictor — confirm this is intended.
rf <- randomForest(df.dcast$score500 ~ ., data=df.dcast[-2], mtry=ncol(df.dcast)-2, importance=TRUE, ntree=500)
print(rf)
imp <- data.frame(importance(rf))
# Fixed: order(imp) was applied to the whole two-column data frame, which is
# deprecated in current R and yields row indices beyond nrow(imp) (NA rows).
# Order explicitly by the %IncMSE column, which data.frame() names X.IncMSE.
imp.order <- imp[order(imp$X.IncMSE),]
tail(na.omit(imp.order))
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Wide feature table; columns 3 onward are used as predictors and `score` as the response.
df <- read.delim("ecoli.features.500bp.wavelet.raw.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# iRF: iteratively re-weighted random forest built on ranger::ranger().
#
# Round 1 gives every feature the same split-selection weight. After each
# round the weights are replaced by that forest's corrected-impurity
# importances, scaled by the sum of their absolute values, with negative
# values set to 0.
#
# Arguments:
#   xmat            feature table, one column per feature.
#   y               response; attached to xmat as column "Y".
#   ntree           number of trees per forest.
#   iter            maximum number of rounds.
#   classification  passed to ranger (FALSE = regression).
#   threads         passed to ranger as num.threads.
#   alwayssplits    passed to ranger as always.split.variables.
#   saveall         TRUE: return a list with the forest of every round run;
#                   FALSE: return only the forest of the last round run.
#
# Stops early once fewer than max(0.01 * (ncol(xmat) - 1), 10) features keep
# a positive weight. Progress is printed with cat()/print().
#
# Changes from the earlier version of this function:
#   * saveall = FALSE returned an empty list unless the early stop fired;
#     it now always returns the last forest.
#   * seq_len(iter) replaces 1:iter, so iter = 0 runs no rounds.
#   * all-zero importances no longer give NaN weights and an NA stop test.
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weighting per feature
  rfs <- list()
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # Half of the features that still carry weight. The value may be
    # fractional; the recorded runs in this notebook show ranger reporting
    # the truncated value (e.g. 127.5 -> "Mtry: 127").
    mtry = 0.5*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    imp <- rf$variable.importance
    imp.total <- sum(abs(imp))
    # Without any importance signal the division would yield NaN weights;
    # use all-zero weights instead so the stop test below ends the loop.
    wt <- if(isTRUE(imp.total > 0)) imp / imp.total else rep(0, length(imp))
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    # Keep every forest, or only the most recent one.
    if(saveall) rfs[[i]] <- rf else rfs <- rf
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  return(rfs)
}
iRF(df[,3:ncol(df)], df$score)
### mtry = 0.1*sum(wt>0)
# iRF iteration 1
# =================
# mtry: 25.5
# prediction error: 36.64929
# r^2: 0.0905612
# cor(y,yhat): 0.3014373
# SNPs with importance > 0: 234
#
# iRF iteration 2
# =================
# mtry: 23.4
# prediction error: 36.86941
# r^2: 0.08509914
# cor(y,yhat): 0.291807
# SNPs with importance > 0: 196
#
# iRF iteration 3
# =================
# mtry: 19.6
# prediction error: 36.88579
# r^2: 0.08469255
# cor(y,yhat): 0.2939837
# SNPs with importance > 0: 173
#
# iRF iteration 4
# =================
# mtry: 17.3
# prediction error: 37.03742
# r^2: 0.08092997
# cor(y,yhat): 0.2898007
# SNPs with importance > 0: 152
#
# iRF iteration 5
# =================
# mtry: 15.2
# prediction error: 36.82996
# r^2: 0.08607792
# cor(y,yhat): 0.2945025
# SNPs with importance > 0: 135
### mtry = 0.5*sum(wt>0)
# iRF iteration 1
# =================
# mtry: 127.5
# prediction error: 36.50046
# r^2: 0.09425443
# cor(y,yhat): 0.3067766
# SNPs with importance > 0: 218
#
# iRF iteration 2
# =================
# mtry: 109
# prediction error: 36.50627
# r^2: 0.09411021
# cor(y,yhat): 0.3090887
# SNPs with importance > 0: 183
#
# iRF iteration 3
# =================
# mtry: 91.5
# prediction error: 36.70472
# r^2: 0.08918578
# cor(y,yhat): 0.2984714
# SNPs with importance > 0: 153
#
# iRF iteration 4
# =================
# mtry: 76.5
# prediction error: 36.87442
# r^2: 0.08497478
# cor(y,yhat): 0.2916071
# SNPs with importance > 0: 143
#
# iRF iteration 5
# =================
# mtry: 71.5
# prediction error: 36.52422
# r^2: 0.09366491
# cor(y,yhat): 0.305952
# SNPs with importance > 0: 123
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 255
# Mtry: 127
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 36.50046
# R squared (OOB): 0.09425443
# Reference for ssWavelets / MODWT: https://www.fs.fed.us/nrs/pubs/jrnl/2017/nrs_2017_gove_003.pdf
# Example calls noted alongside it:
#   mocp.8 = ssMODWT(sscp.8, J = hfs.J)
#   cp.hfs = hfsMODWT(mocp.8, mocp.10, mocp.12, ids=c('cp8','cp10','cp12'), long = TRUE)
#   waveslim::wave.filter('haar')
#   showClass('ssWavelet')
# Work-in-progress: discrete wavelet transform (DWT) of the 500 bp window
# tracks with waveslim / ssWavelets. Several steps below reference objects
# that are not created in this section; see the NOTE(review) comments.
#module load python
#source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
#conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Shell: this section uses the separate "renv" conda environment.
module load python/3.7-anaconda3
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
#conda create -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/renv
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/renv
#conda install -c conda-forge r-essentials r-rgdal
# R
library(dplyr)
library(reshape2)
library(waveslim)
library(ssWavelets)
### still having issues with install of ssWavelet... dependencies issues with rgdal?? --> use conda installs instead of install.package within R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gene <- read.delim("ecoli.gene.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.delim("500bp.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.delim("ecoli.500bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
# Rows of `gene` per (V1, V2, V3) key, one row per key; signals picked by column position.
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
gene.df <- gene.count$gene.count
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq[,4]
# Run DWT instead of CWT
library(waveslim)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
temp.df <- nuc[,8]
#temp.dwt <- dwt(temp.df, wf="la8", n.levels=4, boundary="periodic")
temp.dwt <- dwt.nondyadic(temp.df)
str(temp.dwt)
#temp.dwt.df <- data.frame(temp.dwt=temp.dwt$d1)
# Name the five components, then rotate each one: its first 2 (d1, d2, s4) or
# 3 (d3, d4) coefficients are moved to the end.
names(temp.dwt) <- c("d1", "d2", "d3", "d4", "s4")
temp.dwt$d1 <- c(temp.dwt$d1[-c(1:2)], temp.dwt$d1[1:2])
temp.dwt$d2 <- c(temp.dwt$d2[-c(1:2)], temp.dwt$d2[1:2])
for(i in names(temp.dwt)[3:4])
temp.dwt[[i]] <- c(temp.dwt[[i]][-c(1:3)], temp.dwt[[i]][1:3])
temp.dwt$s4 <- c(temp.dwt$s4[-c(1:2)], temp.dwt$s4[1:2])
# NOTE(review): the sampSurf lines below look like a pasted console example — the
# leading "+" continuation prompts are not valid when run as a script, and
# mocp.8/mocp.10/mocp.12 are not defined anywhere in this section. Confirm
# whether they should be kept.
require(sampSurf)
tract.m = Tract(c(x = 71, y = 71), cellSize = 0.5) #meters
buffTract.m = bufferedTract(bufferWidth = 10, tract.m)
sausageLen.ss = sampSurf(20, tract = buffTract.m, iZone = 'sausageIZ',
+ plotRadius = 3, estimate = 'Length',
+ buttDiams = c(20, 50), logLens = c(2, 8))
# This overwrites the dwt.nondyadic() result stored in temp.dwt above.
temp.dwt = ssMODWT(temp.df)
cp.hfs = hfsMODWT(mocp.8, mocp.10, mocp.12, ids=c('cp8','cp10','cp12'), long = TRUE)
# For the other tracks only the d1 component is kept, as a one-column data frame.
gc.dwt <- dwt.nondyadic(gc.df)
gc.dwt.df <- data.frame(gc.dwt=gc.dwt$d1)
structure.dwt <- dwt.nondyadic(structure.df)
structure.dwt.df <- data.frame(structure.dwt=structure.dwt$d1)
rna.dwt <- dwt.nondyadic(rna.df)
rna.dwt.df <- data.frame(rna.dwt=rna.dwt$d1)
gene.dwt <- dwt.nondyadic(gene.df)
gene.dwt.df <- data.frame(gene.dwt=gene.dwt$d1)
# NOTE(review): temp.dwt.df is only created in a commented-out line above, so
# the next statement has nothing to index unless it exists from another session.
temp.dwt.df$Var1 <- seq.int(nrow(temp.dwt.df))
gc.dwt.df$Var1 <- seq.int(nrow(gc.dwt.df))
structure.dwt.df$Var1 <- seq.int(nrow(structure.dwt.df))
rna.dwt.df$Var1 <- seq.int(nrow(rna.dwt.df))
gene.dwt.df$Var1 <- seq.int(nrow(gene.dwt.df))
colnames(window) <- c("chr", "start", "end")
window$Var1 <- seq.int(nrow(window))
colnames(score) <- c("chr", "start", "end", "score")
window.score <- left_join(window, score, by=c("chr", "start", "end"))
# NOTE(review): the joins below use the CWT tables (wCoefs.*.melt), which this
# section never creates, instead of the *.dwt.df tables built above; the output
# file name says "dwt". Confirm which tables were intended.
window.score.temp <- left_join(window.score, wCoefs.temp.melt, by="Var1")
window.temp.gc <- left_join(window.score.temp, wCoefs.gc.melt, by=c("Var1"))
window.temp.gc.structure <- left_join(window.temp.gc, wCoefs.structure.melt, by=c("Var1"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, wCoefs.rna.melt, by=c("Var1"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, wCoefs.gene.melt, by=c("Var1"))
colnames(window.temp.gc.structure.rna.gene) <- c("chr", "start", "end", "window", "score", "scale", "melting.temp", "gc.content", "structure", "fpkm", "gene.density")
# Long form of columns 4-11, then one column per <feature><scale>.
df.melt <- melt(window.temp.gc.structure.rna.gene[,4:11], id=c("window", "score", "scale"))
df <- na.omit(df.melt)
library(tidyr)
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
write.table(df.dcast, "ecoli.features.500bp.wavelet.dcast.dwt.txt", quote=F, row.names=F, sep="\t")
# wmtsa source archive on CRAN: https://cran.r-project.org/src/contrib/Archive/wmtsa/
# wavDWT manual page: https://rdrr.io/cran/wmtsa/man/wavDWT.html
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4927172/ - use MODWT (but this may add redundancy again, which is an issue for RF?)
# Environment for the R session below: Python module, conda bootstrap, shared conda env.
module load python
. "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh"
conda activate "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test"
#require(devtools)
#install_version("wmtsa", version = "2.0-3", repos = "http://cran.us.r-project.org")
# wavDWT(x, n.levels=ilogb(length(x), base=2), wavelet="s8", position=list(from=1,by=1,units=character()), units=character(), title.data=character(), documentation=character(), keep.series=FALSE)
library(dplyr)
library(reshape2)
library(wmtsa)
# Load the per-window E. coli input tables from the project directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gene      <- read.delim("ecoli.gene.windows500.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
structure <- read.delim("500bp.protein.structure.fa", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
nuc       <- read.delim("nucleotide_counts_500bp_temp.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
rnaseq    <- read.delim("ecoli.rnaseq.average.windows500.bed", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
window    <- read.delim("ecoli.500bp.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
score     <- read.delim("window500.score.avg.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Count the rows of the gene table per (V1, V2, V3) group, then keep one row per group.
# Column 14 is presumably the gene.count column appended by mutate() - TODO confirm the bed file has 13 columns.
gene.bin   <- mutate(group_by(gene, V1, V2, V3), gene.count = n())
gene.count <- unique(gene.bin[, c(1:3, 14)])

# One series per feature, selected by column position in the source tables.
gene.df      <- gene.count[["gene.count"]]
structure.df <- structure[[2]]
gc.df        <- nuc[[7]]
temp.df      <- nuc[[8]]
rna.df       <- rnaseq[[4]]
# Run DWT instead of CWT
# wavDWT is called with its defaults on temp.df (column 8 of nuc); the printout recorded
# from that call is kept in the comments that follow.
temp.dwt <- wavDWT(temp.df)
# Discrete Wavelet Transform of temp.df
# -------------------------------------
# Wavelet : s8
# Length of series : 9284
# Number of levels : 13
# Boundary correction rule : periodic
# Filtering technique : convolution
# Zero phase shifted : FALSE
# Crystals : d1 d2 d3 d4 d5 d6
# Per-crystal summary statistics; the recorded table is kept in the comments below.
summary(temp.dwt)
# Min 1Q Median 3Q Max Mean SD
# d1 -7.395 -0.812 0.051 0.863 7.426 -0.010 1.416
# d2 -9.480 -1.039 0.097 1.196 10.384 0.005 1.938
# d3 -11.799 -1.522 0.071 1.619 11.887 0.041 2.572
# d4 -16.330 -1.768 0.000 1.899 12.001 -0.012 3.296
# d5 -18.052 -2.481 -0.042 2.313 14.081 -0.071 4.419
# d6 -19.204 -3.064 0.153 2.421 10.164 -0.433 4.485
# d7 -16.373 -1.343 1.477 5.057 17.262 1.518 6.083
# d8 -18.526 -5.106 -1.810 2.385 6.939 -2.167 6.184
# d9 -14.452 -4.618 0.051 2.665 5.485 -1.397 5.157
# d10 -7.465 -5.397 0.814 4.485 5.404 -0.171 4.968
# d11 -12.499 -3.274 1.827 4.258 5.472 -0.843 8.128
# d12 -16.572 -9.904 -3.236 3.432 10.100 -3.236 18.859
# d13 -14.211 -14.211 -14.211 -14.211 -14.211 -14.211 NA
# s13 7637.517 7637.517 7637.517 7637.517 7637.517 7637.517 NA
# extra 171.152 418.693 666.234 1683.607 2700.981 1179.455 1340.730
# Var MAD Energy %
# d1 2.005 1.236 0.014
# d2 3.755 1.670 0.013
# d3 6.615 2.321 0.012
# d4 10.865 2.715 0.010
# d5 19.531 3.508 0.009
# d6 20.118 3.984 0.004
# d7 36.998 5.077 0.004
# d8 38.240 5.651 0.002
# d9 26.593 4.882 0.001
# d10 24.677 6.299 0.000
# d11 66.066 4.204 0.000
# d12 355.678 19.771 0.001
# d13 NA 0.000 0.000
# s13 NA 0.000 88.186
# extra 1797556.762 734.009 11.744
#
# Energy Distribution:
# 1st 1% 2% 3% 4% 5% 10% 15%
# Energy % 88.186 99.947 99.954 99.958 99.962 99.965 99.975 99.982
# |coeffs| 7637.517 7.760 6.136 5.397 4.893 4.477 3.345 2.719
# #coeffs 1.000 93.000 186.000 279.000 372.000 465.000 929.000 1393.000
# 20% 25%
# Energy % 99.986 99.990
# |coeffs| 2.343 2.016
# #coeffs 1857.000 2321.000
# MODWT (wavMODWT) of each per-window series, reshaped to long form.
# For every feature: transform -> matrix -> data frame carrying the matrix row names as
# `label` -> split `label` at its non-alphanumeric separator into two parts -> rename the
# three columns to scale / window / <feature>.dwt.
#
# separate() is a tidyr function; this session had only loaded dplyr, reshape2 and wmtsa
# at this point, so tidyr is attached here before its first use.
# NOTE(review): the value columns keep the ".dwt" suffix although they hold MODWT
# coefficients; the feature names written to the output files depend on this suffix.
library(tidyr)
temp.modwt <- wavMODWT(temp.df)
temp.modwt.df <- as.matrix(temp.modwt)
temp.modwt.label <- data.frame(label = row.names(temp.modwt.df), temp.modwt.df)
temp.modwt.name <- temp.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(temp.modwt.name) <- c("scale", "window", "temp.dwt")
gc.modwt <- wavMODWT(gc.df)
gc.modwt.df <- as.matrix(gc.modwt)
gc.modwt.label <- data.frame(label = row.names(gc.modwt.df), gc.modwt.df)
gc.modwt.name <- gc.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gc.modwt.name) <- c("scale", "window", "gc.dwt")
structure.modwt <- wavMODWT(structure.df)
structure.modwt.df <- as.matrix(structure.modwt)
structure.modwt.label <- data.frame(label = row.names(structure.modwt.df), structure.modwt.df)
structure.modwt.name <- structure.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(structure.modwt.name) <- c("scale", "window", "structure.dwt")
rna.modwt <- wavMODWT(rna.df)
rna.modwt.df <- as.matrix(rna.modwt)
rna.modwt.label <- data.frame(label = row.names(rna.modwt.df), rna.modwt.df)
rna.modwt.name <- rna.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(rna.modwt.name) <- c("scale", "window", "rna.dwt")
gene.modwt <- wavMODWT(gene.df)
gene.modwt.df <- as.matrix(gene.modwt)
gene.modwt.label <- data.frame(label = row.names(gene.modwt.df), gene.modwt.df)
gene.modwt.name <- gene.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gene.modwt.name) <- c("scale", "window", "gene.dwt")
# Join window coordinates, scores and the five long MODWT tables, then write the
# long (melted) and wide (dcast) feature tables.
colnames(window) <- c("chr", "start", "end")
# Zero-based window index stored as character, used as the join key against the "window"
# column of the MODWT tables. Integer arithmetic keeps large indices from being formatted
# in scientific notation by as.character().
window$window <- as.character(seq_len(nrow(window)) - 1L)
colnames(score) <- c("chr", "start", "end", "score")
window.score <- left_join(window, score, by=c("chr", "start", "end"))
# The first join brings in the scale column; the remaining joins match on window + scale.
window.score.temp <- left_join(window.score, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name, by=c("window", "scale"))
# The joined table used to be written to "ecoli.features.500bp.DWT.txt" here, but the same
# path is overwritten with the melted table a few lines below, so that write was dropped.
# Columns 4:11 = window, score, scale + the five feature columns (drops chr/start/end).
df.melt <- melt(window.temp.gc.structure.rna.gene[,4:11], id=c("window", "score", "scale"))
df <- na.omit(df.melt)
write.table(df, "ecoli.features.500bp.DWT.txt", quote=F, row.names=F, sep="\t")
library(tidyr)
# One column per feature x scale combination (e.g. variable "gc.dwt" + scale "d12" -> "gc.dwtd12").
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
write.table(df.dcast, "ecoli.features.500bp.DWT.dcast.txt", quote=F, row.names=F, sep="\t")
# Random forest regression of score on the wide MODWT feature table.
library(randomForest)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.features.500bp.DWT.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
set.seed(131)
# df[-1] drops the window column; mtry = every remaining predictor (ncol minus window and score).
rf <- randomForest(score ~ ., data=df[-1], mtry=ncol(df) - 2, importance=TRUE, ntree=500)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = df[-1], mtry = ncol(df) - 2, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 68
#
# Mean of squared residuals: 37.48483
# % Var explained: 6.96
imp <- data.frame(importance(rf))
# Sort by the permutation importance column explicitly. order() on the whole data frame
# ranks every cell of both columns together, which yields row indices past nrow(imp)
# (the NA rows na.omit() used to hide) and only sorts correctly while every X.IncMSE
# value is smaller than every IncNodePurity value.
imp.order <- imp[order(imp$X.IncMSE),]
tail(imp.order)
# X.IncMSE IncNodePurity
# rna.dwtd6 17.74768 3992.877
# structure.dwtd6 17.80892 4250.176
# gene.dwtd8 18.84431 3279.767
# structure.dwtd5 19.07937 4882.358
# gene.dwtd6 20.34434 4670.066
# gene.dwtd7 20.39398 3920.976
#### add raw data
# Append the untransformed ("raw") per-window values to the long MODWT table and rebuild
# the wide feature table.
# Each source table gets a zero-based character window index taken from its row number.
# NOTE(review): this assumes every table has one row per window in window order; confirm
# for gene.count, which has one row per distinct (V1, V2, V3) group of the gene table.
structure$window <- as.character(seq.int(nrow(structure)) - 1)
gene.count$window <- as.character(seq.int(nrow(gene.count)) - 1)
nuc$window <- as.character(seq.int(nrow(nuc)) - 1)
rnaseq$window <- as.character(seq.int(nrow(rnaseq)) - 1)
# Tag raw values with scale "raw" so they share the (window, scale) key layout.
structure$scale <- "raw"
gene.count$scale <- "raw"
nuc$scale <- "raw"
rnaseq$scale <- "raw"
# Columns are picked by position: value column + the window/scale columns added above.
window.score.temp <- left_join(window.score, nuc[,8:10], by="window")
window.temp.gc <- left_join(window.score.temp, nuc[,c(7,9:10)], by=c("window", "scale"))
window.temp.gc.gene <- left_join(window.temp.gc, gene.count[,c(4:6)], by=c("window", "scale"))
window.temp.gc.gene.rna <- left_join(window.temp.gc.gene, rnaseq[,c(4:6)], by=c("window", "scale"))
window.temp.gc.gene.rna.structure <- left_join(window.temp.gc.gene.rna, structure[,c(2,22,23)], by=c("window", "scale"))
colnames(window.temp.gc.gene.rna.structure) <- c("chr", "start", "end", "window", "score", "temp", "scale", "gc", "gene", "rna", "structure")
df2.melt <- melt(window.temp.gc.gene.rna.structure[,4:11], id=c("window", "score", "scale"))
df2 <- na.omit(df2.melt)
# `df` no longer holds the long MODWT table here: it was overwritten with the wide dcast
# table when the random forest input was read back. Reload the long table from the file
# it was written to so that rbind() combines two tables of the same shape.
df.long <- read.delim("ecoli.features.500bp.DWT.txt", header=T, sep="\t", stringsAsFactors = F)
df3 <- rbind(df.long, df2)
library(tidyr)
df.id <- df3 %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(window + score ~ feature.scale, value.var = "value")
df.dcast <- na.omit(df.dcast)
write.table(df.dcast, "ecoli.features.500bp.DWT.raw.dcast.txt", quote=F, row.names=F, sep="\t")
library(ranger)
# Read the wide MODWT + raw feature table back in for the ranger-based runs below.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim(
  "ecoli.features.500bp.DWT.raw.dcast.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
# Iterative random forest built on ranger.
# Every round fits a forest on cbind(xmat, Y = y); the corrected-impurity importances of
# that forest, scaled by the sum of their absolute values and clipped at zero, become the
# split.select.weights of the next round. The loop stops after `iter` rounds or as soon
# as fewer than max(0.01 * (ncol(xmat) - 1), 10) features keep a positive weight.
#
# Arguments:
#   xmat            data frame of predictors, one column per feature
#   y               response vector (bound to xmat as column "Y")
#   ntree           trees per forest (ranger num.trees)
#   iter            maximum number of rounds
#   classification  passed to ranger; also selects r^2 vs. confusion-matrix logging
#   threads         passed to ranger as num.threads
#   alwayssplits    passed to ranger as always.split.variables
#   saveall         TRUE: return a list with the forest of every round;
#                   FALSE: return only the forest of the last round that ran
# Returns: see `saveall` (an empty list / NULL when no round ran).
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weight for every feature
  rfs <- list()
  rf <- NULL
  # seq_len() runs zero rounds for iter = 0 (1:iter would count 1, 0).
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # Half of the features that still carry weight, as a whole number of at least 1.
    # ranger takes an integer mtry; the recorded runs show 36.5 being used as 36.
    mtry <- max(1, floor(0.5*sum(wt>0)))
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    imp <- rf$variable.importance
    total <- sum(abs(imp))
    # Scale importances so their absolute values sum to 1; all-zero importances give all-zero weights.
    wt <- if(isTRUE(total > 0)) imp / total else rep(0, length(imp))
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    # cor() needs numeric input; skip the line for factor responses/predictions.
    if(is.numeric(y) && is.numeric(rf$predictions)) cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if(saveall) rfs[[i]] <- rf
    # Too few informative features left to continue.
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  # With saveall = FALSE the last forest is returned whether or not the loop stopped early;
  # previously it was only returned on the early break and an empty list otherwise.
  if(!saveall) return(rf)
  return(rfs)
}
# Run iRF with its defaults on columns 3..ncol(df) as predictors and df$score as response;
# the console output recorded from this call is kept in the comments that follow.
iRF(df[,3:ncol(df)], df$score)
# mtry: 36.5
# prediction error: 36.51254
# r^2: 0.0939546
# cor(y,yhat): 0.3068121
# SNPs with importance > 0: 52
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 73
# Mtry: 36
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 36.51254
# R squared (OOB): 0.0939546
# mtry: 26
# prediction error: 36.39723
# r^2: 0.09681604
# cor(y,yhat): 0.3114234
# SNPs with importance > 0: 39
# mtry: 19.5
# prediction error: 36.27406
# r^2: 0.09987251
# cor(y,yhat): 0.3178869
# SNPs with importance > 0: 32
# mtry: 16
# prediction error: 36.42674
# r^2: 0.09608373
# cor(y,yhat): 0.3141778
# SNPs with importance > 0: 29
# mtry: 14.5
# prediction error: 36.57779
# r^2: 0.09233562
# cor(y,yhat): 0.3086102
# SNPs with importance > 0: 27
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 73
# Mtry: 14
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 36.57779
# R squared (OOB): 0.09233562
# Random forest regression of score on the MODWT + raw feature table.
set.seed(131)
# df[-1] drops the window column; the response is named through the data argument, in
# the same form as the MODWT-only fit. mtry = every remaining predictor.
rf <- randomForest(score ~ ., data=df[-1], mtry=ncol(df) - 2, importance=TRUE, ntree=500)
print(rf)
# Recorded output (the call was written as df$score ~ . at the time):
# Call:
# randomForest(formula = df$score ~ ., data = df[-1], mtry = ncol(df) - 2, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 73
#
# Mean of squared residuals: 36.20158
# % Var explained: 10.15
imp <- data.frame(importance(rf))
# Sort by the permutation importance column explicitly; order() on the whole data frame
# ranks the cells of both columns together and produces out-of-range (NA) rows.
imp.order <- imp[order(imp$X.IncMSE),]
tail(imp.order)
# X.IncMSE IncNodePurity
# gc.dwtd12 15.46798 2341.063
# gene.dwtd6 16.71434 4060.381
# gene.dwtd7 17.03512 3537.681
# gene.dwtd8 17.99514 2831.067
# tempraw 19.78707 7880.367
# gcraw 19.80757 7959.672
# Environment for the R session below: Python module, conda bootstrap, shared conda env.
module load python
. "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh"
conda activate "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test"
library(dplyr)
library(reshape2)
library(wmtsa)
library(randomForest)
# Fresh session: reload the long MODWT table and rebuild the raw-feature join.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.features.500bp.DWT.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)

#### add raw data
gene      <- read.delim("ecoli.gene.windows500.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
structure <- read.delim("500bp.protein.structure.fa", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
nuc       <- read.delim("nucleotide_counts_500bp_temp.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
rnaseq    <- read.delim("ecoli.rnaseq.average.windows500.bed", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
window    <- read.delim("ecoli.500bp.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
score     <- read.delim("window500.score.avg.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Rows of the gene table per (V1, V2, V3) group, one row per group.
gene.bin   <- mutate(group_by(gene, V1, V2, V3), gene.count = n())
gene.count <- unique(gene.bin[, c(1:3, 14)])

# Zero-based character window index from the row number of each table.
structure$window  <- as.character(seq.int(nrow(structure)) - 1)
gene.count$window <- as.character(seq.int(nrow(gene.count)) - 1)
nuc$window        <- as.character(seq.int(nrow(nuc)) - 1)
rnaseq$window     <- as.character(seq.int(nrow(rnaseq)) - 1)

# Untransformed values are tagged with scale "raw".
structure$scale  <- "raw"
gene.count$scale <- "raw"
nuc$scale        <- "raw"
rnaseq$scale     <- "raw"

colnames(window) <- c("chr", "start", "end")
window$window    <- as.character(seq.int(nrow(window)) - 1)
colnames(score)  <- c("chr", "start", "end", "score")

# Chain of left joins; each step adds one value column selected by position.
window.score <- window %>% left_join(score, by = c("chr", "start", "end"))
window.score.temp <- window.score %>% left_join(nuc[, 8:10], by = "window")
window.temp.gc <- window.score.temp %>% left_join(nuc[, c(7, 9:10)], by = c("window", "scale"))
window.temp.gc.gene <- window.temp.gc %>% left_join(gene.count[, c(4:6)], by = c("window", "scale"))
window.temp.gc.gene.rna <- window.temp.gc.gene %>% left_join(rnaseq[, c(4:6)], by = c("window", "scale"))
window.temp.gc.gene.rna.structure <- window.temp.gc.gene.rna %>% left_join(structure[, c(2, 22, 23)], by = c("window", "scale"))
colnames(window.temp.gc.gene.rna.structure) <- c(
  "chr", "start", "end", "window", "score", "temp", "scale", "gc", "gene", "rna", "structure"
)
## add one-hot data
# Map sgRNAs onto the 500 bp windows and attach them to the raw-feature table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
sgRNA <- read.delim("sgRNA.coord.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
colnames(sgRNA) <- c(
  "chr", "start", "end", "sgRNAID", "id", "seq",
  "id2", "cut.score", "gid", "change.val", "quality"
)
# Coordinates, sgRNA id and cut score (columns 1:4 and 8 after renaming).
sgRNA.score <- sgRNA[, c("chr", "start", "end", "sgRNAID", "cut.score")]
library(tidygenomics)
window.sgRNA <- genome_intersect(window, sgRNA.score, by = c("chr", "start", "end"))
# Columns 2:4 of the intersect result are joined in by window index.
# NOTE(review): presumably these include window and sgRNAID - confirm the column order.
window.temp.gc.gene.rna.structure.sgRNA <- window.temp.gc.gene.rna.structure %>%
  left_join(window.sgRNA[, c(2, 3, 4)], by = "window")
# Read the four space-separated one-hot tables, merge them on sgRNAID and attach them to
# the window table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# onehot.ind1 <- read.delim("DataS1_independent1.txt")
# onehot.ind2 <- read.delim("DataS1_independent2.txt")
# onehot.dep1 <- read.delim("DataS1_dependent1.txt")
# onehot.dep2 <- read.delim("DataS1_dependent2.txt")
onehot.ind1 <- read.delim("DataS1_ind1.txt", header = TRUE, sep = " ")
onehot.ind2 <- read.delim("DataS1_ind2.txt", header = TRUE, sep = " ")
onehot.dep1 <- read.delim("DataS1_dep1.txt", header = TRUE, sep = " ")
onehot.dep2 <- read.delim("DataS1_dep2.txt", header = TRUE, sep = " ")
onehot.ind <- onehot.ind1 %>% full_join(onehot.ind2, by = "sgRNAID")
onehot.dep <- onehot.dep1 %>% full_join(onehot.dep2, by = "sgRNAID")
onehot     <- onehot.ind %>% full_join(onehot.dep, by = "sgRNAID")
#onehot <- onehot[2:nrow(onehot),]
#colnames(onehot) <- c("sgRNA", "onehot.ind1", "onehot.ind2", "onehot.dep1", "onehot.dep2")
onehot$scale <- "raw"
# Keep only columns that contain at least one non-NA value.
has.data    <- colSums(!is.na(onehot)) > 0
onehot.data <- onehot[has.data]
data.onehot <- window.temp.gc.gene.rna.structure.sgRNA %>%
  left_join(onehot.data, by = c("sgRNAID", "scale"))
# Melt the raw + one-hot columns, stack them under the long MODWT table, and write the
# long and wide versions.
#df.melt <- melt(data.onehot[,c(6:17)], id=c("sgRNA", "cut.score", "scale"))
#df2.melt <- melt(data.onehot[,c(4:11,14:17)], id=c("window", "score", "scale"))
#df2.melt <- melt(data.onehot[,c(4:11,14:144)], id=c("window", "score", "scale"))
# Column positions are hard-coded for the current one-hot layout.
keep.cols <- c(4:11, 14:357)
df2.melt <- melt(data.onehot[, keep.cols], id = c("window", "score", "scale"))
df2 <- na.omit(df2.melt)
df3 <- rbind(df, df2)
library(tidyr)
# Feature name = variable pasted directly onto scale.
df.id <- unite(df3, feature.scale, c(variable, scale), sep = "")
# Several values can share one (window, score, feature) cell; they are averaged.
df.dcast <- dcast(df.id, window + score ~ feature.scale, value.var = "value", fun.aggregate = mean)
#df.dcast.na <- na.omit(df.dcast)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
#write.table(df.id, "ecoli.dwt.raw.onehot.txt", quote=F, row.names=F, sep="\t")
#write.table(df.dcast, "ecoli.dwt.raw.onehot.dcast.txt", quote=F, row.names=F, sep="\t")
write.table(df.id, "ecoli.dwt.raw.onehotsep.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.dcast, "ecoli.dwt.raw.onehotsep.dcast.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# Random forest regression of score on the MODWT + raw + one-hot feature table.
library(randomForest)
set.seed(131)
# Drop the window identifier (column 1) rather than the score column: with data =
# df.dcast[-2] the window ID was offered to the forest as a predictor (it shows up as
# "window" in the recorded importance table below), unlike the other fits, and
# mtry = ncol - 2 already assumes window and score are both excluded from the predictors.
rf <- randomForest(score ~ ., data=df.dcast[-1], mtry=ncol(df.dcast)-2, importance=TRUE, ntree=500)
print(rf)
# Recorded output (from the run that still included window as a predictor):
# Call:
# randomForest(formula = df.dcast$score ~ ., data = df.dcast[-2], mtry = ncol(df.dcast) - 2, importance = TRUE, ntree = 500)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 77
#
# Mean of squared residuals: 35.68755
# % Var explained: 11.43
imp <- data.frame(importance(rf))
# Sort by the permutation importance column explicitly; order() on the whole data frame
# ranks the cells of both columns together and produces out-of-range (NA) rows.
imp.order <- imp[order(imp$X.IncMSE),]
tail(imp.order)
# X.IncMSE IncNodePurity
# temp.dwtd12 14.12777 1986.439
# structure.dwtd8 14.85301 2720.283
# gene.dwtd7 15.16247 3243.014
# gene.dwtd8 16.17213 2711.455
# gcraw 20.21452 7192.470
# tempraw 20.78803 7713.837
imp.order
# X.IncMSE IncNodePurity
# gene.dwtd2 -2.2908030 4617.2721
# gene.dwtd1 -1.4818029 3941.8291
# generaw -0.1802341 483.1302
# structure.dwtd1 0.6216740 3866.6907
# temp.dwtd1 1.3325950 1946.7604
# gc.dwtd1 1.6206323 2032.6931
# rna.dwtd1 1.8217925 4142.8643
# structure.dwtd2 2.9806875 3868.8344
# rna.dwtd2 3.2249017 3909.5253
# gene.dwtd3 4.3559369 4531.8029
# temp.dwtd3 4.5542185 2037.8998
# gc.dwtd3 4.5562323 2100.0347
# structureraw 5.2671777 3143.9862
# onehot.dep1raw 5.4170942 5473.5726
# temp.dwtd5 5.4857999 1635.6251
# gc.dwtd10 5.5791035 1029.6304
# gc.dwtd7 5.8790319 1653.0664
# temp.dwtd2 5.8902502 2217.1715
# temp.dwtd11 5.9276785 707.7486
# gc.dwtd5 6.1953374 1639.0137
# structure.dwtd13 6.5786518 639.8278
# gc.dwtd4 6.7088371 1976.7228
# gc.dwtd11 6.9665691 694.1019
# gc.dwtd6 6.9890496 1528.9234
# rnaraw 6.9973124 3548.3206
# gene.dwtd13 7.0291691 682.0167
# gc.dwtd2 7.0416921 2162.6168
# onehot.dep2raw 7.1375485 6567.1226
# temp.dwtd7 7.2725657 1728.7649
# structure.dwtd3 7.2907651 3839.4401
# temp.dwtd13 7.5657081 831.5895
# temp.dwtd6 7.7541796 1481.9715
# onehot.ind2raw 7.7740876 6558.2690
# gene.dwtd12 7.8020429 1325.9120
# temp.dwtd8 7.9052542 1156.4873
# window 7.9939871 1199.9813
# gc.dwtd9 8.1305465 1224.1759
# temp.dwtd10 8.1452694 1016.3032
# temp.dwtd4 8.1544189 1895.8269
# gene.dwtd4 8.2468860 3848.6761
# rna.dwtd4 8.6903228 3512.6840
# temp.dwtd9 8.7706466 1105.2972
# onehot.ind1raw 8.7762093 6175.6191
# gene.dwts13 8.9741701 1473.4873
# rna.dwtd12 9.0679968 1110.0526
# gc.dwtd13 9.2013256 807.2902
# temp.dwts13 9.2394694 961.5499
# rna.dwtd11 9.4740479 1623.8936
# structure.dwts13 9.6437396 1606.8043
# gc.dwtd8 9.7545694 1233.4242
# rna.dwtd3 10.1164856 4140.3232
# structure.dwtd4 10.1345454 3550.4243
# gene.dwtd5 10.3259332 3558.3964
# gc.dwts13 10.5119750 1009.8998
# rna.dwtd9 10.7965287 2020.2374
# rna.dwtd5 10.8198778 3388.8309
# rna.dwtd8 10.9798279 2256.0594
# gene.dwtd11 11.3637461 1441.4841
# rna.dwtd10 11.4362994 1875.4523
# structure.dwtd5 11.5175757 3334.4448
# rna.dwtd6 11.6540225 3065.1007
# gene.dwtd10 11.6818297 1926.4395
# structure.dwtd10 11.8693545 1774.5585
# structure.dwtd12 12.2038177 1813.0523
# rna.dwtd7 12.3179673 2567.8775
# structure.dwtd7 12.4009575 2641.3164
# structure.dwtd9 12.4565781 2489.2755
# gene.dwtd6 12.6830382 3581.3370
# structure.dwtd6 12.8811956 3184.6202
# gene.dwtd9 12.9371233 2624.0316
# gc.dwtd12 13.4716909 2027.0937
# structure.dwtd11 13.5693475 2273.5326
# temp.dwtd12 14.1277675 1986.4390
# structure.dwtd8 14.8530069 2720.2827
# gene.dwtd7 15.1624731 3243.0144
# gene.dwtd8 16.1721328 2711.4545
# gcraw 20.2145156 7192.4698
# tempraw 20.7880291 7713.8373
# Iterative random forest built on ranger (variant that tries every weighted feature:
# mtry = number of features with positive weight).
# Every round fits a forest on cbind(xmat, Y = y); the corrected-impurity importances of
# that forest, scaled by the sum of their absolute values and clipped at zero, become the
# split.select.weights of the next round. The loop stops after `iter` rounds or as soon
# as fewer than max(0.01 * (ncol(xmat) - 1), 10) features keep a positive weight.
#
# Arguments:
#   xmat            data frame of predictors, one column per feature
#   y               response vector (bound to xmat as column "Y")
#   ntree           trees per forest (ranger num.trees)
#   iter            maximum number of rounds
#   classification  passed to ranger; also selects r^2 vs. confusion-matrix logging
#   threads         passed to ranger as num.threads
#   alwayssplits    passed to ranger as always.split.variables
#   saveall         TRUE: return a list with the forest of every round;
#                   FALSE: return only the forest of the last round that ran
# Returns: see `saveall` (an empty list / NULL when no round ran).
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weight for every feature
  rfs <- list()
  rf <- NULL
  # seq_len() runs zero rounds for iter = 0 (1:iter would count 1, 0).
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    mtry <- 1*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    imp <- rf$variable.importance
    total <- sum(abs(imp))
    # Scale importances so their absolute values sum to 1; all-zero importances give all-zero weights.
    wt <- if(isTRUE(total > 0)) imp / total else rep(0, length(imp))
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    # cor() needs numeric input; skip the line for factor responses/predictions.
    if(is.numeric(y) && is.numeric(rf$predictions)) cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if(saveall) rfs[[i]] <- rf
    # Too few informative features left to continue.
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  # With saveall = FALSE the last forest is returned whether or not the loop stopped early;
  # previously it was only returned on the early break and an empty list otherwise.
  if(!saveall) return(rf)
  return(rfs)
}
# Iterative forest on all feature columns (3 onward) against score.
iRF(df.dcast[,3:ncol(df.dcast)], df.dcast$score)
# Single ranger fit on the same data. ranger's first two positional arguments are
# `formula` and `data`, so passing the feature frame and the score vector positionally
# does not fit features against score; the response is bound as column "Y" and named
# explicitly instead, matching how iRF calls ranger.
df.rf <- ranger(dependent.variable.name = "Y", data = cbind(df.dcast[,3:ncol(df.dcast)], Y = df.dcast$score), num.trees=500, classification=F, mtry=ncol(df.dcast)-2, importance="impurity_corrected", num.threads=1, write.forest=T, always.split.variables=NULL)
### mtry = 0.5
# iRF iteration 1
# =================
# mtry: 38.5
# prediction error: 36.48304
# r^2: 0.0946867
# cor(y,yhat): 0.3077696
# SNPs with importance > 0: 59
### mtry = 1
# Call:
# ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees = ntree,split.select.weights = wt, classification = classification,mtry = mtry, importance = "impurity_corrected", num.threads = threads,write.forest = T, always.split.variables = alwayssplits)
# iRF iteration 1
# =================
# mtry: 77
# prediction error: 36.59711
# r^2: 0.09185611
# cor(y,yhat): 0.3037172
# SNPs with importance > 0: 56
# iRF iteration 2
# =================
# mtry: 56
# prediction error: 36.46622
# r^2: 0.09510406
# cor(y,yhat): 0.3088277
# SNPs with importance > 0: 42
# iRF iteration 3
# =================
# mtry: 42
# prediction error: 36.17064
# r^2: 0.1024387
# cor(y,yhat): 0.3201388
# SNPs with importance > 0: 35
# iRF iteration 4
# =================
# mtry: 35
# prediction error: 36.29261
# r^2: 0.09941227
# cor(y,yhat): 0.3166615
# SNPs with importance > 0: 34
# iRF iteration 5
# =================
# mtry: 34
# prediction error: 36.35674
# r^2: 0.09782076
# cor(y,yhat): 0.3138919
# SNPs with importance > 0: 32
# Environment for the R session below: Python module, conda bootstrap, shared conda env.
module load python
. "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh"
conda activate "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test"
# Fresh session: reload the long and wide feature tables and fit one ranger forest on
# every feature column.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
#df.id <- read.delim("ecoli.dwt.raw.onehot.txt", header=F, sep="\t")
#df.dcast <- read.delim("ecoli.dwt.raw.onehot.dcast.txt", header=T, sep="\t")
# The long table was written by write.table() with its column names, so it is read with
# header=T; header=F would turn the header line into the first data row.
df.id <- read.delim("ecoli.dwt.raw.onehotsep.txt", header=T, sep="\t")
df.dcast <- read.delim("ecoli.dwt.raw.onehotsep.dcast.txt", header=T, sep="\t")
library(ranger)
# Predictors = columns 3 onward; response bound as column "Y".
xmat = df.dcast[,3:ncol(df.dcast)]
y = df.dcast$score
tmp <- cbind(xmat, Y = y)
# Uniform split-selection weights, one per predictor.
wt <- rep(1/ncol(xmat), ncol(xmat))
df.rf <- ranger(dependent.variable.name = "Y", data=tmp, num.trees=500, split.select.weights = wt, classification=F, mtry=ncol(df.dcast)-2, importance="impurity_corrected", num.threads=1, write.forest=T, always.split.variables=NULL)
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 77
# Mtry: 77
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 36.4089
# R squared (OOB): 0.09652656
# Importance table for the full ranger fit, sorted ascending by importance.
imp <- data.frame(importance(df.rf))
imp$feature <- as.character(rownames(imp))
# Order by the numeric importance column only. order() on the whole data frame would rank
# the numeric importances together with the character feature names, which does not give
# a numeric sort and yields out-of-range (NA) rows.
imp.order <- na.omit(imp[order(imp$importance.df.rf.),])
imp.order$imp <- as.numeric(imp.order$importance.df.rf.)
tail(imp.order)
# importance.df.rf. feature imp
# gc.dwtd12 1236.294 gc.dwtd12 1236.294
# onehot.ind1raw 1574.486 onehot.ind1raw 1574.486
# onehot.dep2raw 1936.528 onehot.dep2raw 1936.528
# onehot.ind2raw 1987.384 onehot.ind2raw 1987.384
# gcraw 5327.299 gcraw 5327.299
# tempraw 5546.159 tempraw 5546.159
library(ggplot2)
# Horizontal bar chart of variable importance, features ordered by importance.
pdf("ecoli.dwt.raw.onehot.ranger.importance.pdf")
ggplot(imp.order, aes(x = reorder(feature, imp), y = imp, fill = imp)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  ylab("Variable Importance") +
  xlab("") +
  scale_fill_gradient2(low = "red", high = "blue") +
  theme_classic()
dev.off()

# Scatter of the forest's stored predictions against the observed window score.
pred.df <- data.frame(score = df.dcast$score, pred = df.rf$predictions)
pdf("ecoli.dwt.raw.onehot.ranger.pred.pdf")
ggplot(pred.df, aes(x = score, y = pred)) +
  geom_point() +
  ylab("Prediction") +
  xlab("500bp Score") +
  theme_classic()
dev.off()
# df.raw <- df.dcast[,c(17,32,49,64,79)]
# df.dwt <- df.dcast[,c(3:16,18:31,37:48,50:63,65:78)]
# df.onehot <- df.dcast[,c(33:36)]
# df.raw <- df.dcast[,c(17,32,176,191,206)]
# df.dwt <- df.dcast[,c(3:16,18:31,164:175,177:190,192:205)]
# df.onehot <- df.dcast[,c(33:163)]
# Split df.dcast into three feature groups by hard-coded column position
# (valid only for the current column layout of the wide table).
raw.cols    <- c(28, 44, 384, 399, 416)
dwt.cols    <- c(14:27, 30:43, 372:383, 385:398, 402:415)
onehot.cols <- c(3:13, 29, 45:371, 400:401, 417:419)
df.raw    <- df.dcast[, raw.cols]
df.dwt    <- df.dcast[, dwt.cols]
df.onehot <- df.dcast[, onehot.cols]
# just raw
# Fit on the raw features only: uniform split weights, every predictor tried per split.
xmat <- df.raw
y    <- df.dcast$score
tmp  <- cbind(xmat, Y = y)
wt   <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.raw <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(df.raw),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 5
# Mtry: 5
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 39.89624
# R squared (OOB): 0.009989516
# Correlation between the forest's stored predictions and the observed score.
cor(df.rf.raw$predictions, y)
# 0.178336
# Type: Regression
# Number of trees: 500
# Sample size: 1679
# Number of independent variables: 5
# Mtry: 5
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 32.94673
# R squared (OOB): 0.0293786
# 0.2556712
### prediction
# Hold-out check for the raw features: fit on the first 70% of rows (in table order, not
# shuffled) and predict the remaining rows.
xmat = df.raw
# Split point derived from the data instead of a hard-coded row number
# (0.7 * 1679 rows = 1175.3 -> 1175, the value previously typed in).
n.train <- floor(0.7 * nrow(xmat))
# NOTE: despite the ".test" suffix, xmat.test / y.test hold the TRAINING rows.
xmat.test = xmat[seq_len(n.train),]
y = df.dcast[seq_len(n.train),]
y.test = y$score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1/ncol(xmat.test), ncol(xmat.test))
df.rf.raw.test <- ranger(dependent.variable.name = "Y", data=tmp, num.trees=500, split.select.weights = wt, classification=F, mtry=ncol(xmat.test), num.threads=1, write.forest=T, always.split.variables=NULL)
# Rows after the split point are the held-out set.
held.out <- (n.train + 1):nrow(xmat)
pred <- predict(df.rf.raw.test, xmat[held.out,])
pred$predictions
df.dcast.pred <- df.dcast[held.out,]
pred.df <- data.frame(score = df.dcast.pred$score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.1492853
# 0.1335178
# just dwt
# Fit on the wavelet features only: uniform split weights, every predictor tried per split.
xmat <- df.dwt
y    <- df.dcast$score
tmp  <- cbind(xmat, Y = y)
wt   <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.dwt <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(df.dwt),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 68
# Mtry: 68
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 37.83855
# R squared (OOB): 0.06105016
# Correlation between the forest's stored predictions and the observed score.
cor(df.rf.dwt$predictions, y)
# 0.2496184
# Type: Regression
# Number of trees: 500
# Sample size: 1679
# Number of independent variables: 68
# Mtry: 68
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 34.03493
# R squared (OOB): -0.002680206
# 0.1248095
### prediction
# Hold-out check for the wavelet features: fit on the first 70% of rows (in table order,
# not shuffled) and predict the remaining rows.
xmat = df.dwt
# Split point derived from the data instead of the hard-coded rows 1175 / 1176.
n.train <- floor(0.7 * nrow(xmat))
# NOTE: despite the ".test" suffix, xmat.test / y.test hold the TRAINING rows.
xmat.test = xmat[seq_len(n.train),]
y = df.dcast[seq_len(n.train),]
y.test = y$score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1/ncol(xmat.test), ncol(xmat.test))
df.rf.test <- ranger(dependent.variable.name = "Y", data=tmp, num.trees=500, split.select.weights = wt, classification=F, mtry=ncol(xmat.test), num.threads=1, write.forest=T, always.split.variables=NULL)
# Rows after the split point are the held-out set.
held.out <- (n.train + 1):nrow(xmat)
pred <- predict(df.rf.test, xmat[held.out,])
pred$predictions
df.dcast.pred <- df.dcast[held.out,]
pred.df <- data.frame(score = df.dcast.pred$score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.03319047
# 0.06278793
# just onehot
# Fit on the one-hot features only: uniform split weights, every predictor tried per split.
xmat <- df.onehot
y    <- df.dcast$score
tmp  <- cbind(xmat, Y = y)
wt   <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.onehot <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(df.onehot),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 4
# Mtry: 4
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 41.48125
# R squared (OOB): -0.02934197
# Correlation between the forest's stored predictions and the observed score.
cor(df.rf.onehot$predictions, y)
# 0.04315756
### separated onehot
# Type: Regression
# Number of trees: 500
# Sample size: 1679
# Number of independent variables: 405
# Mtry: 405
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 32.09072
# R squared (OOB): 0.0545969
# Same correlation call as for the first one-hot fit; the surrounding comments record
# results for the "separated onehot" runs.
cor(df.rf.onehot$predictions,y)
# 0.2370724
# Type: Regression
# Number of trees: 500
# Sample size: 1679
# Number of independent variables: 405
# Mtry: 405
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 32.49179
# R squared (OOB): 0.04278116
# > cor(df.rf.onehot$predictions,y)
# [1] 0.2064228
### prediction
# Hold-out check for the one-hot features: fit on the first 70% of rows (in table order,
# not shuffled) and predict the remaining rows.
xmat = df.onehot
# Replace missing one-hot values with 0 BEFORE the training frame is built and before
# predicting. The fill used to be applied to xmat.test after `tmp` had already been
# created from it, so it never reached the data passed to ranger, and the held-out rows
# were not filled at all.
xmat[is.na(xmat)] <- 0
# Split point derived from the data instead of the hard-coded rows 3576 / 3577 / 5109
# (0.7 * 5109 = 3576.3 -> 3576).
n.train <- floor(0.7 * nrow(xmat))
# NOTE: despite the ".test" suffix, xmat.test / y.test hold the TRAINING rows.
xmat.test = xmat[seq_len(n.train),]
y = df.dcast[seq_len(n.train),]
y.test = y$score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1/ncol(xmat.test), ncol(xmat.test))
df.rf.test <- ranger(dependent.variable.name = "Y", data=tmp, num.trees=500, split.select.weights = wt, classification=F, mtry=ncol(xmat.test), num.threads=1, write.forest=T, always.split.variables=NULL)
# Rows after the split point are the held-out set.
held.out <- (n.train + 1):nrow(xmat)
pred <- predict(df.rf.test, xmat[held.out,])
pred$predictions
df.dcast.pred <- df.dcast[held.out,]
pred.df <- data.frame(score = df.dcast.pred$score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.1002623
# 0.3582551
# 0.1791703
# remove dwt (raw + onehot)
# Fit on raw + one-hot features: uniform split weights, every predictor tried per split.
xmat <- cbind(df.raw, df.onehot)
y    <- df.dcast$score
tmp  <- cbind(xmat, Y = y)
wt   <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.raw.onehot <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(xmat),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 9
# Mtry: 9
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 39.06231
# R squared (OOB): 0.03068315
# Correlation between the forest's stored predictions and the observed score.
cor(df.rf.raw.onehot$predictions, y)
# 0.1962338
# Type: Regression
# Number of trees: 500
# Sample size: 1679
# Number of independent variables: 410
# Mtry: 410
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 29.5016
# R squared (OOB): 0.130873
# 0.3616121
### prediction
# Hold-out check for raw + one-hot features: fit on the first 70% of rows (in table
# order, not shuffled) and predict the remaining rows.
xmat = cbind(df.raw, df.onehot)
# Split point derived from the data instead of the hard-coded rows 1175 / 1176.
n.train <- floor(0.7 * nrow(xmat))
# NOTE: despite the ".test" suffix, xmat.test / y.test hold the TRAINING rows.
xmat.test = xmat[seq_len(n.train),]
y = df.dcast[seq_len(n.train),]
y.test = y$score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1/ncol(xmat.test), ncol(xmat.test))
df.rf.raw.test <- ranger(dependent.variable.name = "Y", data=tmp, num.trees=500, split.select.weights = wt, classification=F, mtry=ncol(xmat.test), num.threads=1, write.forest=T, always.split.variables=NULL)
# Rows after the split point are the held-out set.
held.out <- (n.train + 1):nrow(xmat)
pred <- predict(df.rf.raw.test, xmat[held.out,])
pred$predictions
df.dcast.pred <- df.dcast[held.out,]
pred.df <- data.frame(score = df.dcast.pred$score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.1837235
# expanded onehot 0.3445921
# remove onehot (dwt + raw)
# Feature ablation: fit on the DWT and raw predictors only.
xmat <- cbind(df.dwt, df.raw)
y <- df.dcast$score
tmp <- cbind(xmat, Y = y)
# One equal split-selection weight per predictor; mtry is the number of predictors.
wt <- rep(1 / ncol(xmat), times = ncol(xmat))
df.rf.dwt.raw <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat), split.select.weights = wt,
  classification = FALSE, importance = "impurity_corrected",
  num.threads = 1, write.forest = TRUE, always.split.variables = NULL
)
# Recorded model summary:
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 73
# Mtry: 73
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 36.41806
# R squared (OOB): 0.09629922
# Correlation between the forest's stored predictions and the response.
cor(df.rf.dwt.raw$predictions, y)
# 0.310147
# Recorded model summary (1679-sample run):
# Type: Regression
# Number of trees: 500
# Sample size: 1679
# Number of independent variables: 73
# Mtry: 73
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 30.34057
# R squared (OOB): 0.1061568
# 0.328672
### prediction
# Hold-out check for DWT + raw: fit on rows 1-1175, score the remaining rows.
# (xmat.test / y.test are the rows used for fitting, despite their names.)
xmat <- cbind(df.dwt, df.raw)
xmat.test <- xmat[1:1175, ]
y <- df.dcast[1:1175, ]
y.test <- y$score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), times = ncol(xmat.test))
df.rf.test <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat.test), split.select.weights = wt,
  classification = FALSE, num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.test, xmat[1176:nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[1176:nrow(xmat), ]
# Observed score next to the predicted value for each held-out row.
pred.df <- data.frame(score = df.dcast.pred$score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.1610386
# 0.1972109
# remove raw (dwt + onehot)
# Feature ablation: fit on the DWT and one-hot predictors only.
xmat <- cbind(df.dwt, df.onehot)
y <- df.dcast$score
tmp <- cbind(xmat, Y = y)
# One equal split-selection weight per predictor; mtry is the number of predictors.
wt <- rep(1 / ncol(xmat), times = ncol(xmat))
df.rf.dwt.onehot <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat), split.select.weights = wt,
  classification = FALSE, importance = "impurity_corrected",
  num.threads = 1, write.forest = TRUE, always.split.variables = NULL
)
# Recorded model summary:
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 72
# Mtry: 72
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 37.60152
# R squared (OOB): 0.06693207
# Correlation between the forest's stored predictions and the response.
cor(df.rf.dwt.onehot$predictions, y)
# 0.2593089
# Recorded model summary (1679-sample run):
# Type: Regression
# Number of trees: 500
# Sample size: 1679
# Number of independent variables: 473
# Mtry: 473
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 31.8652
# R squared (OOB): 0.06124087
# 0.2512214
# Hold-out check for DWT + one-hot: fit on rows 1-1175, score the remaining rows.
# (xmat.test / y.test are the rows used for fitting, despite their names.)
xmat <- cbind(df.dwt, df.onehot)
xmat.test <- xmat[1:1175, ]
y <- df.dcast[1:1175, ]
y.test <- y$score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), times = ncol(xmat.test))
df.rf.test <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat.test), split.select.weights = wt,
  classification = FALSE, num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.test, xmat[1176:nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[1176:nrow(xmat), ]
# Observed score next to the predicted value for each held-out row.
pred.df <- data.frame(score = df.dcast.pred$score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.07345225
# 0.3123946
# all (dwt + raw + onehot)
# Full model: DWT, one-hot and raw predictors together.
xmat <- cbind(df.dwt, df.onehot, df.raw)
y <- df.dcast$score
tmp <- cbind(xmat, Y = y)
# One equal split-selection weight per predictor; mtry is the number of predictors.
wt <- rep(1 / ncol(xmat), times = ncol(xmat))
df.rf.all <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat), split.select.weights = wt,
  classification = FALSE, importance = "impurity_corrected",
  num.threads = 1, write.forest = TRUE, always.split.variables = NULL
)
# Recorded model summary:
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 77
# Mtry: 77
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 36.58831
# R squared (OOB): 0.09207459
cor(df.rf.all$predictions, y)
# 0.3036853
# all w/ onehot separated
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 204
# Mtry: 204
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 36.48951
# R squared (OOB): 0.09452611
cor(df.rf.all$predictions, y)
# 0.3073853 cor(y,yhat)
# Square root of the OOB MSE recorded above.
sqrt(36.48951)
# 6.040655 RMSE
### onehot separated (actually this time? - 17 june 2021)
# Type: Regression
# Number of trees: 500
# Sample size: 1679
# Number of independent variables: 478
# Mtry: 478
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 29.17818
# R squared (OOB): 0.1404014
# > cor(df.rf.all$predictions,y)
# [1] 0.3770797
# Hold-out check for the full feature set: fit on rows 1-1175, score the remaining rows.
# (xmat.test / y.test are the rows used for fitting, despite their names.)
xmat <- cbind(df.dwt, df.onehot, df.raw)
xmat.test <- xmat[1:1175, ]
y <- df.dcast[1:1175, ]
y.test <- y$score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), times = ncol(xmat.test))
df.rf.all.test <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat.test), split.select.weights = wt,
  classification = FALSE, num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.all.test, xmat[1176:nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[1176:nrow(xmat), ]
# Observed score next to the predicted value for each held-out row.
pred.df <- data.frame(score = df.dcast.pred$score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.3055084
# Variable-importance table for df.rf.all, sorted ascending so tail() shows the
# most important features.
imp <- data.frame(importance(df.rf.all))
imp$feature <- as.character(rownames(imp))
# Sort on the numeric importance column (column 1). The previous order(imp) was
# handed the whole data frame, including the character "feature" column, so the
# rows were not in numeric order - see the non-monotone recorded output below.
imp.order <- na.omit(imp[order(imp[[1]]), ])
imp.order$imp <- as.numeric(imp.order[[1]])
tail(imp.order)
# Recorded output from past runs (produced with the old, mis-sorted ordering):
# importance.df.rf.all. feature imp
# rna.dwtd10 -124.8561 rna.dwtd10 -124.8561
# structure.dwtd1 -155.4961 structure.dwtd1 -155.4961
# temp.dwtd12 1044.2449 temp.dwtd12 1044.2449
# p3.x.xraw 2656.9501 p3.x.xraw 2656.9501
# gcraw 4869.1155 gcraw 4869.1155
# tempraw 5012.3953 tempraw 5012.3953
# importance.df.rf.all. feature imp
# V295raw 445.0751 V295raw 445.0751
# V279raw 650.6209 V279raw 650.6209
# V17.xraw -124.6107 V17.xraw -124.6107
# V7raw 1052.4298 V7raw 1052.4298
# gcraw 2247.1354 gcraw 2247.1354
# tempraw 2502.5628 tempraw 2502.5628
# iRF
# Iterative random forest built on ranger: each pass refits the forest using
# split-selection weights derived from the previous pass's variable importances.
#
# Arguments:
#   xmat            predictor table, one column per feature.
#   y               response; bound to xmat as column "Y".
#   ntree           trees per forest (ranger num.trees).
#   iter            maximum number of passes.
#   classification  forwarded to ranger; TRUE prints the confusion matrix,
#                   FALSE prints r^2.
#   threads         forwarded to ranger as num.threads.
#   alwayssplits    forwarded to ranger as always.split.variables.
#   saveall         TRUE: return a list holding the forest from every pass run.
#                   FALSE: return only the forest from the last pass run.
# Returns: a list of ranger forests (saveall=TRUE) or a single ranger forest
#   (saveall=FALSE); an empty list when no pass was run.
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weight for every predictor
  rfs <- list()
  rf <- NULL
  # seq_len() so that iter = 0 runs no passes (1:0 would have run two).
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # Only predictors that still carry a positive weight are counted in mtry.
    mtry = 1*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    # Next pass's weights: importances divided by the sum of their absolute values.
    wt <- rf$variable.importance / sum(abs(rf$variable.importance))
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if(saveall) rfs[[i]] <- rf
    # Stop early once fewer than max(1% of (predictors - 1), 10) features keep a positive weight.
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  # With saveall = FALSE, hand back the final forest whether the loop stopped
  # early or ran every pass (previously a full run returned an empty list).
  if(!saveall && !is.null(rf)) rfs <- rf
  return(rfs)
}
# Run iRF with its default settings on the full feature set (DWT + one-hot + raw).
xmat <- cbind(df.dwt, df.onehot, df.raw)
y <- df.dcast$score
iRF(xmat = xmat, y = y)
# iRF iteration 1
# =================
# mtry: 204
# prediction error: 36.45043
# r^2: 0.09549585
# cor(y,yhat): 0.3094516
# SNPs with importance > 0: 182
# Type: Regression
# Number of trees: 500
# Sample size: 5109
# Number of independent variables: 204
# Mtry: 204
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 36.45043
# R squared (OOB): 0.09549585
#
# iRF iteration 2
# =================
# mtry: 182
# prediction error: 36.43556
# r^2: 0.09586483
# cor(y,yhat): 0.310043
# SNPs with importance > 0: 169
#
# iRF iteration 3
# =================
# mtry: 169
# prediction error: 36.64926
# r^2: 0.09056201
# cor(y,yhat): 0.3012155
# SNPs with importance > 0: 162
#
# iRF iteration 4
# =================
# mtry: 162
# prediction error: 36.13965
# r^2: 0.1032079
# cor(y,yhat): 0.3215349
# SNPs with importance > 0: 151
#
# iRF iteration 5
# =================
# mtry: 151
# prediction error: 36.26501
# r^2: 0.100097
# cor(y,yhat): 0.3164983
# SNPs with importance > 0: 144
### prediction
# 70/30 split by row order: 70% of the rows is where the 3576 cut-off below comes from.
nrow(df.dcast) * 0.7
# 3576.3
# (xmat.test / y.test are the rows used for fitting, despite their names.)
xmat <- cbind(df.dwt, df.onehot, df.raw)
xmat.test <- xmat[1:3576, ]
y <- df.dcast[1:3576, ]
y.test <- y$score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), times = ncol(xmat.test))
df.rf.all.test <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat.test), split.select.weights = wt,
  classification = FALSE, num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.all.test, xmat[3577:5109, ])
pred$predictions
df.dcast.pred <- df.dcast[3577:5109, ]
# Observed score next to the predicted value for each held-out row.
pred.df <- data.frame(score = df.dcast.pred$score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.1683098
# 0.1846074 <- onehot separated
# The same comparison under each correlation method.
cor(pred.df$score, pred.df$pred, method = "spearman")
# 0.1304906
cor(pred.df$score, pred.df$pred, method = "kendall")
# 0.08856798
cor(pred.df$score, pred.df$pred, method = "pearson")
# 0.1829811
#### TODO: need to implement a kfold cross validation scheme
# Scatter plot of observed score (x) against predicted score (y), written to a PDF.
pdf("ecoli.dwt.raw.onehot.ranger.prediction.pdf")
ggplot(pred.df, aes(x = score, y = pred)) +
  geom_point() +
  xlab("500bp Score") +
  ylab("Prediction") +
  theme_classic()
dev.off()
## Quantile regression forest
# Refit on the same tmp / wt as the preceding hold-out model, with quantreg enabled,
# then request quantile predictions for the held-out rows.
df.rf.all.test <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat.test), split.select.weights = wt,
  classification = FALSE, num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL, quantreg = TRUE
)
pred <- predict(df.rf.all.test, xmat[3577:5109, ], type = "quantiles")
pred$predictions
df.dcast.pred <- df.dcast[3577:5109, ]
# Column 1 is the observed score; the following columns come from pred$predictions
# (presumably one per returned quantile - confirm).
pred.df <- data.frame(score = df.dcast.pred$score, pred1 = pred$predictions)
cor(pred.df[[1]], pred.df[[2]])
# ** 0.1723876
# 0.1447041 <- onehot separated
cor(pred.df[[1]], pred.df[[3]])
# 0.1563982
# ** 0.1836663 <- onehot separated
cor(pred.df[[1]], pred.df[[4]])
# 0.1050769
# 0.1495977 <- onehot separated
# Shell commands (not R): load the python module, source the conda loader script and
# activate the conda environment - presumably run before starting the R session below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Tab-delimited inputs; gene and window have no header row.
gene <- read.delim("ecoli.gene.windows500.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
structure <- read.delim("500bp.protein.structure.fa", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
window <- read.delim("ecoli.500bp.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
#score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
#score <- read.delim("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("sgRNA.rbs.coord.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Keep coordinates, sgRNA id and cut.score (columns 1-4 and 8).
score.df <- score[, c(1:4, 8)]
colnames(window) <- c("chr", "start", "end")
# Count the rows of the gene table that share each (V1, V2, V3) window key.
gene.bin <- mutate(group_by(gene, V1, V2, V3), gene.count = n())
gene.count <- unique(gene.bin[, c(1:3, 14)])
colnames(gene.count) <- c("chr", "start", "end", "gene.count")
# Align the counts to the full window list; windows without a match get 0.
gene.window <- left_join(window, gene.count, by = c("chr", "start", "end"))
gene.window[is.na(gene.window)] <- 0
gene.df <- gene.window$gene.count
# Same alignment for the RNA-seq averages.
colnames(rnaseq) <- c("chr", "start", "end", "avg.fpkm")
rnaseq.window <- left_join(window, rnaseq, by = c("chr", "start", "end"))
rnaseq.window[is.na(rnaseq.window)] <- 0
rna.df <- rnaseq.window$avg.fpkm
# Per-window vectors taken by column position.
structure.df <- structure[, 2]
gc.df <- nuc[, 7]
temp.df <- nuc[, 8]
# Run DWT instead of CWT
# For each per-window vector: run wavMODWT(), convert the result to a matrix, split
# the row labels on non-alphanumeric characters into two parts, and name the
# resulting columns scale / window / <feature>.dwt.

# temperature
temp.modwt <- wavMODWT(temp.df)
temp.modwt.df <- as.matrix(temp.modwt)
temp.modwt.label <- data.frame(label = row.names(temp.modwt.df), temp.modwt.df)
temp.modwt.name <- separate(temp.modwt.label, label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(temp.modwt.name) <- c("scale", "window", "temp.dwt")

# GC content
gc.modwt <- wavMODWT(gc.df)
gc.modwt.df <- as.matrix(gc.modwt)
gc.modwt.label <- data.frame(label = row.names(gc.modwt.df), gc.modwt.df)
gc.modwt.name <- separate(gc.modwt.label, label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gc.modwt.name) <- c("scale", "window", "gc.dwt")

# structure
structure.modwt <- wavMODWT(structure.df)
structure.modwt.df <- as.matrix(structure.modwt)
structure.modwt.label <- data.frame(label = row.names(structure.modwt.df), structure.modwt.df)
structure.modwt.name <- separate(structure.modwt.label, label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(structure.modwt.name) <- c("scale", "window", "structure.dwt")

# RNA-seq
rna.modwt <- wavMODWT(rna.df)
rna.modwt.df <- as.matrix(rna.modwt)
rna.modwt.label <- data.frame(label = row.names(rna.modwt.df), rna.modwt.df)
rna.modwt.name <- separate(rna.modwt.label, label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(rna.modwt.name) <- c("scale", "window", "rna.dwt")

# gene count
gene.modwt <- wavMODWT(gene.df)
gene.modwt.df <- as.matrix(gene.modwt)
gene.modwt.label <- data.frame(label = row.names(gene.modwt.df), gene.modwt.df)
gene.modwt.name <- separate(gene.modwt.label, label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gene.modwt.name) <- c("scale", "window", "gene.dwt")
# Attach sgRNA scores to their windows, then join the per-scale DWT features.
colnames(window) <- c("chr", "start", "end")
# Zero-based row index stored as character; it is the join key against the
# "window" column of the *.modwt.name tables.
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
library(tidygenomics)
window.score <- genome_intersect(window, score.df, by=c("chr", "start", "end"))
# Columns 2:4 of the intersect result are taken by position - presumably window,
# sgRNA and cut.score; confirm against the genome_intersect() output.
window.score.df <- left_join(window, window.score[,2:4], by=c("window"))
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name, by=c("window", "scale"))
# Keep only rows that have a score. Tested with is.na() instead of comparing
# against the string "NA", which only worked because subset() drops NA conditions.
window.temp.gc.structure.rna.gene.sgRNA <- subset(window.temp.gc.structure.rna.gene, !is.na(cut.score))
# write.table(window.temp.gc.structure.rna.gene.sgRNA, "ecoli.features.sgRNA.DWT.txt", quote=F, row.names=F, sep="\t")
# Long format: one row per (sgRNA, scale, feature), then drop incomplete rows.
df.melt <- melt(window.temp.gc.structure.rna.gene.sgRNA[,5:12], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
# df.melt <- melt(window.temp.gc.structure.rna.gene.sgRNA[,4:12], id=c("window", "cut.score", "scale", "sgRNA"))
# df <- na.omit(df.melt)
#
# write.table(df, "ecoli.features.sgRNA.DWT.txt", quote=F, row.names=F, sep="\t")
#
# library(tidyr)
# df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
#
# df.dcast <- df.id %>% dcast(window + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean)
# df.dcast <- na.omit(df.dcast)
# write.table(df.dcast, "ecoli.features.sgRNA.DWT.dcast.txt", quote=F, row.names=F, sep="\t")
## add 500bp raw data
# Append the untransformed 500bp-window values to the long table under scale "raw".
# Each table gets a zero-based row index (as character) to join on "window".
# NOTE(review): gene.count and rnaseq are indexed by their own row order here; this
# lines up with the window table only if they hold exactly one row per window in the
# same order - confirm (gene.window / rnaseq.window are the window-aligned versions).
structure$window <- seq.int(nrow(structure))
structure$window <- as.character(structure$window-1)
gene.count$window <- seq.int(nrow(gene.count))
gene.count$window <- as.character(gene.count$window-1)
nuc$window <- seq.int(nrow(nuc))
nuc$window <- as.character(nuc$window-1)
rnaseq$window <- seq.int(nrow(rnaseq))
rnaseq$window <- as.character(rnaseq$window-1)
structure$scale <- "raw"
gene.count$scale <- "raw"
nuc$scale <- "raw"
rnaseq$scale <- "raw"
# Columns are picked by position: presumably the value column followed by the
# window and scale columns appended above - confirm against each table's layout.
window.score.temp <- left_join(window.score.df, nuc[,8:10], by="window")
window.temp.gc <- left_join(window.score.temp, nuc[,c(7,9:10)], by=c("window", "scale"))
window.temp.gc.gene <- left_join(window.temp.gc, gene.count[,c(4:6)], by=c("window", "scale"))
window.temp.gc.gene.rna <- left_join(window.temp.gc.gene, rnaseq[,c(4:6)], by=c("window", "scale"))
window.temp.gc.gene.rna.structure <- left_join(window.temp.gc.gene.rna, structure[,c(2,22,23)], by=c("window", "scale"))
# Positional rename: relies on the join order above.
colnames(window.temp.gc.gene.rna.structure) <- c("chr", "start", "end", "window", "sgRNA", "cut.score", "temp", "scale", "gc", "gene", "rna", "structure")
# Long format, drop incomplete rows, and append to the DWT rows already in df.
df2.melt <- melt(window.temp.gc.gene.rna.structure[,5:12], id=c("cut.score", "scale", "sgRNA"))
df2 <- na.omit(df2.melt)
df <- rbind(df, df2)
# df2.melt <- melt(window.temp.gc.gene.rna.structure[,4:12], id=c("window", "cut.score", "scale", "sgRNA"))
# df2 <- na.omit(df2.melt)
# df3 <- rbind(df, df2)
#
# library(tidyr)
# df.id <- df3 %>% unite(feature.scale, c(variable, scale), sep = "")
#
# df.dcast <- df.id %>% dcast(window + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean)
# df.dcast <- na.omit(df.dcast)
#
# write.table(df.dcast, "ecoli.features.sgRNA.DWT.raw.dcast.txt", quote=F, row.names=F, sep="\t")
# add sgRNA raw data
# Per-sgRNA structure / temperature / GC values under scale "sgRNA.raw".
# NOTE: this overwrites the 500bp-window `structure` and `nuc` tables loaded earlier.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
#structure <- read.delim("ecoli.gRNA.structure.txt", header=T, sep="\t", stringsAsFactors = F)
#nuc <- read.delim("gRNA_nuc_counts_temp.txt", header=T, sep="\t", stringsAsFactors = F)
structure <- read.delim("ecoli.gRNA.rbs.structure.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("gRNA_rbs_nuc_counts_temp.txt", header=T, sep="\t", stringsAsFactors = F)
structure$scale <- "sgRNA.raw"
nuc$scale <- "sgRNA.raw"
# The first column of each table is copied into the sgRNA join key.
structure$sgRNA <- structure[,1]
nuc$sgRNA <- nuc[,1]
# Columns are picked by position: presumably the value column followed by the
# scale and sgRNA columns appended above - confirm against each table's layout.
window.score.structure <- left_join(window.score.df, structure[,c(2,22,23)], by="sgRNA")
window.score.structure.temp <- left_join(window.score.structure, nuc[,8:10], by=c("sgRNA", "scale"))
window.score.structure.temp.gc <- left_join(window.score.structure.temp, nuc[,c(7,9:10)], by=c("sgRNA", "scale"))
# Positional rename: relies on the join order above.
colnames(window.score.structure.temp.gc) <- c("chr", "start", "end", "window", "sgRNA", "cut.score", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
## add one-hot encoding of sequence
# Read the four one-hot tables, merge them on sgRNAID, join them onto the per-sgRNA
# table and append everything to the long table `df`.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# onehot.ind1 <- read.delim("DataS1_independent1.txt")
# onehot.ind2 <- read.delim("DataS1_independent2.txt")
# onehot.dep1 <- read.delim("DataS1_dependent1.txt")
# onehot.dep2 <- read.delim("DataS1_dependent2.txt")
# onehot.ind1 <- read.delim("DataS1.rbs_independent1.txt")
# onehot.ind2 <- read.delim("DataS1.rbs_independent2.txt")
# onehot.dep1 <- read.delim("DataS1.rbs_dependent1.txt")
# onehot.dep2 <- read.delim("DataS1.rbs_dependent2.txt")
onehot.ind1 <- read.delim("DataS1_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("DataS1_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("DataS1_dep1.txt", header=T, sep=" ")
onehot.dep2 <- read.delim("DataS1_dep2.txt", header=T, sep=" ")
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
# Drops the first row of the merged table. NOTE(review): reason not visible here - confirm.
onehot <- onehot[2:nrow(onehot),]
#colnames(onehot) <- c("sgRNA", "onehot.ind1", "onehot.ind2", "onehot.dep1", "onehot.dep2")
onehot$scale <- "sgRNA.raw"
# Same names as before except that the sgRNA column becomes sgRNAID, the join key.
colnames(window.score.structure.temp.gc) <- c("chr", "start", "end", "window", "sgRNAID", "cut.score", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
data.onehot <- left_join(window.score.structure.temp.gc, onehot, by=c("sgRNAID", "scale"))
#df2.melt <- melt(data.onehot[,c(5:14)], id=c("cut.score", "scale", "sgRNAID"))
# Hard-coded column range 5:370 - assumes the joined table has at least 370 columns; confirm.
df2.melt <- melt(data.onehot[,c(5:370)], id=c("cut.score", "scale", "sgRNAID"))
df2 <- na.omit(df2.melt)
# Rename df's columns (sgRNA -> sgRNAID) so that they match df2 for rbind().
colnames(df) <- c("cut.score", "scale", "sgRNAID", "variable", "value")
df <- rbind(df, df2)
library(tidyr)
# df.norna <- subset(df, df$variable != "rna.dwt")
# df.norna2 <- subset(df.norna, df.norna$variable != "rna")
# df.norna.nogene <- subset(df.norna2, df.norna2$variable != "gene.dwt")
# df.norna.nogene2 <- subset(df.norna.nogene, df.norna.nogene$variable != "gene")
# Combine variable and scale into a single feature name (no separator).
df.id <- unite(df, feature.scale, c(variable, scale), sep = "")
#df.id.na <- na.omit(df.id)
# Coerce values to numeric and drop rows whose value is missing or empty.
df.id$value <- as.numeric(df.id$value)
df.id.na <- df.id[!(is.na(df.id$value) | df.id$value == ""), ]
# Wide format: one row per (sgRNAID, cut.score), one column per feature; duplicates are averaged.
df.dcast <- dcast(
  df.id.na, sgRNAID + cut.score ~ feature.scale,
  value.var = "value", fun.aggregate = mean, na.rm = TRUE
)
#write.table(df.id.na, "ecoli.features.sgRNA.DWT.raw.onehot.sgRNAraw.txt", quote=F, row.names=F, sep="\t")
#write.table(df.dcast, "ecoli.features.sgRNA.DWT.raw.onehot.sgRNAraw.dcast.txt", quote=F, row.names=F, sep="\t")
write.table(df.id.na, "ecoli.features.sgRNA.rbs.DWT.raw.onehot.sgRNAraw.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.dcast, "ecoli.features.sgRNA.rbs.DWT.raw.onehot.sgRNAraw.dcast.txt", quote = FALSE, row.names = FALSE, sep = "\t")
nrow(df.dcast)
#[1] 44094
# Rows with every feature present.
df.dcast.na <- na.omit(df.dcast)
nrow(df.dcast.na)
#[1] 25795
# random forest
# Fit on every feature column (column 3 onward) of the complete-case table.
xmat <- df.dcast.na[, 3:ncol(df.dcast.na)]
y <- df.dcast.na$cut.score
tmp <- cbind(xmat, Y = y)
# One equal split-selection weight per predictor; mtry is the number of predictors.
wt <- rep(1 / ncol(xmat), times = ncol(xmat))
df.rf.all <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat), split.select.weights = wt,
  classification = FALSE, importance = "impurity_corrected",
  num.threads = 1, write.forest = TRUE, always.split.variables = NULL
)
# Recorded model summary:
# Type: Regression
# Number of trees: 500
# Sample size: 40468
# Number of independent variables: 52
# Mtry: 52
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 102.2165
# R squared (OOB): 0.07503481
# Correlation between the forest's stored predictions and the response.
cor(df.rf.all$predictions, y)
# 0.2766977
# Variable-importance table for df.rf.all, sorted ascending so tail() shows the
# most important features.
imp <- data.frame(importance(df.rf.all))
imp$feature <- as.character(rownames(imp))
# Sort on the numeric importance column (column 1). The previous order(imp) was
# handed the whole data frame, including the character "feature" column, so the
# rows were not in numeric order - see the non-monotone recorded output below.
imp.order <- na.omit(imp[order(imp[[1]]), ])
# Column 1 is read by position rather than through `$` partial name matching.
imp.order$imp <- as.numeric(imp.order[[1]])
tail(imp.order)
# Recorded output from a past run (produced with the old, mis-sorted ordering):
# importance.df.rf.all. feature imp
# tempraw 29503.98 tempraw 29503.98
# gcraw 38782.76 gcraw 38782.76
# sgRNA.gcsgRNA.raw 84615.19 sgRNA.gcsgRNA.raw 84615.19
# sgRNA.tempsgRNA.raw 87403.50 sgRNA.tempsgRNA.raw 87403.50
# structure.dwtd9 -10453.65 structure.dwtd9 -10453.65
# structure.dwtd13 -12437.75 structure.dwtd13 -12437.75
# 18 june 2021
# Refit with whatever tmp / wt / xmat are current in the session.
df.rf.all <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat), split.select.weights = wt,
  classification = FALSE, importance = "impurity_corrected",
  num.threads = 1, write.forest = TRUE, always.split.variables = NULL
)
# Recorded model summary:
# Type: Regression
# Number of trees: 500
# Sample size: 25795
# Number of independent variables: 422
# Mtry: 422
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 85.60527
# R squared (OOB): 0.2400421
# Correlation between the forest's stored predictions and the response.
cor(df.rf.all$predictions, y)
# 0.493704
# feature imp
# temp.dwtd13 -1016.4592327
# gene.dwtd1 -1065.5664660
# temp.dwtd6 -1122.5570922
# rna.dwtd4 -1128.1320518
# structure.dwtd10 -1198.7422774
# structure.dwtd2 -1253.7498268
# gc.dwtd13 -1274.5513324
# rna.dwtd10 -1342.8236309
# structure.dwtd13 -1351.4763685
# structure.dwtd6 -1354.2826572
# rna.dwtd6 -1486.7301791
# rna.dwtd9 -1522.7102903
# rna.dwtd13 -1546.3068285
# rna.dwtd2 -1734.5755193
# rna.dwtd5 -1907.7228666
# p18.TAsgRNA.raw 10799.7711777
# p17.AGsgRNA.raw 12119.0960680
# rna.dwtd12 12135.7327807
# p18.CGsgRNA.raw 13331.4705574
# CCsgRNA.raw 17255.5964514
# p17.GGsgRNA.raw 19170.2763147
# tempraw 22244.9079151
# gcraw 23122.0119420
# p15.CCsgRNA.raw 27624.2549475
# p18.CCsgRNA.raw 29401.6734910
# p19.CCsgRNA.raw 45821.7012868
# sgRNA.tempsgRNA.raw 45956.5966525
# sgRNA.gcsgRNA.raw 47500.1518008
# p19.ACsgRNA.raw 53745.0061203
# p19.GGsgRNA.raw 59517.7945230
# p19.AGsgRNA.raw 65351.3778560
# RBS
# random forest
# Feature columns of df.dcast, skipping columns 32 and 51 (chosen by position).
xmat <- df.dcast[, c(3:31, 33:50, 52:ncol(df.dcast))]
y <- df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
# One equal split-selection weight per predictor; mtry is the number of predictors.
wt <- rep(1 / ncol(xmat), times = ncol(xmat))
df.rf.all <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat), split.select.weights = wt,
  classification = FALSE, importance = "impurity_corrected",
  num.threads = 1, write.forest = TRUE, always.split.variables = NULL
)
df.rf.all
# Type: Regression
# Number of trees: 500
# Sample size: 44094
# Number of independent variables: 80
# Mtry: 80
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 102.5005
# R squared (OOB): 0.07729709
# Correlation between the forest's stored predictions and the response.
cor(df.rf.all$predictions, y)
# 0.27951
# Variable-importance table for df.rf.all, sorted ascending so tail() shows the
# most important features.
imp <- data.frame(importance(df.rf.all))
imp$feature <- as.character(rownames(imp))
# Sort on the numeric importance column (column 1). The previous order(imp) was
# handed the whole data frame, including the character "feature" column, so the
# rows were not in numeric order - see the non-monotone recorded output below.
imp.order <- na.omit(imp[order(imp[[1]]), ])
# Column 1 is read by position rather than through `$` partial name matching.
imp.order$imp <- as.numeric(imp.order[[1]])
tail(imp.order)
# Recorded output from a past run (produced with the old, mis-sorted ordering):
# importance.df.rf.all. feature imp
# gcraw 44937.41 gcraw 44937.41
# sgRNA.tempsgRNA.raw 93974.56 sgRNA.tempsgRNA.raw 93974.56
# sgRNA.gcsgRNA.raw 96584.47 sgRNA.gcsgRNA.raw 96584.47
# rna.dwtd10 -10297.30 rna.dwtd10 -10297.30
# rna.dwts13 -10396.14 rna.dwts13 -10396.14
# gene.dwts13 -10580.66 gene.dwts13 -10580.66
# Shell commands (not R): load the python module, source the conda loader script and
# activate the conda environment - presumably run before starting the R session below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
library(dplyr)
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Reload the wide feature table and evaluate a 70/30 split by row order.
df.dcast <- read.delim("ecoli.features.sgRNA.DWT.raw.onehot.sgRNAraw.dcast.txt", header = TRUE, sep = "\t")
nrow(df.dcast)
# 40468
nrow(df.dcast) * 0.7
# 28327.6
# Feature columns start at column 3.
# (xmat.test / y.test are the rows used for fitting, despite their names.)
xmat <- df.dcast[, 3:ncol(df.dcast)]
xmat.test <- xmat[1:28327, ]
y <- df.dcast[1:28327, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), times = ncol(xmat.test))
df.rf.all.test <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat.test), split.select.weights = wt,
  classification = FALSE, num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
# Recorded model summary:
# Type: Regression
# Number of trees: 500
# Sample size: 28327
# Number of independent variables: 52
# Mtry: 52
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 109.6261
# R squared (OOB): 0.0326039
# Correlation on the fitting rows (the forest's stored predictions).
cor(df.rf.all.test$predictions, y.test)
# 0.2464242
# Correlation on the held-out rows.
pred <- predict(df.rf.all.test, xmat[28328:40468, ])
df.dcast.pred <- df.dcast[28328:40468, ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.250226
cor(pred.df$score, pred.df$pred, method = "spearman")
# 0.2462537
cor(pred.df$score, pred.df$pred, method = "pearson")
# 0.249661
# Shell commands (not R): load the python module, source the conda loader script and
# activate the conda environment - presumably run before starting the R session below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
library(dplyr)
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Reload the wide feature table and list its columns; the listing fixes the
# positional indices used to build df.raw / df.dwt.
df.dcast <- read.delim("ecoli.features.sgRNA.DWT.raw.onehot.sgRNAraw.dcast.txt", header = TRUE, sep = "\t")
colnames(df.dcast)
# [1] "sgRNA" "cut.score"
# [3] "gc.dwtd1" "gc.dwtd10"
# [5] "gc.dwtd11" "gc.dwtd12"
# [7] "gc.dwtd13" "gc.dwtd2"
# [9] "gc.dwtd3" "gc.dwtd4"
# [11] "gc.dwtd5" "gc.dwtd6"
# [13] "gc.dwtd7" "gc.dwtd8"
# [15] "gc.dwtd9" "gc.dwts13"
# [17] "gcraw" "onehot.dep1sgRNA.raw"
# [19] "onehot.dep2sgRNA.raw" "onehot.ind1sgRNA.raw"
# [21] "onehot.ind2sgRNA.raw" "sgRNA.gcsgRNA.raw"
# [23] "sgRNA.structuresgRNA.raw" "sgRNA.tempsgRNA.raw"
# [25] "structure.dwtd1" "structure.dwtd10"
# [27] "structure.dwtd11" "structure.dwtd12"
# [29] "structure.dwtd13" "structure.dwtd2"
# [31] "structure.dwtd3" "structure.dwtd4"
# [33] "structure.dwtd5" "structure.dwtd6"
# [35] "structure.dwtd7" "structure.dwtd8"
# [37] "structure.dwtd9" "structure.dwts13"
# [39] "structureraw" "temp.dwtd1"
# [41] "temp.dwtd10" "temp.dwtd11"
# [43] "temp.dwtd12" "temp.dwtd13"
# [45] "temp.dwtd2" "temp.dwtd3"
# [47] "temp.dwtd4" "temp.dwtd5"
# [49] "temp.dwtd6" "temp.dwtd7"
# [51] "temp.dwtd8" "temp.dwtd9"
# [53] "temp.dwts13" "tempraw"
# Column groups by position, per the colnames() listing recorded above:
#   raw: 17-24 (gcraw, onehot.*, sgRNA.*), 39 (structureraw), 54 (tempraw)
#   dwt: 3-16 (gc.dwt*), 25-38 (structure.dwt*), 40-53 (temp.dwt*)
df.raw <- df.dcast[, c(17:24, 39, 54)]
df.dwt <- df.dcast[, c(3:16, 25:38, 40:53)]
# just raw
# Fit on the raw predictors only.
xmat <- df.raw
y <- df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
# One equal split-selection weight per predictor; mtry is the number of predictors
# (xmat is df.raw here, so ncol(xmat) equals ncol(df.raw)).
wt <- rep(1 / ncol(xmat), times = ncol(xmat))
df.rf.raw <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat), split.select.weights = wt,
  classification = FALSE, importance = "impurity_corrected",
  num.threads = 1, write.forest = TRUE, always.split.variables = NULL
)
# Recorded model summary:
# Type: Regression
# Number of trees: 500
# Sample size: 40468
# Number of independent variables: 10
# Mtry: 10
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 105.304
# R squared (OOB): 0.04709617
# Correlation between the forest's stored predictions and the response.
cor(df.rf.raw$predictions, y)
# 0.2282827
# Hold-out check for raw only: fit on rows 1-28327, score rows 28328-40468.
# (xmat.test / y.test are the rows used for fitting, despite their names.)
xmat <- df.raw
xmat.test <- xmat[1:28327, ]
y <- df.dcast[1:28327, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), times = ncol(xmat.test))
df.rf.test <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat.test), split.select.weights = wt,
  classification = FALSE, num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.test, xmat[28328:40468, ])
df.dcast.pred <- df.dcast[28328:40468, ]
# Observed score next to the predicted value for each held-out row.
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.2105759
# just dwt
# Fit on the DWT predictors only.
xmat = df.dwt
y = df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
# Equal split-selection weight for every predictor.
wt <- rep(1/ncol(xmat), ncol(xmat))
# mtry now follows the predictors actually in use (ncol(xmat)), as in the other
# fits in this file; it was previously ncol(df.raw), carried over from the
# raw-only section, which is why the recorded summary below shows Mtry: 10.
df.rf.dwt <- ranger(dependent.variable.name = "Y", data=tmp, num.trees=500, split.select.weights = wt, classification=F, mtry=ncol(xmat), importance="impurity_corrected", num.threads=1, write.forest=T, always.split.variables=NULL)
# Recorded model summary (from the run made with mtry = ncol(df.raw)):
# Type: Regression
# Number of trees: 500
# Sample size: 40468
# Number of independent variables: 42
# Mtry: 10
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 107.8557
# R squared (OOB): 0.02400484
# Correlation between the forest's stored predictions and the response.
cor(df.rf.dwt$predictions,y)
# 0.1698784
# Hold-out check for DWT only: fit on rows 1-28327, score rows 28328-40468.
# (xmat.test / y.test are the rows used for fitting, despite their names.)
xmat <- df.dwt
xmat.test <- xmat[1:28327, ]
y <- df.dcast[1:28327, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), times = ncol(xmat.test))
df.rf.test <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat.test), split.select.weights = wt,
  classification = FALSE, num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.test, xmat[28328:40468, ])
df.dcast.pred <- df.dcast[28328:40468, ]
# Observed score next to the predicted value for each held-out row.
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
#
# raw + dwt
# Fit on the raw and DWT predictors together.
xmat = cbind(df.raw, df.dwt)
y = df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
# Equal split-selection weight for every predictor.
wt <- rep(1/ncol(xmat), ncol(xmat))
# mtry now follows the predictors actually in use (ncol(xmat)), as in the other
# fits in this file; it was previously ncol(df.raw), carried over from the
# raw-only section, which is why the recorded summary below shows Mtry: 10.
df.rf.all <- ranger(dependent.variable.name = "Y", data=tmp, num.trees=500, split.select.weights = wt, classification=F, mtry=ncol(xmat), importance="impurity_corrected", num.threads=1, write.forest=T, always.split.variables=NULL)
# Recorded model summary (from the run made with mtry = ncol(df.raw)):
# Type: Regression
# Number of trees: 500
# Sample size: 40468
# Number of independent variables: 52
# Mtry: 10
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 101.6542
# R squared (OOB): 0.08012298
# Correlation between the forest's stored predictions and the response.
cor(df.rf.all$predictions,y)
# 0.2839779
# Hold-out check for raw + DWT: fit on rows 1-28327, score rows 28328-40468.
# (xmat.test / y.test are the rows used for fitting, despite their names.)
xmat <- cbind(df.raw, df.dwt)
xmat.test <- xmat[1:28327, ]
y <- df.dcast[1:28327, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), times = ncol(xmat.test))
df.rf.test <- ranger(
  data = tmp, dependent.variable.name = "Y",
  num.trees = 500, mtry = ncol(xmat.test), split.select.weights = wt,
  classification = FALSE, num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.test, xmat[28328:40468, ])
df.dcast.pred <- df.dcast[28328:40468, ]
# Observed score next to the predicted value for each held-out row.
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# Environment setup (shell): load the python module, source the
# loadcondaandes.sh helper script, then activate the conda environment
# located at .../Libraries/andes/envs/test.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Load the E. coli per-sgRNA feature tables and split the wide (dcast) table
# into feature groups by column position.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
library(ranger)
df.id <- read.delim(
  "ecoli.features.sgRNA.rbs.DWT.raw.onehot.sgRNAraw.txt",
  header = FALSE, sep = "\t"
)
# Rows with any missing value are dropped immediately after reading.
df.dcast <- na.omit(read.delim(
  "ecoli.features.sgRNA.rbs.DWT.raw.onehot.sgRNAraw.dcast.txt",
  header = TRUE, sep = "\t"
))
# Column positions below are tied to the layout of the dcast file.
df.raw <- df.dcast[, c(28, 44, 386, 404, 421)]
df.dwt <- df.dcast[, c(14:27, 30:43, 372:385, 390:403, 407:420)]
df.onehot <- df.dcast[, c(3:13, 29, 45:371, 405:406, 422:424)]
# just raw
# Random forest on the raw feature columns only, fitted on every row with
# corrected-impurity importance.
xmat <- df.raw
y <- df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
# Equal split-selection weight for every predictor column.
wt <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.raw <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(xmat),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
df.rf.raw
# Type: Regression
# Number of trees: 500
# Sample size: 25795
# Number of independent variables: 5
# Mtry: 5
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 114.9188
# R squared (OOB): -0.02018755
cor(df.rf.raw$predictions, y)
#0.07611119
### prediction
# 70/30 split by row order. The cut point is floored to a whole row and the
# hold-out range is written (test + 1):nrow(xmat): the earlier
# test+1:nrow(xmat) parsed as test + (1:nrow(xmat)), indexed past the last
# row, and relied on na.omit() to discard the resulting all-NA rows.
# df.dcast was already passed through na.omit() when it was loaded, so no
# further NA filtering is needed on the hold-out rows.
xmat <- df.raw
test <- floor(0.7 * nrow(xmat))
# First `test` rows are the fitting set (stored in xmat.test / y.test despite
# the names).
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
df.rf.raw.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = ncol(xmat.test), num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.raw.test, xmat[(test + 1):nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[(test + 1):nrow(xmat), ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.04919696
# just onehot
# Random forest on the one-hot feature columns only, fitted on every row with
# corrected-impurity importance.
xmat <- df.onehot
y <- df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
# Equal split-selection weight for every predictor column.
wt <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.raw <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(xmat),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
df.rf.raw
# NOTE(review): the recorded "Mtry: 5" below does not match mtry = ncol(xmat)
# with 344 predictors; the header may have been pasted from another run - confirm.
# Type: Regression
# Number of trees: 500
# Sample size: 25795
# Number of independent variables: 344
# Mtry: 5
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 100.9347
# R squared (OOB): 0.1039559
cor(df.rf.raw$predictions, y)
# 0.4357692
### prediction
# 70/30 split by row order. The cut point is floored to a whole row and the
# hold-out range is written (test + 1):nrow(xmat): the earlier
# test+1:nrow(xmat) parsed as test + (1:nrow(xmat)), indexed past the last
# row, and relied on na.omit() to discard the resulting all-NA rows.
# df.dcast was already passed through na.omit() when it was loaded, so no
# further NA filtering is needed on the hold-out rows.
xmat <- df.onehot
test <- floor(0.7 * nrow(xmat))
# First `test` rows are the fitting set (stored in xmat.test / y.test despite
# the names).
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
df.rf.raw.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = ncol(xmat.test), num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.raw.test, xmat[(test + 1):nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[(test + 1):nrow(xmat), ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.4682523
# raw + onehot
# Random forest on the raw and one-hot feature columns together, fitted on
# every row with corrected-impurity importance.
xmat <- cbind(df.raw, df.onehot)
y <- df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
# Equal split-selection weight for every predictor column.
wt <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.raw <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(xmat),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
df.rf.raw
# NOTE(review): the recorded "Mtry: 5" below does not match mtry = ncol(xmat)
# with 349 predictors; the header may have been pasted from another run - confirm.
# Type: Regression
# Number of trees: 500
# Sample size: 25795
# Number of independent variables: 349
# Mtry: 5
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 100.4641
# R squared (OOB): 0.1081335
cor(df.rf.raw$predictions, y)
# 0.4498771
### prediction
# 70/30 split by row order. The cut point is floored to a whole row and the
# hold-out range is written (test + 1):nrow(xmat): the earlier
# test+1:nrow(xmat) parsed as test + (1:nrow(xmat)), indexed past the last
# row, and relied on na.omit() to discard the resulting all-NA rows.
# df.dcast was already passed through na.omit() when it was loaded, so no
# further NA filtering is needed on the hold-out rows.
xmat <- cbind(df.raw, df.onehot)
test <- floor(0.7 * nrow(xmat))
# First `test` rows are the fitting set (stored in xmat.test / y.test despite
# the names).
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
df.rf.raw.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = ncol(xmat.test), num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
df.rf.raw.test
# Type: Regression
# Number of trees: 500
# Sample size: 18056
# Number of independent variables: 349
# Mtry: 349
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 85.01476
# R squared (OOB): 0.2624812
pred <- predict(df.rf.raw.test, xmat[(test + 1):nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[(test + 1):nrow(xmat), ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.4866023
# just dwt
# Random forest on the DWT feature columns only, fitted on every row with
# corrected-impurity importance.
xmat <- df.dwt
y <- df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
# Equal split-selection weight for every predictor column.
wt <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.raw <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(xmat),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
df.rf.raw
# Type: Regression
# Number of trees: 500
# Sample size: 25795
# Number of independent variables: 70
# Mtry: 70
# Target node size: 5
# Variable importance mode: impurity_corrected
# Splitrule: variance
# OOB prediction error (MSE): 110.9831
# R squared (OOB): 0.0147512
cor(df.rf.raw$predictions, y)
# 0.147607
### prediction
# 70/30 split by row order. The cut point is floored to a whole row and the
# hold-out range is written (test + 1):nrow(xmat): the earlier
# test+1:nrow(xmat) parsed as test + (1:nrow(xmat)), indexed past the last
# row, and relied on na.omit() to discard the resulting all-NA rows.
# df.dcast was already passed through na.omit() when it was loaded, so no
# further NA filtering is needed on the hold-out rows.
xmat <- df.dwt
test <- floor(0.7 * nrow(xmat))
# First `test` rows are the fitting set (stored in xmat.test / y.test despite
# the names).
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
df.rf.raw.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = ncol(xmat.test), num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.raw.test, xmat[(test + 1):nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[(test + 1):nrow(xmat), ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.08021816
# all
# Random forest on the raw, one-hot and DWT feature columns together, fitted
# on every row with corrected-impurity importance.
xmat <- cbind(df.raw, df.onehot, df.dwt)
y <- df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
# Equal split-selection weight for every predictor column.
wt <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.raw <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(xmat),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
df.rf.raw
# NOTE(review): the recorded header below reports "Sample size: 18056" and
# "Variable importance mode: none", which do not match this all-row call with
# importance = "impurity_corrected"; it may belong to the 70% fit - confirm.
# Type: Regression
# Number of trees: 500
# Sample size: 18056
# Number of independent variables: 419
# Mtry: 419
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 86.80522
# R squared (OOB): 0.2469486
cor(df.rf.raw$predictions, y)
# 0.4017785
### prediction
# 70/30 split by row order. The cut point is floored to a whole row and the
# hold-out range is written (test + 1):nrow(xmat): the earlier
# test+1:nrow(xmat) parsed as test + (1:nrow(xmat)), indexed past the last
# row, and relied on na.omit() to discard the resulting all-NA rows.
# df.dcast was already passed through na.omit() when it was loaded, so no
# further NA filtering is needed on the hold-out rows.
xmat <- cbind(df.raw, df.onehot, df.dwt)
test <- floor(0.7 * nrow(xmat))
# First `test` rows are the fitting set (stored in xmat.test / y.test despite
# the names).
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
df.rf.raw.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = ncol(xmat.test), num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
pred <- predict(df.rf.raw.test, xmat[(test + 1):nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[(test + 1):nrow(xmat), ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.465957
# Environment setup (shell): load the python module, source the
# loadcondaandes.sh helper script, then activate the conda environment
# located at .../Libraries/andes/envs/test.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Packages used for the feature construction below.
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Per-window (500 bp) input tables.
gene <- read.delim(
  "ecoli.gene.windows500.bed",
  header = FALSE, sep = "\t", stringsAsFactors = FALSE
)
structure <- read.delim(
  "500bp.protein.structure.fa",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
nuc <- read.delim(
  "nucleotide_counts_500bp_temp.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
rnaseq <- read.delim(
  "ecoli.rnaseq.average.windows500.bed",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
window <- read.delim(
  "ecoli.500bp.windows.bed",
  header = FALSE, sep = "\t", stringsAsFactors = FALSE
)
#score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
# sgRNA coordinates and scores; only chr/start/end/sgRNA/cut.score are kept.
score <- read.delim(
  "sgRNA.coord.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
colnames(score) <- c(
  "chr", "start", "end", "sgRNA", "id", "seq", "id2",
  "cut.score", "gid", "change.val", "quality"
)
score.df <- score[, c(1:4, 8)]
# Count the rows of `gene` that share each (V1, V2, V3) key, then keep one
# row per distinct key/count (column 14 is presumably the new gene.count
# column - confirm against the bed file width).
gene.bin <- gene %>%
  group_by(V1, V2, V3) %>%
  mutate(gene.count = n())
gene.count <- unique(gene.bin[, c(1:3, 14)])
# One vector per signal, taken by column position from the tables above.
gene.df <- gene.count$gene.count
structure.df <- structure[, 2]
gc.df <- nuc[, 7]
temp.df <- nuc[, 8]
rna.df <- rnaseq[, 4]
# Run DWT instead of CWT
# For each signal: run wavMODWT(), convert the result to a matrix, move its
# row names into a `label` column, and split that label on runs of
# non-alphanumeric characters into two parts that are then renamed to
# scale / window.

# temp
temp.modwt <- wavMODWT(temp.df)
temp.modwt.df <- as.matrix(temp.modwt)
temp.modwt.label <- data.frame(label = row.names(temp.modwt.df), temp.modwt.df)
temp.modwt.name <- temp.modwt.label %>%
  separate(col = label, into = c("name", "number"), sep = "[^[:alnum:]]+")
colnames(temp.modwt.name) <- c("scale", "window", "temp.dwt")

# gc
gc.modwt <- wavMODWT(gc.df)
gc.modwt.df <- as.matrix(gc.modwt)
gc.modwt.label <- data.frame(label = row.names(gc.modwt.df), gc.modwt.df)
gc.modwt.name <- gc.modwt.label %>%
  separate(col = label, into = c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gc.modwt.name) <- c("scale", "window", "gc.dwt")

# structure
structure.modwt <- wavMODWT(structure.df)
structure.modwt.df <- as.matrix(structure.modwt)
structure.modwt.label <- data.frame(label = row.names(structure.modwt.df), structure.modwt.df)
structure.modwt.name <- structure.modwt.label %>%
  separate(col = label, into = c("name", "number"), sep = "[^[:alnum:]]+")
colnames(structure.modwt.name) <- c("scale", "window", "structure.dwt")

# rna
rna.modwt <- wavMODWT(rna.df)
rna.modwt.df <- as.matrix(rna.modwt)
rna.modwt.label <- data.frame(label = row.names(rna.modwt.df), rna.modwt.df)
rna.modwt.name <- rna.modwt.label %>%
  separate(col = label, into = c("name", "number"), sep = "[^[:alnum:]]+")
colnames(rna.modwt.name) <- c("scale", "window", "rna.dwt")

# gene
gene.modwt <- wavMODWT(gene.df)
gene.modwt.df <- as.matrix(gene.modwt)
gene.modwt.label <- data.frame(label = row.names(gene.modwt.df), gene.modwt.df)
gene.modwt.name <- gene.modwt.label %>%
  separate(col = label, into = c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gene.modwt.name) <- c("scale", "window", "gene.dwt")
# Number the 500 bp windows from 0 and store the index as character so it can
# be joined against the `window` column of the DWT tables.
colnames(window) <- c("chr", "start", "end")
window$window <- as.character(seq.int(nrow(window)) - 1)
library(tidygenomics)
# Overlap sgRNA intervals with windows, then attach sgRNA / cut.score to the
# window table (columns 2:4 of the intersect result are used for that).
window.score <- genome_intersect(window, score.df, by = c("chr", "start", "end"))
window.score.df <- left_join(window, window.score[, 2:4], by = "window")
# Chain the five DWT tables on, one signal at a time; after the first join the
# key is (window, scale).
window.score.temp <- left_join(window.score.df, temp.modwt.name, by = "window")
window.temp.gc <- left_join(
  window.score.temp, gc.modwt.name,
  by = c("window", "scale")
)
window.temp.gc.structure <- left_join(
  window.temp.gc, structure.modwt.name,
  by = c("window", "scale")
)
window.temp.gc.structure.rna <- left_join(
  window.temp.gc.structure, rna.modwt.name,
  by = c("window", "scale")
)
window.temp.gc.structure.rna.gene <- left_join(
  window.temp.gc.structure.rna, gene.modwt.name,
  by = c("window", "scale")
)
# The comparison yields NA where cut.score is missing, and subset() drops
# rows whose condition is NA, so only rows with a score remain.
window.temp.gc.structure.rna.gene.sgRNA <- subset(
  window.temp.gc.structure.rna.gene,
  window.temp.gc.structure.rna.gene$cut.score != "NA"
)
# Long format: one row per (sgRNA, cut.score, scale, feature).
df.melt <- melt(
  window.temp.gc.structure.rna.gene.sgRNA[, 5:12],
  id = c("cut.score", "scale", "sgRNA")
)
df <- na.omit(df.melt)
## add CWT
# Continuous wavelet transform ('mexh' wavelet, odd scales 1..99) of each
# signal, melted to long format and joined onto the sgRNA/window table.
library(MassSpecWavelet)
scales <- seq(1, 100, 2)
wCoefs.temp <- cwt(temp.df, scales = scales, wavelet = "mexh")
wCoefs.gc <- cwt(gc.df, scales = scales, wavelet = "mexh")
wCoefs.structure <- cwt(structure.df, scales = scales, wavelet = "mexh")
wCoefs.rna <- cwt(rna.df, scales = scales, wavelet = "mexh")
wCoefs.gene <- cwt(gene.df, scales = scales, wavelet = "mexh")

wCoefs.temp.melt <- melt(wCoefs.temp)
colnames(wCoefs.temp.melt) <- c("window", "scale", "temp.cwt")
wCoefs.gc.melt <- melt(wCoefs.gc)
colnames(wCoefs.gc.melt) <- c("window", "scale", "gc.cwt")
wCoefs.structure.melt <- melt(wCoefs.structure)
colnames(wCoefs.structure.melt) <- c("window", "scale", "structure.cwt")
wCoefs.rna.melt <- melt(wCoefs.rna)
colnames(wCoefs.rna.melt) <- c("window", "scale", "rna.cwt")
wCoefs.gene.melt <- melt(wCoefs.gene)
colnames(wCoefs.gene.melt) <- c("window", "scale", "gene.cwt")

# NOTE(review): window ids in window.score start at 0 (they were built as
# row number - 1), while the first melt() column is presumably a 1-based row
# index of the coefficient matrix - confirm this join is not shifted by one
# window.
window.score$window <- as.numeric(window.score$window)
window.score.temp <- left_join(window.score, wCoefs.temp.melt, by = "window")
window.temp.gc <- left_join(
  window.score.temp, wCoefs.gc.melt,
  by = c("window", "scale")
)
window.temp.gc.structure <- left_join(
  window.temp.gc, wCoefs.structure.melt,
  by = c("window", "scale")
)
window.temp.gc.structure.rna <- left_join(
  window.temp.gc.structure, wCoefs.rna.melt,
  by = c("window", "scale")
)
window.temp.gc.structure.rna.gene <- left_join(
  window.temp.gc.structure.rna, wCoefs.gene.melt,
  by = c("window", "scale")
)
# Rows with a missing cut.score make the condition NA and are dropped by subset().
window.temp.gc.structure.rna.gene.sgRNA <- subset(
  window.temp.gc.structure.rna.gene,
  window.temp.gc.structure.rna.gene$cut.score != "NA"
)
window.temp.gc.structure.rna.gene.sgRNA.na <- na.omit(
  window.temp.gc.structure.rna.gene.sgRNA[, c(3, 4, 7:12)]
)
df2.melt <- melt(
  window.temp.gc.structure.rna.gene.sgRNA.na,
  id = c("cut.score", "scale", "sgRNA")
)
df2 <- na.omit(df2.melt)
# Append the CWT rows to the long table that already holds the DWT rows.
df <- rbind(df, df2)
# Trial cast of the CWT rows alone into one column per feature+scale.
df2.test <- df2 %>%
  unite(feature.scale, c(variable, scale), sep = "")
df2.test$value <- as.numeric(df2.test$value)
df2.test.dcast <- df2.test %>%
  dcast(
    sgRNA + cut.score ~ feature.scale,
    value.var = "value", fun.aggregate = mean, na.rm = TRUE
  )
### why does this work but when I add the rest of the data for sgRNA it doesn't work anymore?? --> don't have data for all sgRNA?? Doesn't make sense because every sgRNA should overlap a 500bp bin which would have cwt and dwt data...
## add 500bp raw data
# Give each per-window table a 0-based character `window` key and tag its
# rows with scale "raw" so they can be joined and stacked with the wavelet rows.
structure$window <- as.character(seq.int(nrow(structure)) - 1)
structure$scale <- "raw"
gene.count$window <- as.character(seq.int(nrow(gene.count)) - 1)
gene.count$scale <- "raw"
nuc$window <- as.character(seq.int(nrow(nuc)) - 1)
nuc$scale <- "raw"
rnaseq$window <- as.character(seq.int(nrow(rnaseq)) - 1)
rnaseq$scale <- "raw"
# Join the untransformed signals onto the sgRNA/window table. Columns are
# picked by position; the names assigned afterwards give the intended
# meaning (temp, gc, gene, rna, structure).
window.score.temp <- left_join(window.score.df, nuc[, 8:10], by = "window")
window.temp.gc <- left_join(
  window.score.temp, nuc[, c(7, 9:10)],
  by = c("window", "scale")
)
window.temp.gc.gene <- left_join(
  window.temp.gc, gene.count[, c(4:6)],
  by = c("window", "scale")
)
window.temp.gc.gene.rna <- left_join(
  window.temp.gc.gene, rnaseq[, c(4:6)],
  by = c("window", "scale")
)
window.temp.gc.gene.rna.structure <- left_join(
  window.temp.gc.gene.rna, structure[, c(2, 22, 23)],
  by = c("window", "scale")
)
colnames(window.temp.gc.gene.rna.structure) <- c(
  "chr", "start", "end", "window", "sgRNA", "cut.score",
  "temp", "scale", "gc", "gene", "rna", "structure"
)
df2.melt <- melt(
  window.temp.gc.gene.rna.structure[, 5:12],
  id = c("cut.score", "scale", "sgRNA")
)
df2 <- na.omit(df2.melt)
# Append the raw-window rows to the long feature table.
df <- rbind(df, df2)
# add sgRNA raw data
# Per-sgRNA structure and nucleotide/temperature tables. Note that this
# overwrites the window-level `structure` and `nuc` objects.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
structure <- read.delim(
  "ecoli.gRNA.structure.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
nuc <- read.delim(
  "gRNA_nuc_counts_temp.txt",
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)
# Tag the rows as sgRNA-level and copy the first column into an `sgRNA` key.
structure$scale <- "sgRNA.raw"
structure$sgRNA <- structure[, 1]
nuc$scale <- "sgRNA.raw"
nuc$sgRNA <- nuc[, 1]
window.score.structure <- left_join(
  window.score.df, structure[, c(2, 22, 23)],
  by = "sgRNA"
)
window.score.structure.temp <- left_join(
  window.score.structure, nuc[, 8:10],
  by = c("sgRNA", "scale")
)
window.score.structure.temp.gc <- left_join(
  window.score.structure.temp, nuc[, c(7, 9:10)],
  by = c("sgRNA", "scale")
)
colnames(window.score.structure.temp.gc) <- c(
  "chr", "start", "end", "window", "sgRNA", "cut.score",
  "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc"
)
## add one-hot encoding of sequence
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
onehot.ind1 <- read.delim("DataS1_independent1.txt")
onehot.ind2 <- read.delim("DataS1_independent2.txt")
onehot.dep1 <- read.delim("DataS1_dependent1.txt")
onehot.dep2 <- read.delim("DataS1_dependent2.txt")
# Merge the four tables on sgRNAID, keeping ids present in any of them.
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by = "sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by = "sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by = "sgRNAID")
# The first row of the merged table is discarded.
onehot <- onehot[2:nrow(onehot), ]
colnames(onehot) <- c("sgRNA", "onehot.ind1", "onehot.ind2", "onehot.dep1", "onehot.dep2")
onehot$scale <- "sgRNA.raw"
data.onehot <- left_join(
  window.score.structure.temp.gc, onehot,
  by = c("sgRNA", "scale")
)
df2.melt <- melt(
  data.onehot[, c(5:14)],
  id = c("cut.score", "scale", "sgRNA")
)
df2 <- na.omit(df2.melt)
# Append the sgRNA-level rows (structure, temp, gc, one-hot) to the long table.
df <- rbind(df, df2)
library(tidyr)
# Drop every rna / gene feature (raw, DWT and CWT variants) from the long table.
df.nogene.norna <- subset(
  df,
  df$variable != "rna.dwt" &
    df$variable != "rna" &
    df$variable != "gene.dwt" &
    df$variable != "gene" &
    df$variable != "rna.cwt" &
    df$variable != "gene.cwt"
)
# Fuse feature name and scale into a single column id, coerce values to
# numeric, and remove rows whose value is missing or empty.
df.id <- df.nogene.norna %>%
  unite(feature.scale, c(variable, scale), sep = "")
df.id$value <- as.numeric(df.id$value)
df.id.na <- df.id[!(is.na(df.id$value) | df.id$value == ""), ]
# Wide table: one row per sgRNA/cut.score, one column per feature.scale,
# averaging where several values fall in the same cell.
df.dcast <- df.id.na %>%
  dcast(
    sgRNA + cut.score ~ feature.scale,
    value.var = "value", fun.aggregate = mean, na.rm = TRUE
  )
# 40468
df.dcast.na <- na.omit(df.dcast)
# 25802
write.table(
  df.id.na, "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)
write.table(
  df.dcast, "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.dcast.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.dcast.txt", header = TRUE, sep = "\t")
# random forest
# Forest on every feature column of the NA-free wide table (columns 3
# onwards; columns 1-2 are sgRNA and cut.score), with corrected-impurity
# importance.
library(ranger)
xmat <- df.dcast.na[, 3:ncol(df.dcast.na)]
y <- df.dcast.na$cut.score
tmp <- cbind(xmat, Y = y)
# Equal split-selection weight for every predictor column.
wt <- rep(1 / ncol(xmat), ncol(xmat))
df.rf.all <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE, mtry = ncol(xmat),
  importance = "impurity_corrected", num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
cor(df.rf.all$predictions, y)
# 0.2674121
# Importance table sorted ascending by the numeric importance column.
# The earlier imp[order(imp), ] ordered on the whole data frame (numeric and
# character columns together) rather than on the importance values, and
# needed na.omit() to clean up the rows it produced; the column is now
# addressed explicitly instead of through the partial name
# `importance.df.rf.`.
imp <- data.frame(importance(df.rf.all))
imp$feature <- as.character(rownames(imp))
imp.order <- imp[order(imp[[1]]), ]
imp.order$imp <- as.numeric(imp.order[[1]])
tail(imp.order)
# Output recorded with the earlier ordering (rows other than the largest
# values may differ once sorted numerically):
# importance.df.rf.all. feature imp
# temp.dwtd10 -3732.554 temp.dwtd10 -3732.554
# onehot.ind2sgRNA.raw 11888.652 onehot.ind2sgRNA.raw 11888.652
# gcraw 17292.429 gcraw 17292.429
# tempraw 17834.742 tempraw 17834.742
# sgRNA.tempsgRNA.raw 59200.137 sgRNA.tempsgRNA.raw 59200.137
# sgRNA.gcsgRNA.raw 62665.835 sgRNA.gcsgRNA.raw 62665.835
# salloc -A SYB105 -N 2 -t 4:00:00
# Environment setup (shell): load the python module, source the
# loadcondaandes.sh helper script, then activate the conda environment
# located at .../Libraries/andes/envs/test.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Start an interactive R session; the lines that follow are R code.
R
library(dplyr)
library(reshape2)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Existing long-format feature table, with its columns renamed to the
# layout used for stacking below.
df.id <- read.delim(
  "ecoli.features.sgRNA.rbs.DWT.raw.onehot.sgRNAraw.txt",
  header = TRUE, sep = "\t"
)
colnames(df.id) <- c("cut.score", "feature.scale", "sgRNAID", "value")
# Thermal tensor values in long format; tagged as scale "raw" and given a
# combined position+variable+scale feature id.
tensor <- read.delim(
  "ecoli.sgRNA.thermal.tensors.melt.txt",
  header = TRUE, sep = "\t"
)
tensor$scale <- "raw"
tensor.id <- tensor %>%
  unite(feature.scale, c(position, variable, scale), sep = "")
tensor.id$value <- as.numeric(tensor.id$value)
# Filtered copy without missing/empty values; the join below uses tensor.id,
# not this filtered copy.
tensor.id.na <- tensor.id[!(is.na(tensor.id$value) | tensor.id$value == ""), ]
# Attach each sgRNA's cut.score, then reorder the columns by position so
# the table can be stacked under df.id.
df.score <- unique(df.id[, c(1, 3)])
tensor.score <- left_join(tensor.id, df.score, by = "sgRNAID")
tensor.score.order <- tensor.score[, c(5, 2, 1, 4)]
head(df.id)
head(tensor.score.order)
tensor.df <- rbind(df.id, tensor.score.order)
# Wide table: one row per sgRNAID/cut.score, one column per feature.scale.
df.dcast <- tensor.df %>%
  dcast(
    sgRNAID + cut.score ~ feature.scale,
    value.var = "value", fun.aggregate = mean, na.rm = TRUE
  )
# 60369
df.dcast.na <- na.omit(df.dcast)
# 25795
write.table(
  tensor.df, "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)
write.table(
  df.dcast.na, "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)
# Environment setup (shell): load the python module, source the
# loadcondaandes.sh helper script, then activate the conda environment
# located at .../Libraries/andes/envs/test.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Load the wide feature table that includes the thermal tensor columns and
# split it into feature groups by column position.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
#df.id <- read.delim("ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.txt", header=F, sep="\t")
# Rows with any missing value are dropped immediately after reading.
df.dcast <- na.omit(read.delim(
  "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt",
  header = TRUE, sep = "\t"
))
colnames(df.dcast)
# Column positions below are tied to the layout of the dcast file.
df.raw <- df.dcast[, c(28, 44, 1626:1629, 1644, 1661)]
df.dwt <- df.dcast[, c(14:27, 30:43, 1612:1625, 1630:1643, 1647:1660)]
df.onehot <- df.dcast[, c(
  3:13, 29, 45:83, 146:161, 225:239, 302:317, 380:395, 458:473, 536:551,
  614:629, 692:707, 770:785, 910:929, 1054:1073, 1136:1155, 1218:1237,
  1300:1315, 1378:1393, 1456:1471, 1534:1549, 1645, 1646, 1662:1664
)]
df.thermal <- df.dcast[, c(
  84:145, 162:224, 240:301, 318:379, 396:457, 474:535, 552:613, 630:691,
  708:769, 786:909, 930:1053, 1074:1135, 1156:1217, 1238:1299, 1316:1377,
  1394:1455, 1472:1533, 1550:1611
)]
library(ranger)
# raw (raw + onehot + thermal)
# Forest on the raw, one-hot and thermal columns: fit on the first 70% of
# rows, score the remaining rows.
# The cut point is floored to a whole row and the hold-out range is written
# (test + 1):nrow(xmat): the earlier test+1:nrow(xmat) parsed as
# test + (1:nrow(xmat)), indexed past the last row, and relied on na.omit()
# to discard the resulting all-NA rows. df.dcast was already passed through
# na.omit() when it was loaded, so no further NA filtering is needed.
xmat <- cbind(df.raw, df.onehot, df.thermal)
test <- floor(0.7 * nrow(xmat))
# First `test` rows are the fitting set (stored in xmat.test / y.test despite
# the names).
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
# Equal split-selection weight for every predictor column.
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
# mtry is 10% of the predictors, floored to a whole number (matches the
# recorded Mtry below).
df.rf.raw.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = floor(0.1 * ncol(xmat.test)), num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
df.rf.raw.test
# Type: Regression
# Number of trees: 500
# Sample size: 18056
# Number of independent variables: 1592
# Mtry: 159
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 83.84774
# R squared (OOB): 0.2726053
cor(df.rf.raw.test$predictions, y.test)
# 0.5267411
pred <- predict(df.rf.raw.test, xmat[(test + 1):nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[(test + 1):nrow(xmat), ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.5075938
# just thermal
# Forest on the thermal columns only: fit on the first 70% of rows, score the
# remaining rows.
# The cut point is floored to a whole row and the hold-out range is written
# (test + 1):nrow(xmat): the earlier test+1:nrow(xmat) parsed as
# test + (1:nrow(xmat)), indexed past the last row, and relied on na.omit()
# to discard the resulting all-NA rows. df.dcast was already passed through
# na.omit() when it was loaded, so no further NA filtering is needed.
xmat <- df.thermal
test <- floor(0.7 * nrow(xmat))
# First `test` rows are the fitting set (stored in xmat.test / y.test despite
# the names).
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
# Equal split-selection weight for every predictor column.
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
# mtry is 10% of the predictors, floored to a whole number (matches the
# recorded Mtry below).
df.rf.thermal.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = floor(0.1 * ncol(xmat.test)), num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
df.rf.thermal.test
# Type: Regression
# Number of trees: 500
# Sample size: 18056
# Number of independent variables: 1241
# Mtry: 124
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 86.92974
# R squared (OOB): 0.2458684
cor(df.rf.thermal.test$predictions, y.test)
# 0.4992982
pred <- predict(df.rf.thermal.test, xmat[(test + 1):nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[(test + 1):nrow(xmat), ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.4719731
# all
# Forest on the raw, one-hot, thermal and DWT columns: fit on the first 70%
# of rows, score the remaining rows.
# The cut point is floored to a whole row and the hold-out range is written
# (test + 1):nrow(xmat): the earlier test+1:nrow(xmat) parsed as
# test + (1:nrow(xmat)), indexed past the last row, and relied on na.omit()
# to discard the resulting all-NA rows. df.dcast was already passed through
# na.omit() when it was loaded, so no further NA filtering is needed.
xmat <- cbind(df.raw, df.onehot, df.thermal, df.dwt)
test <- floor(0.7 * nrow(xmat))
# First `test` rows are the fitting set (stored in xmat.test / y.test despite
# the names).
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
# Equal split-selection weight for every predictor column.
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
# mtry is 10% of the predictors, floored to a whole number (matches the
# recorded Mtry below).
df.rf.all.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = floor(0.1 * ncol(xmat.test)), num.threads = 1, write.forest = TRUE,
  always.split.variables = NULL
)
df.rf.all.test
# Type: Regression
# Number of trees: 500
# Sample size: 18056
# Number of independent variables: 1662
# Mtry: 166
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 83.86077
# R squared (OOB): 0.2724922
cor(df.rf.all.test$predictions, y.test)
# 0.5286732
pred <- predict(df.rf.all.test, xmat[(test + 1):nrow(xmat), ])
pred$predictions
df.dcast.pred <- df.dcast[(test + 1):nrow(xmat), ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.5112806
# Environment setup (shell): load the python module, source the
# loadcondaandes.sh helper script, then activate the conda environment
# located at .../Libraries/andes/envs/test.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Load the wide feature table (with thermal tensor columns) and split it into
# feature groups by column position. Every group starts with column 2
# (cut.score), so each group carries its own copy of the response.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
#df.id <- read.delim("ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.txt", header=F, sep="\t")
# Rows with any missing value are dropped immediately after reading.
df.dcast <- na.omit(read.delim(
  "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt",
  header = TRUE, sep = "\t"
))
#colnames(df.dcast)
df.raw <- df.dcast[, c(2, 28, 44, 1626:1629, 1644, 1661)]
df.dwt <- df.dcast[, c(2, 14:27, 30:43, 1612:1625, 1630:1643, 1647:1660)]
df.onehot <- df.dcast[, c(
  2,
  3:13, 29, 45:83, 146:161, 225:239, 302:317, 380:395, 458:473, 536:551,
  614:629, 692:707, 770:785, 910:929, 1054:1073, 1136:1155, 1218:1237,
  1300:1315, 1378:1393, 1456:1471, 1534:1549, 1645, 1646, 1662:1664
)]
df.thermal <- df.dcast[, c(
  2,
  84:145, 162:224, 240:301, 318:379, 396:457, 474:535, 552:613, 630:691,
  708:769, 786:909, 930:1053, 1074:1135, 1156:1217, 1238:1299, 1316:1377,
  1394:1455, 1472:1533, 1550:1611
)]
# raw (raw + onehot + thermal)
# Gradient-boosted regression of cut.score on the raw, one-hot and thermal
# feature groups, using all rows.
# df.raw, df.onehot and df.thermal each start with cut.score, so after
# cbind() the response appears three times. Dropping only the first column
# (data[, 2:ncol(data)]) left the other copies among the predictors, which
# is why cut.score dominated the recorded relative influence. Every copy is
# now excluded from the predictor table, whether or not R has renamed the
# duplicates to cut.score.1, cut.score.2, ...
data <- cbind(df.raw, df.onehot, df.thermal)
is.score.col <- grepl("^cut\\.score(\\.[0-9]+)?$", names(data))
xmat <- data[, !is.score.col]
# Response taken from the first column of data (the cut.score copy from df.raw).
xmat.score <- data[, 1]
library(gbm)
gbm.df <- gbm(
  formula = xmat.score ~ ., data = xmat, distribution = "gaussian",
  n.trees = 500, shrinkage = 0.1,
  interaction.depth = 3, bag.fraction = 0.2, train.fraction = 0.8,
  n.minobsinnode = 10, cv.folds = 5, keep.data = TRUE,
  verbose = FALSE, n.cores = 1
)
# Number of trees chosen by the out-of-bag estimate, then by cross-validation;
# the cross-validated choice is the one used for the summary.
best.iter <- gbm.perf(gbm.df, method = "OOB")
print(best.iter)
best.iter <- gbm.perf(gbm.df, method = "cv")
print(best.iter)
head(summary(gbm.df, n.trees = best.iter))
# Output recorded BEFORE the fix, when cut.score was still among the predictors:
# var rel.inf
# cut.score cut.score 9.999222e+01
# rnaraw rnaraw 4.404580e-04
# gcraw gcraw 3.156399e-04
# structureraw structureraw 3.093793e-04
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 1.268021e-04
# GsgRNA.raw GsgRNA.raw 9.665896e-05
# Gradient-boosted regression with a 70/30 split by row order.
# Two defects are corrected here:
#  * df.raw, df.onehot and df.thermal each start with cut.score, and only the
#    first copy used to be dropped, so the response leaked into the
#    predictors (hence the recorded correlations of ~0.9999);
#  * the hold-out rows were selected with train+1:ncol(data), which parses as
#    train + (1:ncol(data)) and counts columns instead of rows; the range is
#    now (train + 1):nrow(data).
data <- cbind(df.raw, df.onehot, df.thermal)
# Matches cut.score and any cut.score.N names R creates for duplicates.
is.score.col <- grepl("^cut\\.score(\\.[0-9]+)?$", names(data))
xmat <- data[, !is.score.col]
xmat.score <- data[, 1]
# 0.7 * nrow(data) was 18056.5 on the recorded data; floor to a whole row.
train <- floor(0.7 * nrow(data))
xmat.train <- data[1:train, !is.score.col]
xmat.train.score <- data[1:train, 1]
xmat.test <- data[(train + 1):nrow(data), !is.score.col]
xmat.test.score <- data[(train + 1):nrow(data), 1]
library(gbm)
gbm.train <- gbm(
  formula = xmat.train.score ~ ., data = xmat.train, distribution = "gaussian",
  n.trees = 500, shrinkage = 0.1,
  interaction.depth = 3, bag.fraction = 0.2, train.fraction = 0.8,
  n.minobsinnode = 10, cv.folds = 5, keep.data = TRUE,
  verbose = FALSE, n.cores = 1
)
# Hold-out correlation using the out-of-bag choice of tree count.
best.iter <- gbm.perf(gbm.train, method = "OOB")
Yhat <- predict(gbm.train, newdata = xmat.test, n.trees = best.iter, type = "link")
pred <- data.frame(pred.score = Yhat, exp.score = xmat.test.score)
cor(pred$pred.score, pred$exp.score)
# Recorded BEFORE the fix (response leaked into predictors):
# 0.9999052
# Hold-out correlation using the cross-validated choice of tree count.
best.iter <- gbm.perf(gbm.train, method = "cv")
Yhat <- predict(gbm.train, newdata = xmat.test, n.trees = best.iter, type = "link")
pred <- data.frame(pred.score = Yhat, exp.score = xmat.test.score)
cor(pred$pred.score, pred$exp.score)
# Recorded BEFORE the fix (response leaked into predictors):
# 0.9999073
# 24 June 2021 - following the caret model training and tuning guide: https://topepo.github.io/caret/model-training-and-tuning.html
# Environment setup (shell): load the python module, source the
# loadcondaandes.sh helper script, then activate the conda environment
# located at .../Libraries/andes/envs/test.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Load the wide feature table (with thermal tensor columns) and split it into
# feature groups by column position. Every group starts with column 2
# (cut.score), so each group carries its own copy of the response.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
#df.id <- read.delim("ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.txt", header=F, sep="\t")
# Rows with any missing value are dropped immediately after reading.
df.dcast <- na.omit(read.delim(
  "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt",
  header = TRUE, sep = "\t"
))
#colnames(df.dcast)
df.raw <- df.dcast[, c(2, 28, 44, 1626:1629, 1644, 1661)]
df.dwt <- df.dcast[, c(2, 14:27, 30:43, 1612:1625, 1630:1643, 1647:1660)]
df.onehot <- df.dcast[, c(
  2,
  3:13, 29, 45:83, 146:161, 225:239, 302:317, 380:395, 458:473, 536:551,
  614:629, 692:707, 770:785, 910:929, 1054:1073, 1136:1155, 1218:1237,
  1300:1315, 1378:1393, 1456:1471, 1534:1549, 1645, 1646, 1662:1664
)]
df.thermal <- df.dcast[, c(
  2,
  84:145, 162:224, 240:301, 318:379, 396:457, 474:535, 552:613, 630:691,
  708:769, 786:909, 930:1053, 1074:1135, 1156:1217, 1238:1299, 1316:1377,
  1394:1455, 1472:1533, 1550:1611
)]
# raw (raw + onehot + thermal)
# caret-tuned gbm on the raw, one-hot and thermal groups. Note that `data`
# holds one cut.score column per group; the recorded fit below reports 1592
# predictors.
data <- cbind(df.raw, df.onehot, df.thermal)
library(caret)
# 75/25 partition on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# 5-fold cross-validation, repeated ten times.
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
set.seed(825)
# verbose = FALSE is an argument of gbm() that train() passes through.
gbmFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE
)
gbmFit1
# Stochastic Gradient Boosting
#
# 19347 samples
# 1592 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 15477, 15477, 15478, 15479, 15477, 15477, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.824981 0.1813743 8.161379
# 1 100 9.553478 0.2169955 7.884544
# 1 150 9.399480 0.2345394 7.718378
# 2 50 9.539569 0.2214292 7.875140
# 2 100 9.280022 0.2510205 7.592141
# 2 150 9.152548 0.2673309 7.452097
# 3 50 9.390383 0.2398661 7.716956
# 3 100 9.151159 0.2682885 7.452563
# 3 150 9.043939 0.2819449 7.332865
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
head(summary(gbmFit1))
# var rel.inf
# gcraw gcraw 6.816002
# p20homo_lumo_energygapraw p20homo_lumo_energygapraw 6.620943
# p19.GGsgRNA.raw p19.GGsgRNA.raw 6.117045
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 5.885311
# p20xz_quadrupoleraw p20xz_quadrupoleraw 4.879244
# p20homo_energyraw p20homo_energyraw 4.360515
# Correlation between predicted and observed cut.score on the held-out rows.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.5412453
# all
data = cbind(df.raw, df.onehot, df.thermal, df.dwt)
library(caret)
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score ~ ., data = training,
method = "gbm",
trControl = fitControl,
## This last option is actually one
## for gbm() that passes through
verbose = FALSE)
gbmFit1
# Stochastic Gradient Boosting
#
# 19347 samples
# 1662 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 15477, 15477, 15478, 15479, 15477, 15477, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.819804 0.1858242 8.157205
# 1 100 9.532182 0.2242043 7.865208
# 1 150 9.364579 0.2433550 7.687117
# 2 50 9.524144 0.2282104 7.863145
# 2 100 9.243644 0.2595159 7.560315
# 2 150 9.108089 0.2763347 7.412402
# 3 50 9.368799 0.2466599 7.698566
# 3 100 9.110480 0.2770325 7.417358
# 3 150 8.998967 0.2909193 7.294279
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
# First six rows of the model summary (relative influence per variable); recorded output follows.
head(summary(gbmFit1), n = 6L)
# var rel.inf
# gcraw gcraw 6.148269
# p19.GGsgRNA.raw p19.GGsgRNA.raw 5.845378
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 5.817247
# p20homo_lumo_energygapraw p20homo_lumo_energygapraw 5.677869
# p20homo_energyraw p20homo_energyraw 5.155081
# p20xz_quadrupoleraw p20xz_quadrupoleraw 4.369718
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.5533565
# Variable importance, diagnostic plots and persisted model for the all-feature gbm fit.
# Importance is computed once and reused for printing, plotting and export.
gbm_imp <- varImp(gbmFit1)
gbm_imp
# Predicted vs. measured cut score on the held-out rows.
pdf("gbm.all.sgRNA.cor.pdf")
plot(pred.df$pred.score, pred.df$exp.score)
dev.off()
# Resampling profile across the tuning grid; print() forces drawing even when sourced.
pdf("gbm.all.sgRNA.output.pdf")
print(plot(gbmFit1))
dev.off()
# Importance plot: it must be drawn from the varImp object (the train object has no
# $importance element, so the previous call fell back to the tuning plot).
pdf("gbm.all.sgRNA.varImp.pdf")
print(plot(gbm_imp, top = nrow(gbm_imp$importance)))
dev.off()
# A fitted gbm object is a list, not a table, so write.table() cannot serialise it.
# Store the model as RDS and export the importance table as tab-separated text.
gbm_model <- gbmFit1$finalModel
saveRDS(gbm_model, "gbm.all.sgRNA.model.rds")
write.table(
  data.frame(var = rownames(gbm_imp$importance), gbm_imp$importance),
  "gbm.all.sgRNA.varImp.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)
# just raw
# Model input: the raw feature table only.
library(caret)
data <- cbind(df.raw)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# 5-fold cross-validation, repeated 10 times.
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
# gbm fit via caret; verbose = FALSE passes through to gbm().
set.seed(825)
gbmFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE
)
gbmFit1
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 10.32103 0.05932732 8.616267
# 1 100 10.29816 0.06202198 8.579170
# 1 150 10.29237 0.06269843 8.565979
# 2 50 10.29947 0.06180546 8.582098
# 2 100 10.28993 0.06308668 8.561558
# 2 150 10.28918 0.06321058 8.556044
# 3 50 10.29470 0.06240309 8.573183
# 3 100 10.29098 0.06289930 8.557920
# 3 150 10.29215 0.06279868 8.553862
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 2, shrinkage = 0.1 and n.minobsinnode = 10.
# First six rows of the model summary (relative influence per variable); recorded output follows.
head(summary(gbmFit1), n = 6L)
# var rel.inf
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 50.7405695
# gcraw gcraw 28.8234500
# structureraw structureraw 8.2320491
# rnaraw rnaraw 6.0413337
# sgRNA.structuresgRNA.raw sgRNA.structuresgRNA.raw 5.5498241
# generaw generaw 0.6127737
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.2418934
# just onehot
# Model input: the one-hot feature table only.
library(caret)
data <- cbind(df.onehot)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# 5-fold cross-validation, repeated 10 times.
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
# gbm fit via caret; verbose = FALSE passes through to gbm().
set.seed(825)
gbmFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE
)
gbmFit1
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 10.040069 0.1574158 8.381583
# 1 100 9.784247 0.1865807 8.118817
# 1 150 9.627834 0.2048009 7.949418
# 2 50 9.786419 0.1873221 8.124078
# 2 100 9.513097 0.2195149 7.825918
# 2 150 9.366956 0.2371466 7.664288
# 3 50 9.634821 0.2049887 7.960761
# 3 100 9.369978 0.2374645 7.669419
# 3 150 9.240020 0.2536497 7.526538
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
# First six rows of the model summary (relative influence per variable); recorded output follows.
head(summary(gbmFit1), n = 6L)
# var rel.inf
# p19.AGsgRNA.raw p19.AGsgRNA.raw 8.558241
# p19.GGsgRNA.raw p19.GGsgRNA.raw 7.878770
# p19.ACsgRNA.raw p19.ACsgRNA.raw 6.858890
# GGsgRNA.raw GGsgRNA.raw 5.574724
# CCsgRNA.raw CCsgRNA.raw 5.418142
# p19.CCsgRNA.raw p19.CCsgRNA.raw 4.852220
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.5131852
# just thermal
# Model input: the thermal feature table only.
library(caret)
data <- cbind(df.thermal)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# 5-fold cross-validation, repeated 10 times.
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
# gbm fit via caret; verbose = FALSE passes through to gbm().
set.seed(825)
gbmFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE
)
gbmFit1
# Stochastic Gradient Boosting
#
# 19347 samples
# 1241 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 15477, 15477, 15478, 15479, 15477, 15477, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.978523 0.1327457 8.279000
# 1 100 9.844334 0.1495939 8.122634
# 1 150 9.786317 0.1562208 8.048316
# 2 50 9.779368 0.1675387 8.073487
# 2 100 9.623184 0.1878665 7.893394
# 2 150 9.543410 0.1984202 7.798979
# 3 50 9.662972 0.1872118 7.952569
# 3 100 9.497427 0.2088485 7.763064
# 3 150 9.413170 0.2202414 7.666199
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
# First six rows of the model summary (relative influence per variable); recorded output follows.
head(summary(gbmFit1), n = 6L)
# var rel.inf
# p20xz_quadrupoleraw p20xz_quadrupoleraw 9.540019
# p20homo_lumo_energygapraw p20homo_lumo_energygapraw 6.656580
# p18homo_energyraw p18homo_energyraw 5.272879
# p20homo_energyraw p20homo_energyraw 4.609795
# p19num_singlebondsraw p19num_singlebondsraw 3.943510
# p19tot_dipoleraw p19tot_dipoleraw 3.632154
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.4965279
# just dwt
# Model input: the DWT feature table only.
library(caret)
data <- cbind(df.dwt)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# 5-fold cross-validation, repeated 10 times.
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
# gbm fit via caret; verbose = FALSE passes through to gbm().
set.seed(825)
gbmFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE
)
gbmFit1
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 10.55807 0.01440098 8.856540
# 1 100 10.54510 0.01637290 8.839414
# 1 150 10.53945 0.01718970 8.829440
# 2 50 10.54190 0.01717402 8.837964
# 2 100 10.53078 0.01877522 8.817642
# 2 150 10.52737 0.01947530 8.808135
# 3 50 10.53524 0.01806883 8.828227
# 3 100 10.52544 0.01982154 8.806886
# 3 150 10.52370 0.02068894 8.796359
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
# First six rows of the model summary (relative influence per variable); recorded output follows.
head(summary(gbmFit1), n = 6L)
# var rel.inf
# gene.dwtd12 gene.dwtd12 4.224424
# gene.dwtd13 gene.dwtd13 3.229718
# gene.dwts13 gene.dwts13 2.915104
# rna.dwtd5 rna.dwtd5 2.845085
# structure.dwtd12 structure.dwtd12 2.632542
# rna.dwtd12 rna.dwtd12 2.540682
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.1439561
# just raw + onehot
# Model input: raw and one-hot feature tables bound column-wise.
library(caret)
data <- cbind(df.raw, df.onehot)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# 5-fold cross-validation, repeated 10 times.
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
# gbm fit via caret; verbose = FALSE passes through to gbm().
set.seed(825)
gbmFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE
)
gbmFit1
# Stochastic Gradient Boosting
#
# 19347 samples
# 351 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 15477, 15477, 15478, 15479, 15477, 15477, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.990543 0.1662874 8.337796
# 1 100 9.712332 0.2014576 8.057750
# 1 150 9.537199 0.2228342 7.875416
# 2 50 9.715686 0.2019246 8.063058
# 2 100 9.412083 0.2381101 7.742663
# 2 150 9.248526 0.2580056 7.567754
# 3 50 9.549697 0.2214349 7.890476
# 3 100 9.256474 0.2569762 7.577551
# 3 150 9.112253 0.2750217 7.421689
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
# First six rows of the model summary (relative influence per variable); recorded output follows.
head(summary(gbmFit1), n = 6L)
# var rel.inf
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 9.639308
# p19.AGsgRNA.raw p19.AGsgRNA.raw 7.998805
# p19.GGsgRNA.raw p19.GGsgRNA.raw 7.470751
# gcraw gcraw 7.038519
# p19.ACsgRNA.raw p19.ACsgRNA.raw 6.403688
# p19.CCsgRNA.raw p19.CCsgRNA.raw 4.419073
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.5268188
# example from David:
## monte-carlo CV (random selection of training/validation samples)
# set.seed(999)
# leftout <- foreach(i=1:100) %do% {
# sample(1:nrow(pheno), size = nrow(pheno)*0.2)
# }
# doParallel::registerDoParallel(cores=10)
# res <- foreach(i = 1:length(leftout), .combine=rbind) %dopar%
# #res <- foreach(i = 1:50, .combine=rbind) %dopar%
# {
# print(i)
# valset <- leftout[[i]]
# trainset <- pheno
# trainset$VALUE[valset] <- NA
# gblup <- rrBLUP::kin.blup(data=trainset,geno = "GID",pheno = "VALUE",
# K = G.all,PEV=T)
# gblup.pcs <- rrBLUP::kin.blup(data=trainset,geno = "GID",pheno = "VALUE",
# K = G.all,PEV=T, covariate = c("PC1","PC2","PC3"))
# acc <- cor(gblup$pred[valset], pheno$VALUE[valset])
# acc.pcs <- cor(gblup.pcs$pred[valset], pheno$VALUE[valset])
# data.frame(RUN = i,
# ACC=c(acc,acc.pcs),
# TRAIT=c("DBH6","DBH6"),
# MODEL=c("GBLUP","GBLUP.3PCS"),
# NSNP=length(geno$snp.id))
# }
# doParallel::stopImplicitCluster()
# code for random forest and run for subsets
# Session environment: python module, the conda bootstrap script, then the "test" env.
module load python
source "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh"
conda activate "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test"
# Start R; the statements that follow in these notes are R code.
R
# Read the feature matrix, keep complete rows, and slice it into four feature tables.
# Every table starts with column 2 of df.dcast (referenced as cut.score by the model code).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim(
  "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt",
  header = TRUE,
  sep = "\t"
)
df.dcast <- na.omit(df.dcast)
df.raw <- df.dcast[, c(2, 28, 44, 1626:1629, 1644, 1661)]
df.dwt <- df.dcast[, c(2, 14:27, 30:43, 1612:1625, 1630:1643, 1647:1660)]
df.onehot <- df.dcast[, c(
  2, 3:13, 29, 45:83, 146:161, 225:239, 302:317, 380:395, 458:473, 536:551,
  614:629, 692:707, 770:785, 910:929, 1054:1073, 1136:1155, 1218:1237,
  1300:1315, 1378:1393, 1456:1471, 1534:1549, 1645, 1646, 1662:1664
)]
df.thermal <- df.dcast[, c(
  2, 84:145, 162:224, 240:301, 318:379, 396:457, 474:535, 552:613, 630:691,
  708:769, 786:909, 930:1053, 1074:1135, 1156:1217, 1238:1299, 1316:1377,
  1394:1455, 1472:1533, 1550:1611
)]
library(ranger)
library(foreach)
# raw (raw + onehot + thermal)
# Monte-Carlo cross-validation of a ranger forest on raw + onehot + thermal features.
# Result: kfoldRF, one row per run with the run number (RUN) and the correlation
# between measured and predicted cut.score on that run's held-out rows (COR).
xmat <- cbind(df.raw, df.onehot, df.thermal)
# Each feature table brings its own cut.score column. Keep the response separately
# and drop every copy from the predictors so the target cannot leak into the model.
y.all <- xmat$cut.score
xmat <- xmat[, colnames(xmat) != "cut.score", drop = FALSE]
set.seed(999)
# 100 random validation sets, each holding out 20% of the rows.
leftout <- foreach(i = 1:100) %do% {
  sample(seq_len(nrow(xmat)), size = floor(nrow(xmat) * 0.2))
}
doParallel::registerDoParallel(cores = 10)
kfoldRF <- foreach(i = seq_along(leftout), .combine = rbind) %dopar% {
  print(i)
  valset <- leftout[[i]]
  # Train only on the rows that are NOT in this run's validation set.
  tmp <- cbind(xmat[-valset, , drop = FALSE], Y = y.all[-valset])
  # Equal split-selection weight for every predictor.
  wt <- rep(1 / ncol(xmat), ncol(xmat))
  rf.trainset <- ranger::ranger(
    dependent.variable.name = "Y", data = tmp, num.trees = 500,
    split.select.weights = wt, classification = FALSE,
    mtry = floor(0.1 * ncol(xmat)), num.threads = 1,
    write.forest = TRUE, always.split.variables = NULL
  )
  # Predict the held-out rows (a data frame, not the index vector itself).
  pred <- predict(rf.trainset, data = xmat[valset, , drop = FALSE])
  data.frame(RUN = i, COR = cor(y.all[valset], pred$predictions))
}
doParallel::stopImplicitCluster()
# iRF function run...
# Draft of a repeated hold-out loop that calls iRF() once per split.
# NOTE(review): `df` is not assigned anywhere in the visible notes (only df.dcast and
# its subsets are) — confirm which table was meant before running this.
set.seed(999)
# kfold(x, k=5, by)
# 100 random index sets, each 20% of the rows of `df`.
leftout <- foreach(i=1:100) %do% {
sample(1:nrow(df), size = nrow(df)*0.2)
}
doParallel::registerDoParallel(cores=10)
kfoldRF <- foreach(i = 1:length(leftout), .combine=rbind) %dopar%
{
print(i)
valset <- leftout[[i]]
# All columns of `df` except the first.
trainset <- df[,2:ncol(df)]
# Blank out the response for the held-out rows.
trainset$cut.score[valset] <- NA
# NOTE(review): `trainset.score` is not defined in this loop, and the predictors are
# taken from `df` rather than from the masked `trainset` — looks unfinished; confirm.
iRF(df[,3:ncol(trainset)], trainset.score)
}
doParallel::stopImplicitCluster()
# just thermal
# ranger forest on the thermal features: fit on the first 70% of rows, score the rest.
# df.thermal also carries the response column, so remove it from the predictors;
# otherwise the forest is handed cut.score both as a feature and as Y.
xmat <- df.thermal[, colnames(df.thermal) != "cut.score", drop = FALSE]
# Number of training rows, floored so it can be used as a row index.
test <- floor(0.7 * nrow(xmat))
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
# Equal split-selection weight for every predictor.
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
df.rf.thermal.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = floor(0.1 * ncol(xmat.test)), num.threads = 1,
  write.forest = TRUE, always.split.variables = NULL
)
# Held-out rows. The parentheses matter: `test+1:nrow(xmat)` means test + (1:nrow(xmat))
# and indexes past the end of the table.
holdout <- (test + 1):nrow(xmat)
pred <- predict(df.rf.thermal.test, xmat[holdout, ])
pred$predictions
df.dcast.pred <- df.dcast[holdout, ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
#
# all
# ranger forest on all feature tables: fit on the first 70% of rows, score the rest.
xmat <- cbind(df.raw, df.onehot, df.thermal, df.dwt)
# Every feature table carries its own cut.score column; drop all copies from the
# predictors so the response cannot leak into the model.
xmat <- xmat[, colnames(xmat) != "cut.score", drop = FALSE]
# Number of training rows, floored so it can be used as a row index.
test <- floor(0.7 * nrow(xmat))
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
# Equal split-selection weight for every predictor.
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
df.rf.all.test <- ranger(
  dependent.variable.name = "Y", data = tmp, num.trees = 500,
  split.select.weights = wt, classification = FALSE,
  mtry = floor(0.1 * ncol(xmat.test)), num.threads = 1,
  write.forest = TRUE, always.split.variables = NULL
)
# Held-out rows. The parentheses matter: `test+1:nrow(xmat)` means test + (1:nrow(xmat))
# and indexes past the end of the table.
holdout <- (test + 1):nrow(xmat)
pred <- predict(df.rf.all.test, xmat[holdout, ])
pred$predictions
df.dcast.pred <- df.dcast[holdout, ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
#
# Reference (caret random-forest models): https://topepo.github.io/caret/train-models-by-tag.html#random-forest
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J caret.rf
#SBATCH -N 4
#SBATCH -t 10:00:00
# Batch job: run the caret random-forest R scripts one after another.
# NOTE(review): -N 4 requests four nodes, but the R CMD BATCH calls below run
# sequentially from this one shell — confirm the extra nodes are needed.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Stop here if the project directory is unreachable, so the R scripts are never
# started from (and never write .Rout files into) the wrong directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli || exit 1
R CMD BATCH caret.rf.all.R
R CMD BATCH caret.rf.3.R
R CMD BATCH caret.rf.2.R
R CMD BATCH caret.rf.thermal.R
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/caret.rf.sh
# salloc -A SYB105 -N 2 -t 4:00:00
# Session environment: python module, the conda bootstrap script, then the "test" env.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Start R: the statements that follow are R code, and the other setup blocks in
# these notes launch R at this point as well.
R
# Read the feature matrix, keep complete rows, and slice it into four feature tables.
# Every table starts with column 2 of df.dcast (referenced as cut.score by the model code).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim(
  "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt",
  header = TRUE,
  sep = "\t"
)
df.dcast <- na.omit(df.dcast)
#colnames(df.dcast)
df.raw <- df.dcast[, c(2, 28, 44, 1626:1629, 1644, 1661)]
df.dwt <- df.dcast[, c(2, 14:27, 30:43, 1612:1625, 1630:1643, 1647:1660)]
df.onehot <- df.dcast[, c(
  2, 3:13, 29, 45:83, 146:161, 225:239, 302:317, 380:395, 458:473, 536:551,
  614:629, 692:707, 770:785, 910:929, 1054:1073, 1136:1155, 1218:1237,
  1300:1315, 1378:1393, 1456:1471, 1534:1549, 1645, 1646, 1662:1664
)]
df.thermal <- df.dcast[, c(
  2, 84:145, 162:224, 240:301, 318:379, 396:457, 474:535, 552:613, 630:691,
  708:769, 786:909, 930:1053, 1074:1135, 1156:1217, 1238:1299, 1316:1377,
  1394:1455, 1472:1533, 1550:1611
)]
# just raw
# caret "cforest" model (party) on the raw feature table only.
library(caret)
library(party)
data <- cbind(df.raw)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# Seed list for trainControl(): nrow(training) + 1 entries, each the integers 1..20.
seeds <- rep(list(1:20), nrow(training) + 1)
rctrl1 <- trainControl(
  method = "cv",
  number = 5,
  returnResamp = "all",
  seeds = seeds
)
# 5-fold CV fit with centred and scaled predictors, 20 trees per forest.
set.seed(849)
rfFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "cforest",
  trControl = rctrl1,
  preProc = c("center", "scale"),
  controls = party::cforest_unbiased(ntree = 20)
)
rfFit1
# mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
# 1 2 10.39245 0.04900029 8.622768 0.08215155 0.006079585 0.04346392
# 2 5 10.46423 0.04315350 8.660200 0.08431236 0.002405330 0.05458081
# 3 8 10.48284 0.04170635 8.673062 0.07026902 0.003810565 0.04875492
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(rfFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.2203867
# just onehot
# caret "cforest" model (party) on the one-hot feature table only.
library(caret)
library(party)
data <- cbind(df.onehot)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# Seed list for trainControl(): nrow(training) + 1 entries, each the integers 1..20.
seeds <- rep(list(1:20), nrow(training) + 1)
rctrl1 <- trainControl(
  method = "cv",
  number = 5,
  returnResamp = "all",
  seeds = seeds
)
# 5-fold CV fit with centred and scaled predictors, 20 trees per forest.
set.seed(849)
rfFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "cforest",
  trControl = rctrl1,
  preProc = c("center", "scale"),
  controls = party::cforest_unbiased(ntree = 20)
)
rfFit1
# Conditional Inference Random Forest
#
# 19347 samples
# 343 predictor
#
# Pre-processing: centered (343), scaled (343)
# Resampling: Cross-Validated (5 fold)
# Summary of sample sizes: 15478, 15477, 15477, 15477, 15479
# Resampling results across tuning parameters:
#
# mtry RMSE Rsquared MAE
# 2 10.254705 0.1388904 8.588795
# 172 9.328400 0.2312346 7.560305
# 343 9.356081 0.2253684 7.559720
#
# RMSE was used to select the optimal model using the smallest value.
# The final value used for the model was mtry = 172.
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(rfFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.497679
# just thermal
# caret "cforest" model (party) on the thermal feature table only.
library(caret)
library(party)
data <- cbind(df.thermal)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# Seed list for trainControl(): nrow(training) + 1 entries, each the integers 1..20.
seeds <- rep(list(1:20), nrow(training) + 1)
rctrl1 <- trainControl(
  method = "cv",
  number = 5,
  returnResamp = "all",
  seeds = seeds
)
# 5-fold CV fit with centred and scaled predictors, 20 trees per forest.
set.seed(849)
rfFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "cforest",
  trControl = rctrl1,
  preProc = c("center", "scale"),
  controls = party::cforest_unbiased(ntree = 20)
)
rfFit1
# Conditional Inference Random Forest
#
# 19347 samples
# 1241 predictor
#
# Pre-processing: centered (1241), scaled (1241)
# Resampling: Cross-Validated (5 fold)
# Summary of sample sizes: 15478, 15477, 15477, 15477, 15479
# Resampling results across tuning parameters:
#
# mtry RMSE Rsquared MAE
# 2 10.103186 0.1471344 8.435873
# 49 9.356295 0.2257056 7.572390
# 1241 9.413794 0.2157976 7.613720
#
# RMSE was used to select the optimal model using the smallest value.
# The final value used for the model was mtry = 49.
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(rfFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.4863132
# just dwt
# caret "cforest" model (party) on the DWT feature table only.
library(caret)
library(party)
data <- cbind(df.dwt)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# Seed list for trainControl(): nrow(training) + 1 entries, each the integers 1..20.
seeds <- rep(list(1:20), nrow(training) + 1)
rctrl1 <- trainControl(
  method = "cv",
  number = 5,
  returnResamp = "all",
  seeds = seeds
)
# 5-fold CV fit with centred and scaled predictors, 20 trees per forest.
set.seed(849)
rfFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "cforest",
  trControl = rctrl1,
  preProc = c("center", "scale"),
  controls = party::cforest_unbiased(ntree = 20)
)
rfFit1
# Conditional Inference Random Forest
#
# 19347 samples
# 70 predictor
#
# Pre-processing: centered (70), scaled (70)
# Resampling: Cross-Validated (5 fold)
# Summary of sample sizes: 15478, 15477, 15477, 15477, 15479
# Resampling results across tuning parameters:
#
# mtry RMSE Rsquared MAE
# 2 10.68510 0.02097101 8.830418
# 36 10.85795 0.01843467 8.920301
# 70 10.88517 0.01757394 8.931060
#
# RMSE was used to select the optimal model using the smallest value.
# The final value used for the model was mtry = 2.
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(rfFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.1443533
# just raw + onehot
# caret "cforest" model (party) on the one-hot and raw feature tables.
library(caret)
library(party)
data <- cbind(df.onehot, df.raw)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# Seed list for trainControl(): nrow(training) + 1 entries, each the integers 1..20.
seeds <- rep(list(1:20), nrow(training) + 1)
rctrl1 <- trainControl(
  method = "cv",
  number = 5,
  returnResamp = "all",
  seeds = seeds
)
# 5-fold CV fit with centred and scaled predictors, 20 trees per forest.
set.seed(849)
rfFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "cforest",
  trControl = rctrl1,
  preProc = c("center", "scale"),
  controls = party::cforest_unbiased(ntree = 20)
)
rfFit1
# Conditional Inference Random Forest
#
# 19347 samples
# 351 predictor
#
# Pre-processing: centered (351), scaled (351)
# Resampling: Cross-Validated (5 fold)
# Summary of sample sizes: 15478, 15477, 15477, 15477, 15479
# Resampling results across tuning parameters:
#
# mtry RMSE Rsquared MAE
# 2 10.219673 0.1470995 8.557476
# 176 9.225176 0.2485364 7.475893
# 351 9.240365 0.2445604 7.457529
#
# RMSE was used to select the optimal model using the smallest value.
# The final value used for the model was mtry = 176.
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(rfFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.5110214
# just onehot + raw + thermal
# caret "cforest" model (party) on the one-hot, raw and thermal feature tables.
library(caret)
library(party)
data <- cbind(df.onehot, df.raw, df.thermal)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# Seed list for trainControl(): nrow(training) + 1 entries, each the integers 1..20.
seeds <- rep(list(1:20), nrow(training) + 1)
rctrl1 <- trainControl(
  method = "cv",
  number = 5,
  returnResamp = "all",
  seeds = seeds
)
# 5-fold CV fit with centred and scaled predictors, 20 trees per forest.
set.seed(849)
rfFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "cforest",
  trControl = rctrl1,
  preProc = c("center", "scale"),
  controls = party::cforest_unbiased(ntree = 20)
)
rfFit1
# Conditional Inference Random Forest
#
# 19347 samples
# 1592 predictor
#
# Pre-processing: centered (1592), scaled (1592)
# Resampling: Cross-Validated (5 fold)
# Summary of sample sizes: 15478, 15477, 15477, 15477, 15479
# Resampling results across tuning parameters:
#
# mtry RMSE Rsquared MAE
# 2 10.130666 0.1494948 8.471277
# 56 9.280269 0.2396671 7.520867
# 1592 9.173566 0.2556840 7.395970
#
# RMSE was used to select the optimal model using the smallest value.
# The final value used for the model was mtry = 1592.
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(rfFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.5251935
# all
# caret "cforest" model (party) on all four feature tables.
library(caret)
library(party)
data <- cbind(df.onehot, df.raw, df.thermal, df.dwt)
# 75/25 split on cut.score with a fixed seed.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
# Seed list for trainControl(): nrow(training) + 1 entries, each the integers 1..20.
seeds <- rep(list(1:20), nrow(training) + 1)
rctrl1 <- trainControl(
  method = "cv",
  number = 5,
  returnResamp = "all",
  seeds = seeds
)
# 5-fold CV fit with centred and scaled predictors, 20 trees per forest.
set.seed(849)
rfFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "cforest",
  trControl = rctrl1,
  preProc = c("center", "scale"),
  controls = party::cforest_unbiased(ntree = 20)
)
rfFit1
# Conditional Inference Random Forest
#
# 19347 samples
# 1662 predictor
#
# Pre-processing: centered (1662), scaled (1662)
# Resampling: Cross-Validated (5 fold)
# Summary of sample sizes: 15478, 15477, 15477, 15477, 15479
# Resampling results across tuning parameters:
#
# mtry RMSE Rsquared MAE
# 2 10.138534 0.1474140 8.474578
# 57 9.254938 0.2442340 7.491158
# 1662 9.176497 0.2552291 7.396314
#
# RMSE was used to select the optimal model using the smallest value.
# The final value used for the model was mtry = 1662.
# Predict the held-out rows and correlate the predictions with the measured cut.score.
pred <- predict(rfFit1, newdata = testing)
pred.df <- data.frame(
  pred.score = pred,
  exp.score = testing[["cut.score"]]
)
with(pred.df, cor(pred.score, exp.score))
# 0.5262476
# Variable importance, diagnostic plots and persisted model for the all-feature cforest fit.
# Importance is computed once and reused for printing, plotting and export.
rf_imp <- varImp(rfFit1)
rf_imp
# Predicted vs. measured cut score on the held-out rows.
pdf("rf.all.sgRNA.cor.pdf")
plot(pred.df$pred.score, pred.df$exp.score)
dev.off()
# Resampling profile across the mtry grid; print() forces drawing even when sourced.
pdf("rf.all.sgRNA.output.pdf")
print(plot(rfFit1))
dev.off()
# Importance plot: it must be drawn from the varImp object (the train object has no
# $importance element, so the previous call fell back to the tuning plot).
pdf("rf.all.sgRNA.varImp.pdf")
print(plot(rf_imp, top = nrow(rf_imp$importance)))
dev.off()
# The fitted forest is a model object, not a table, so write.table() cannot
# serialise it. Store the model as RDS and export the importance table as text.
rf_model <- rfFit1$finalModel
saveRDS(rf_model, "rf.all.sgRNA.model.rds")
write.table(
  data.frame(var = rownames(rf_imp$importance), rf_imp$importance),
  "rf.all.sgRNA.varImp.txt",
  quote = FALSE, row.names = FALSE, sep = "\t"
)
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J caret.all
#SBATCH -N 4
#SBATCH -t 10:00:00
# Batch job: run the sgRNAseq cforest, gbm and iRF R scripts one after another.
# NOTE(review): -N 4 requests four nodes, but the R CMD BATCH calls below run
# sequentially from this one shell — confirm the extra nodes are needed.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Stop here if the project directory is unreachable, so the R scripts are never
# started from (and never write .Rout files into) the wrong directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli || exit 1
R CMD BATCH caret.rf.sgRNAseq.all.R
R CMD BATCH caret.gbm.sgRNAseq.all.R
R CMD BATCH caret.iRF.sgRNAseq.all.R
R CMD BATCH caret.iRF.sgRNAseq.onehot.R
R CMD BATCH caret.iRF.sgRNAseq.thermal.R
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/caret.all.sgRNAseq.sh
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/caret.onehot.thermal.sgRNAseq.sh
## onehot iRF output (n.tree=500)
Type: Regression
Number of trees: 500
Sample size: 19347
Number of independent variables: 343
Target node size: 5
Variable importance mode: impurity_corrected
Splitrule: variance
iRF iteration 1
=================
mtry: 171.5
prediction error: 86.94526
r^2: 0.2304447
cor(y,yhat): 0.4882309
SNPs with importance > 0: 209
iRF iteration 2
=================
mtry: 104.5
prediction error: 85.76939
r^2: 0.2408524
cor(y,yhat): 0.491079
SNPs with importance > 0: 156
iRF iteration 3
=================
mtry: 78
prediction error: 86.09305
r^2: 0.2379876
cor(y,yhat): 0.4880278
SNPs with importance > 0: 128
iRF iteration 4
=================
mtry: 64
prediction error: 87.10707
r^2: 0.2290125
cor(y,yhat): 0.4785498
SNPs with importance > 0: 103
iRF iteration 5
=================
mtry: 51.5
prediction error: 87.43266
r^2: 0.2261307
cor(y,yhat): 0.4771311
SNPs with importance > 0: 83
[[1]]
## thermal iRF output (n.tree=500)
Type: Regression
Number of trees: 500
Sample size: 19347
Number of independent variables: 1241
Target node size: 5
Variable importance mode: impurity_corrected
Splitrule: variance
iRF iteration 1
=================
mtry: 620.5
prediction error: 88.57172
r^2: 0.2160488
cor(y,yhat): 0.4696277
SNPs with importance > 0: 793
iRF iteration 2
=================
mtry: 396.5
prediction error: 88.08201
r^2: 0.2203832
cor(y,yhat): 0.4738372
SNPs with importance > 0: 520
iRF iteration 3
=================
mtry: 260
prediction error: 87.79589
r^2: 0.2229157
cor(y,yhat): 0.4763405
SNPs with importance > 0: 377
iRF iteration 4
=================
mtry: 188.5
prediction error: 87.48982
r^2: 0.2256247
cor(y,yhat): 0.4783408
SNPs with importance > 0: 295
iRF iteration 5
=================
mtry: 147.5
prediction error: 87.30739
r^2: 0.2272395
cor(y,yhat): 0.4795409
SNPs with importance > 0: 233
## all (raw+onehot+thermal+dwt) iRF output (n.tree=500)
iRF iteration 1
=================
mtry: 832.5
prediction error: 0.01307255
r^2: 0.9998843
cor(y,yhat): 0.9999439
SNPs with importance > 0: 944
# salloc -A SYB105 -N 2 -t 4:00:00
# Session environment: python module, the conda bootstrap script, then the "test" env.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Start R: the statements that follow are R code, and the other setup blocks in
# these notes launch R at this point as well.
R
# Read the feature matrix, keep complete rows, and slice it into four predictor-only
# tables (column 2 of df.dcast is not included in any of them here).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim(
  "ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt",
  header = TRUE,
  sep = "\t"
)
df.dcast <- na.omit(df.dcast)
df.raw <- df.dcast[, c(28, 44, 1626:1629, 1644, 1661)]
df.dwt <- df.dcast[, c(14:27, 30:43, 1612:1625, 1630:1643, 1647:1660)]
df.onehot <- df.dcast[, c(
  3:13, 29, 45:83, 146:161, 225:239, 302:317, 380:395, 458:473, 536:551,
  614:629, 692:707, 770:785, 910:929, 1054:1073, 1136:1155, 1218:1237,
  1300:1315, 1378:1393, 1456:1471, 1534:1549, 1645, 1646, 1662:1664
)]
df.thermal <- df.dcast[, c(
  84:145, 162:224, 240:301, 318:379, 396:457, 474:535, 552:613, 630:691,
  708:769, 786:909, 930:1053, 1074:1135, 1156:1217, 1238:1299, 1316:1377,
  1394:1455, 1472:1533, 1550:1611
)]
library(ranger)
# all
# ranger forest on all feature tables: fit on the first 80% of rows, score the rest.
xmat <- cbind(df.raw, df.onehot, df.thermal, df.dwt)
# Number of training rows, floored so it can be used as a row index.
test <- floor(0.8 * nrow(xmat))
# Keep every column: the feature tables used here hold predictors only, so dropping
# column 1 (as `2:ncol(xmat)` did) would discard a real feature.
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
# Equal split-selection weight for every predictor.
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
df.rf.all.test <- ranger(dependent.variable.name = "Y", data = tmp, num.trees = 500, split.select.weights = wt, classification = FALSE, mtry = 832.5, num.threads = 1, write.forest = TRUE, always.split.variables = NULL)
# NOTE(review): the output recorded below reports 1665 independent variables and an
# OOB R squared of ~1 while the held-out correlation is only 0.46 — presumably it came
# from an earlier version of this block (possibly with cut.score still among the
# predictors); re-run to refresh.
# Type: Regression
# Number of trees: 500
# Sample size: 20636
# Number of independent variables: 1665
# Mtry: 832
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 0.000531856
# R squared (OOB): 0.9999954
# Held-out rows. The parentheses matter: `test+1:nrow(xmat)` means test + (1:nrow(xmat))
# and indexes past the end of the table.
holdout <- (test + 1):nrow(xmat)
pred <- predict(df.rf.all.test, xmat[holdout, ])
pred$predictions
df.dcast.pred <- df.dcast[holdout, ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.464546
# onehot
# ranger forest on the one-hot features: fit on the first 80% of rows, score the rest,
# once with 500 trees and once with 1000 trees.
xmat <- cbind(df.onehot)
# Number of training rows, floored so it can be used as a row index.
test <- floor(0.8 * nrow(xmat))
# Keep every column: the feature table used here holds predictors only, so dropping
# column 1 (as `2:ncol(xmat)` did) would discard a real feature.
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
# Equal split-selection weight for every predictor.
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
# Held-out rows. The parentheses matter: `test+1:nrow(xmat)` means test + (1:nrow(xmat))
# and indexes past the end of the table.
holdout <- (test + 1):nrow(xmat)
df.rf.onehot.test <- ranger(dependent.variable.name = "Y", data = tmp, num.trees = 500, split.select.weights = wt, classification = FALSE, mtry = 104.5, num.threads = 1, write.forest = TRUE, always.split.variables = NULL)
# NOTE(review): the outputs recorded in this block were captured before the column
# selection above was corrected (343 and then 342 independent variables) — re-run to refresh.
# Type: Regression
# Number of trees: 500
# Sample size: 20636
# Number of independent variables: 343
# Mtry: 104
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 84.84716
# R squared (OOB): 0.259908
pred <- predict(df.rf.onehot.test, xmat[holdout, ])
pred$predictions
df.dcast.pred <- df.dcast[holdout, ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.4722664
# Same training data, 1000 trees.
df.rf.onehot.test <- ranger(dependent.variable.name = "Y", data = tmp, num.trees = 1000, split.select.weights = wt, classification = FALSE, mtry = 104.5, num.threads = 1, write.forest = TRUE, always.split.variables = NULL)
df.rf.onehot.test
# Type: Regression
# Number of trees: 1000
# Sample size: 20636
# Number of independent variables: 342
# Mtry: 104
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 84.41277
# R squared (OOB): 0.263697
pred <- predict(df.rf.onehot.test, xmat[holdout, ])
pred$predictions
df.dcast.pred <- df.dcast[holdout, ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.4744112
# thermal
# ranger forest on the thermal features: fit on the first 80% of rows, score the rest,
# once with 500 trees and once with 1000 trees.
xmat <- cbind(df.thermal)
# Number of training rows, floored so it can be used as a row index.
test <- floor(0.8 * nrow(xmat))
# Keep every column: the feature table used here holds predictors only, so dropping
# column 1 (as `2:ncol(xmat)` did) would discard a real feature.
xmat.test <- xmat[1:test, ]
y <- df.dcast[1:test, ]
y.test <- y$cut.score
tmp <- cbind(xmat.test, Y = y.test)
# Equal split-selection weight for every predictor.
wt <- rep(1 / ncol(xmat.test), ncol(xmat.test))
# Held-out rows. The parentheses matter: `test+1:nrow(xmat)` means test + (1:nrow(xmat))
# and indexes past the end of the table.
holdout <- (test + 1):nrow(xmat)
df.rf.thermal.test <- ranger(dependent.variable.name = "Y", data = tmp, num.trees = 500, split.select.weights = wt, classification = FALSE, mtry = 104.5, num.threads = 1, write.forest = TRUE, always.split.variables = NULL)
# NOTE(review): the outputs recorded in this block were captured before the column
# selection above was corrected (1241 and then 1240 independent variables) — re-run to refresh.
# Type: Regression
# Number of trees: 500
# Sample size: 20636
# Number of independent variables: 1241
# Mtry: 104
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 86.36993
# R squared (OOB): 0.2466254
pred <- predict(df.rf.thermal.test, xmat[holdout, ])
pred$predictions
df.dcast.pred <- df.dcast[holdout, ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.464546
# Same training data, 1000 trees.
df.rf.thermal.test <- ranger(dependent.variable.name = "Y", data = tmp, num.trees = 1000, split.select.weights = wt, classification = FALSE, mtry = 104.5, num.threads = 1, write.forest = TRUE, always.split.variables = NULL)
df.rf.thermal.test
# Type: Regression
# Number of trees: 1000
# Sample size: 20636
# Number of independent variables: 1240
# Mtry: 104
# Target node size: 5
# Variable importance mode: none
# Splitrule: variance
# OOB prediction error (MSE): 86.17939
# R squared (OOB): 0.2482874
pred <- predict(df.rf.thermal.test, xmat[holdout, ])
pred$predictions
df.dcast.pred <- df.dcast[holdout, ]
pred.df <- data.frame(score = df.dcast.pred$cut.score, pred = pred$predictions)
cor(pred.df$score, pred.df$pred)
# 0.4644311
# salloc -A SYB105 -N 2 -t 4:00:00
# Activate the conda environment and start an interactive R session.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# --- everything below runs inside R ---
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Load the combined feature table and drop every row that has a missing value.
df.dcast <- read.delim("ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", header=T, sep="\t")
df.dcast <- na.omit(df.dcast)
# Four column subsets of the table. Each one starts with column 2, which the
# code further down names "cut.score" and uses as the response; the remaining
# indices select the predictors of that feature group.
# NOTE(review): the index ranges are hard-coded positions; they silently select
# the wrong columns if the layout of the input file changes.
df.raw <- df.dcast[,c(2,28,44,1626:1629,1644,1661)]
df.dwt <- df.dcast[,c(2,14:27,30:43,1612:1625,1630:1643,1647:1660)]
df.onehot <- df.dcast[,c(2,3:13,29,45:83,146:161,225:239,302:317,380:395,458:473,536:551,614:629,692:707,770:785,910:929,1054:1073,1136:1155,1218:1237,1300:1315,1378:1393,1456:1471,1534:1549,1645,1646,1662:1664)]
df.thermal <- df.dcast[,c(2,84:145,162:224,240:301,318:379,396:457,474:535,552:613,630:691,708:769,786:909,930:1053,1074:1135,1156:1217,1238:1299,1316:1377,1394:1455,1472:1533,1550:1611)]
# Iterative random forest (iRF) built on ranger.
#
# Fits a ranger forest repeatedly; after each fit the impurity importances are
# rescaled to sum to 1 and used as split-selection weights for the next fit, so
# later forests concentrate on the features earlier forests found useful.
#
# Arguments:
#   xmat           data frame / matrix of predictors (one column per feature)
#   y              response, one value per row of xmat
#   ntree          number of trees per forest
#   iter           maximum number of iterations
#   classification passed to ranger; FALSE fits a regression forest
#   threads        number of threads ranger may use
#   alwayssplits   passed to ranger as always.split.variables
#   saveall        TRUE: return a list with the forest of every iteration run;
#                  FALSE: return only the forest of the last iteration run
#
# Iteration stops early once fewer than max(1% of the features, 10) features
# keep a positive weight, so inputs with fewer than 10 features always stop
# after the first forest.
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weight per feature
  rfs <- list()
  rf <- NULL
  # seq_len() so that iter = 0 runs no iteration (1:0 would run two)
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    mtry = 0.5*sum(wt>0) # half of the features that still carry weight
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         # mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         mtry = mtry, importance = "impurity", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    # Scale importances so they sum to 1. If every importance is zero there is
    # nothing to scale; use all-zero weights, which triggers the stop below
    # instead of propagating NaN.
    total <- sum(abs(rf$variable.importance))
    if(isTRUE(total > 0)) {
      wt <- rf$variable.importance / total
    } else {
      wt <- rep(0, length(rf$variable.importance))
    }
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if(saveall) rfs[[i]] <- rf
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  # Previously the single forest was only handed back when the loop stopped
  # early; a run that completed every iteration returned an empty list.
  if(!saveall) rfs <- rf
  return(rfs)
}
library(ranger)
library(tidyverse)
library(caret)
################################################################################
# https://stackoverflow.com/questions/37557985/loop-for-repeated-k-fold-cross-validation
library(cvTools)
# Repeated k-fold cross-validation of the iRF model.
# Column 1 of df is the response; every other column is a predictor.
df <- df.dwt
k <- 5 #the number of folds
nsim = 10 # number of repetitions
set.seed(123) # seed BEFORE any fold is drawn so the partitions are reproducible
# one column of out-of-fold predictions per repetition
mypreds <- data.frame(matrix(0, nrow(df), ncol = nsim))
row.names(mypreds) <- row.names(df) # row names for the dataframe
names(mypreds) <- paste("K", seq_len(nsim), sep = "") # column names
df$kfoldlpred <- rep(0,nrow(df)) # scratch column holding the predictions of the current repetition
# the loop for repeated cross-validation
# Reads the globals df, k, nsim and mypreds and returns a copy of mypreds with
# one filled column per repetition.
repeatcv <- function(){
  # The scratch column must never be used as a feature: it holds predictions
  # of the response from earlier folds.
  predictors <- setdiff(names(df), c(names(df)[1], "kfoldlpred"))
  for(rep.id in seq_len(nsim)){
    folds <- cvFolds(NROW(df), K=k) # fresh partition for every repetition
    for(i in seq_len(k)){
      train.rows <- folds$subsets[folds$which != i]
      valid.rows <- folds$subsets[folds$which == i]
      iRF.model <- iRF(df[train.rows, predictors], df[train.rows, 1])
      # iRF may stop before its last iteration, so take whatever forest came last
      iRF.train <- iRF.model[[length(iRF.model)]]
      iRF.validate <- predict(iRF.train, df[valid.rows, predictors])
      df$kfoldlpred[valid.rows] <- iRF.validate$predictions
    }
    mypreds[,rep.id] <- df$kfoldlpred
  }
  return(mypreds)
}
mypreds <- repeatcv()
################################################################################
# kfold
library(dismo)
# Called for its printed value only: the result is not assigned to anything.
kfold(df.onehot, k=5)
################################################################################
# CV
# 5-fold cross-validation of a caret naive Bayes model, keeping the per-fold
# predictions so accuracy can be summarised per fold below.
set.seed(245)
trctrl <- trainControl(method = "cv", number = 5, savePredictions=TRUE)
# NOTE(review): factor(cut.score) turns every distinct score into its own class,
# while cut.score is treated as numeric elsewhere in this file - confirm that a
# classifier is really what is wanted here.
nb_fit <- train(factor(cut.score) ~., data = df.onehot, method = "naive_bayes", trControl=trctrl, tuneLength = 0)
nb_fit
## can't install package "naivebayes"
# Per-fold accuracy: share of held-out rows whose predicted class equals the observed one.
pred <- nb_fit$pred
pred$equal <- ifelse(pred$pred == pred$obs, 1,0)
eachfold <- pred %>%
group_by(Resample) %>%
summarise_at(vars(equal),
list(Accuracy = mean))
eachfold
################################################################################
# R validation
# R program to implement
# validation set approach
# setting seed to generate a
# reproducible random sampling
# Validation-set approach: train on a random 80% of df.dwt, evaluate on the
# remaining 20%, first with a linear model and then with iRF.
set.seed(123) # reproducible random split
# row indices of the training set (80% of the data)
random_sample <- createDataPartition(df.dwt $ cut.score, p = 0.8, list = FALSE)
training_dataset <- df.dwt[random_sample, ]
# every row that was not drawn for training
testing_dataset <- df.dwt[-random_sample, ]
# Linear model: cut.score is the target, all other columns are predictors.
model <- lm(cut.score ~., data = training_dataset)
predictions <- predict(model, testing_dataset)
# model performance on the held-out rows
data.frame( R2 = R2(predictions, testing_dataset $ cut.score),
            RMSE = RMSE(predictions, testing_dataset $ cut.score),
            MAE = MAE(predictions, testing_dataset $ cut.score))
# iRF on the same split.
predictor_names <- setdiff(names(training_dataset), "cut.score")
model.iRF <- iRF(training_dataset[, predictor_names], training_dataset $ cut.score)
# iRF() returns one forest per iteration; predict with the last one and pull
# the numeric vector out of the prediction object before scoring it.
final.iRF <- model.iRF[[length(model.iRF)]]
predictions <- predict(final.iRF, testing_dataset[, predictor_names])$predictions
data.frame( R2 = R2(predictions, testing_dataset $ cut.score),
            RMSE = RMSE(predictions, testing_dataset $ cut.score),
            MAE = MAE(predictions, testing_dataset $ cut.score))
################################################################################
# https://topepo.github.io/caret/using-your-own-model-in-train.html
### caret cross validation with iRF model
################################################################################
# https://cran.r-project.org/web/packages/groupdata2/vignettes/cross-validation_with_groupdata2.html
# Adaptation of the vignette above to the one-hot feature table.
# NOTE(review): partition()/fold() and kable() are not provided by any library()
# call visible in this part of the file (presumably groupdata2 and knitr) -
# load them before running this section.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", header=T, sep="\t")
df.dcast <- na.omit(df.dcast)
# Columns 1 and 2 (referred to below as sgRNAID and cut.score) plus the one-hot features.
df <- df.dcast[,c(1,2,3:13,29,45:83,146:161,225:239,302:317,380:395,458:473,536:551,614:629,692:707,770:785,910:929,1054:1073,1136:1155,1218:1237,1300:1315,1378:1393,1456:1471,1534:1549,1645,1646,1662:1664)]
xpectr::set_test_seed(1) # For reproducibility
# Split data in 20/80 (percentage)
partition(df, p = 0.2, id_col = "sgRNAID") %>%
.[1] %>% # See only the test set
kable() # Pretty tables :)
xpectr::set_test_seed(1) # For reproducibility
# Split data in 20/80 (percentage)
# NOTE(review): cut.score is handled as a numeric score elsewhere in this file;
# passing it as cat_col treats each distinct value as a category - confirm
# whether a numeric balancing column was intended instead.
parts <- partition(df, p = 0.2, id_col = "sgRNAID", cat_col = 'cut.score')
test_set <- parts[[1]]
train_set <- parts[[2]]
# Show test_set
test_set %>% kable()
# Row counts per score and per sgRNA in the training set
train_set %>%
count(cut.score) %>%
kable(align = 'c')
train_set %>%
count(sgRNAID) %>%
kable(align = 'c')
xpectr::set_test_seed(1) # For reproducibility
# Assign each training row to one of 4 folds (adds the .folds column)
train_set <- fold(train_set, k = 4, cat_col = 'cut.score', id_col = 'sgRNAID')
# Order by .folds
train_set <- train_set %>% arrange(.folds)
train_set %>% kable()
train_set %>%
count(sgRNAID, cut.score) %>%
kable(align = 'c')
# k-fold cross-validation of a linear (mixed) model; returns the mean RMSE.
#
# Arguments:
#   data      training set containing a ".folds" column with fold labels 1..k
#   k         number of folds
#   model     model formula, given as a string or a formula object
#   dependent name (string) of the column that is being predicted
#   random    TRUE fits the model with lmer() (random effects), FALSE with lm()
#
# Returns a named numeric vector c(RMSE = <mean of the per-fold RMSEs>).
crossvalidate <- function(data, k, model, dependent, random = FALSE){
  if (is.null(data[[dependent]])) {
    stop("column '", dependent, "' not found in data")
  }
  # Convert once and keep the formula separate from the fitted object, so that
  # every fold is fitted from the formula rather than from the previous fit.
  model_formula <- stats::as.formula(model)
  performances <- numeric(k)
  # One iteration per fold
  for (fold in seq_len(k)){
    # All rows whose fold label differs from the current fold train the model,
    # the rows of the current fold test it.
    training_set <- data[data$.folds != fold,]
    testing_set <- data[data$.folds == fold,]
    if (isTRUE(random)){
      # Linear mixed effects model; unseen grouping levels are allowed at
      # prediction time.
      fit <- lmer(model_formula, training_set, REML = FALSE)
      predicted <- predict(fit, testing_set, allow.new.levels = TRUE)
    } else {
      fit <- lm(model_formula, training_set)
      predicted <- predict(fit, testing_set)
    }
    # Root mean square error between predicted and observed, computed inline so
    # that no extra package is needed for it.
    observed <- testing_set[[dependent]]
    performances[fold] <- sqrt(mean((predicted - observed)^2, na.rm = TRUE))
  }
  # Return the mean of the recorded RMSEs
  return(c('RMSE' = mean(performances)))
}
# NOTE(review): crossvalidate() hands `model` to lm()/lmer() as a model formula,
# so the string below (a call to iRF) is not something it can fit, and
# train_set has a `cut.score` column rather than `score`. Cross-validating iRF
# needs its own loop (see repeatcv) - confirm the intent before running this.
m0 <- 'iRF(train_set[,3:ncol(train_set)], train_set[,1])'
crossvalidate(train_set, k = 4, model = m0, dependent = 'score', random = TRUE)
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J kfold.iRF
#SBATCH -N 4
#SBATCH -t 10:00:00
# Batch job: activates the conda environment, then runs the four k-fold iRF
# R scripts one after another from the e.coli project directory.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# NOTE(review): the scripts run sequentially; a failure of one does not stop the next.
R CMD BATCH kfold.iRF.R
R CMD BATCH kfold.iRF.onehot.R
R CMD BATCH kfold.iRF.thermal.R
R CMD BATCH kfold.iRF.all.R
# Submission commands for this job and for per-feature-set job scripts:
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/kfold.iRF.sh
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/kfold.iRF.dwt.sh
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/kfold.iRF.onehot.sh
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/kfold.iRF.thermal.sh
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/kfold.iRF.all.sh
# Summarise the repeated k-fold predictions for the DWT feature set.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# --- everything below runs inside R ---
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", header=T, sep="\t")
df.dcast <- na.omit(df.dcast)
# presumably one column of predictions per repetition - verify against the script that wrote this file
kfold <- read.delim("kfold.iRF.dwt.txt", header=T, sep="\t")
# Row-wise mean over all columns of the table just read.
kfold$avg <- rowMeans(kfold, na.rm = TRUE)
library(dplyr)
# Attach the first two columns of the feature table (the second is cut.score).
# NOTE(review): cbind pairs rows by position; this assumes kfold.iRF.dwt.txt has
# the same rows in the same order as the na.omit()-ed table - confirm.
kfold.df <- cbind(df.dcast[,1:2], kfold)
kfold.df$score <- as.numeric(kfold.df$cut.score)
# Pearson correlation between the observed score and the averaged prediction.
cor(kfold.df$score, kfold.df$avg)
# salloc -A SYB105 -N 2 -t 4:00:00
# Write the iRF input tables that keep the sample ID (column 1) as first column.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# --- everything below runs inside R ---
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.CWT.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", header=T, sep="\t")
df.dcast <- na.omit(df.dcast)
## sample ID
# Every subset starts with column 1 (the sample ID).
df.raw <- df.dcast[,c(1,28,44,1626:1629,1644,1661)]
df.dwt <- df.dcast[,c(1,14:27,30:43,1612:1625,1630:1643,1647:1660)]
df.onehot <- df.dcast[,c(1,3:13,29,45:83,146:161,225:239,302:317,380:395,458:473,536:551,614:629,692:707,770:785,910:929,1054:1073,1136:1155,1218:1237,1300:1315,1378:1393,1456:1471,1534:1549,1645,1646,1662:1664)]
df.thermal <- df.dcast[,c(1,84:145,162:224,240:301,318:379,396:457,474:535,552:613,630:691,708:769,786:909,930:1053,1074:1135,1156:1217,1238:1299,1316:1377,1394:1455,1472:1533,1550:1611)]
# Combine feature groups. Only the first table keeps its ID column: cbind()
# does not merge equally named columns, so binding the subsets as they are
# would repeat the ID column once per subset and feed it to iRF as a feature.
df.all <- cbind(df.raw, df.dwt[,-1], df.onehot[,-1], df.thermal[,-1])
df.raw.onehot <- cbind(df.raw, df.onehot[,-1])
df.score <- df.dcast[,c(1,2)] # sample ID + response
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run")
# Each table is written twice with identical content: <name>_overlap.txt and <name>.txt
out.tables <- list(df.raw = df.raw, df.onehot = df.onehot, df.thermal = df.thermal,
                   df.raw.onehot = df.raw.onehot, df.all = df.all, df.score = df.score)
for (nm in names(out.tables)) {
  for (suffix in c("_overlap.txt", ".txt")) {
    write.table(out.tables[[nm]], paste0(nm, suffix), quote=F, row.names=F, sep="\t")
  }
}
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
# Same feature groups as above, without the sample-ID column.
raw.cols <- c(28,44,1626:1629,1644,1661)
dwt.cols <- c(14:27,30:43,1612:1625,1630:1643,1647:1660)
onehot.cols <- c(3:13,29,45:83,146:161,225:239,302:317,380:395,458:473,536:551,614:629,692:707,770:785,910:929,1054:1073,1136:1155,1218:1237,1300:1315,1378:1393,1456:1471,1534:1549,1645,1646,1662:1664)
thermal.cols <- c(84:145,162:224,240:301,318:379,396:457,474:535,552:613,630:691,708:769,786:909,930:1053,1074:1135,1156:1217,1238:1299,1316:1377,1394:1455,1472:1533,1550:1611)
df.raw <- df.dcast[,raw.cols]
df.dwt <- df.dcast[,dwt.cols]
df.onehot <- df.dcast[,onehot.cols]
df.thermal <- df.dcast[,thermal.cols]
df.all <- cbind(df.raw, df.dwt, df.onehot, df.thermal)
df.raw.onehot <- cbind(df.raw, df.onehot)
# Response table: a single column named cut.score (column 2 of the input).
df.score <- data.frame(cut.score = df.dcast[,2])
# All files go to the iRF run directory; the file name is the object name plus a fixed suffix.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run")
out.tables <- list(df.raw = df.raw, df.onehot = df.onehot, df.thermal = df.thermal,
                   df.raw.onehot = df.raw.onehot, df.all = df.all, df.score = df.score)
for (nm in names(out.tables)) {
  write.table(out.tables[[nm]], paste0(nm, "_overlap_noSampleIDs.txt"), quote=F, row.names=F, sep="\t")
}
# run python scripts on Andes
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
irf_run=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
irf_setup=/gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
cd "$irf_run"
module load python/3.7-anaconda3
# One builder call per feature set, run from that feature set's directory.
# Each spec is "<run directory> <feature table> <total nodes> <run name>".
for spec in \
  "raw.features df.raw.txt 10 iRF.RawFeatures" \
  "onehot.features df.onehot.txt 50 iRF.OnehotFeatures" \
  "tensor.features df.thermal.txt 50 iRF.TensorFeatures" \
  "raw.onehot.features df.raw.onehot.txt 50 iRF.RawOnehotFeatures" \
  "all.features df.all.txt 50 iRF.AllFeatures"; do
  read -r subdir table nodes run_name <<<"$spec"
  cd "${irf_run}/${subdir}"
  python "$irf_setup" --System Summit --NodesPer 1 --TotalNodes "$nodes" --RunTime 20 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName "$run_name" --bypass --Prediction "${irf_run}/${table}" "${irf_run}/df.score.txt"
done
# switch to Summit command line and submit jobs
module load python/3.7.0-anaconda3-5.3.0
irf_run=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
# Submit the generated job script of one stage (full | train | test) for every
# feature set. Each spec is "<run directory>:<run-name suffix>".
submit_stage() {
  local stage=$1 spec
  for spec in raw.features:RawFeatures onehot.features:OnehotFeatures \
    tensor.features:TensorFeatures raw.onehot.features:RawOnehotFeatures \
    all.features:AllFeatures; do
    bsub "${irf_run}/${spec%%:*}/Submits/submit_${stage}_iRF.${spec#*:}_0.sh"
  done
}
# full
submit_stage full
# train
submit_stage train
# once the train submissions are done run the test submissions
# test
submit_stage test
# Post-Processing
# run python scripts on Andes
# postprocessing: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py
# [python iRF_postProcessing.py --YNames --RunName --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,medianAE --varTot 95]
irf_run=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
irf_post=/gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py
cd "$irf_run"
module load python/3.7-anaconda3
# One post-processing call per feature set, run from that feature set's
# directory. Each spec is "<run directory>:<run name>".
for spec in raw.features:iRF.RawFeatures onehot.features:iRF.OnehotFeatures \
  tensor.features:iRF.TensorFeatures raw.onehot.features:iRF.RawOnehotFeatures \
  all.features:iRF.AllFeatures; do
  cd "${irf_run}/${spec%%:*}"
  python "$irf_post" --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 "${irf_run}/YNames.txt" "${spec#*:}"
done
# why is the R2 so low but the pearson correlation is higher?? R versus R2... which is the better value to use??
# NOTE(review): the two are on different scales. A Pearson r of ~0.49 (below)
# corresponds to r^2 of ~0.24, and a coefficient of determination computed as
# 1 - SSE/SST on held-out data can be lower still, because it also penalises
# bias and scale errors that correlation ignores. Confirm how
# iRF_postProcessing.py computes its R2 before comparing the numbers.
# Each stanza below reads the test-set predictions and observed scores of one
# fold/set and prints their Pearson correlation.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/all.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("iRF.AllFeatures_Set4_test.prediction", header=T, sep="\t")
head(pred)
# Predictions.
# 1 19.6021
# 2 17.4961
# 3 21.3566
# 4 24.3650
# 5 22.0712
# 6 22.6134
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
head(y)
# cut.score
# 1 37.05
# 2 16.07
# 3 29.66
# 4 29.08
# 5 34.90
# 6 18.89
# Rows of y and pred are paired by position.
cor(y$cut.score, pred$Predictions.)
#[1] 0.4907236
## just tensors
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/tensor.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("iRF.TensorFeatures_Set4_test.prediction", header=T, sep="\t")
head(pred)
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
head(y)
cor(y$cut.score, pred$Predictions.)
#[1] 0.4841755
## just onehot
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/onehot.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("iRF.OnehotFeatures_Set4_test.prediction", header=T, sep="\t")
head(pred)
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
head(y)
cor(y$cut.score, pred$Predictions.)
#[1] 0.4239201
## raw + onehot
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/raw.onehot.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("iRF.RawOnehotFeatures_Set4_test.prediction", header=T, sep="\t")
head(pred)
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
head(y)
cor(y$cut.score, pred$Predictions.)
#[1] 0.4403425
### All Features Run - Feature Importance
### topVarEdges
# gene.dwtd6 cut.score 0.014782717426879869
# sgRNA.tempsgRNA.raw cut.score 0.014925077384290612
# structure.dwtd1 cut.score 0.015263337099152476
# gene.dwtd4 cut.score 0.015452973443115732
# gene.dwtd2 cut.score 0.015858573699601423
# rna.dwtd3 cut.score 0.015880371793999884
# gene.dwtd3 cut.score 0.016087630623369688
# gcraw cut.score 0.021291217930665164
# tempraw cut.score 0.021523884279155255
# p20homo_lumo_energygapraw cut.score 0.022698752026108784
### normalizedEdgeFiles
# p13rot_temp_yraw cut.score 9.670675038754563e-06
# p7num_Natomsraw cut.score 9.727187306212915e-06
# p5num_Catomsraw cut.score 9.730018227563366e-08
# p6num_bondsraw cut.score 9.755178041065488e-06
# p12num_Catomsraw cut.score 9.762326117475374e-07
# p9relativenum_Natomsraw cut.score 9.764767787140137e-06
# p12lumo_energyraw cut.score 9.783310321985583e-06
# p14molecular_weightraw cut.score 9.785928924234751e-05
# p14tot_dipoleraw cut.score 9.823084766959402e-05
# p9rot_temp_zraw cut.score 9.84799687484336e-06
### 500bp windows
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
bedtools getfasta -fi genome/GCF_000005845.2_ASM584v2_genomic.fna -bed ecoli.500bp.windows.bed -fo ecoli.500bp.fa
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
def nucleotide_row(name, seq):
    """Return one TSV line with the A/C/G/T counts, length and GC fraction of a sequence.

    name: label written in the first column.
    seq:  the sequence (anything str() turns into the bases); counting is
          case-insensitive so soft-masked (lowercase) bases are included.
    The GC fraction of an empty sequence is reported as 0 instead of raising
    ZeroDivisionError.
    """
    bases = str(seq).upper()
    a_count, c_count, g_count, t_count = (bases.count(b) for b in "ACGT")
    length = len(bases)
    cg_fraction = float(c_count + g_count) / length if length else 0.0
    return '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % (
        name, a_count, c_count, g_count, t_count, length, cg_fraction)

def write_nucleotide_counts(fasta_path, tsv_path):
    """Write one nucleotide_row() line per FASTA record, preceded by a header line."""
    from Bio import SeqIO  # Biopython; imported here so the helper above has no dependency
    # 'with' closes both files even if parsing fails part-way through
    with open(fasta_path, 'r') as input_file, open(tsv_path, 'w') as output_file:
        output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
        for cur_record in SeqIO.parse(input_file, "fasta"):
            output_file.write(nucleotide_row(cur_record.name, cur_record.seq))

# When pasting into an interactive session, follow this block with a blank line.
if __name__ == "__main__":
    write_nucleotide_counts('ecoli.500bp.fa', 'nucleotide_counts_500bp.tsv')
exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Per-window nucleotide counts written by the Python step above.
df <- read.delim("nucleotide_counts_500bp.tsv", header=T, sep="\t")
# Append the melting temperature: 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
df.melt <- df
df.melt$MeltingTemp <- with(df, 64.9 + 41 * (G+C-16.4) / (A+T+G+C))
write.table(df.melt, "nucleotide_counts_500bp_temp.txt", quote=F, row.names=F, sep="\t")
q()
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py --file ecoli.500bp.fa --type AAC --out 500bp.protein.structure.fa
# Attach genome coordinates to the RNA-seq expression table, then average the
# expression per 500 bp window.
# NOTE(review): the lines below are R, but no `R` start command precedes them here.
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/genome")
# sed '1d' GCF_000005845.2_ASM584v2_genomic.gff | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' > GCF_000005845.2_ASM584v2_genomic.txt
annotation <- read.delim("GCF_000005845.2_ASM584v2_genomic.txt", header=F, sep="\t")
# Keep the "gene" features and take the 7 characters that follow "EcoGene:"
# in the attribute column (V9) as gene_id.
gene <- subset(annotation, annotation$V3 == "gene")
gene.id <- separate(gene, V9, c("id1", "id2"), sep="EcoGene:")
gene.id$gene_id <- substr(gene.id$id2, 1, 7)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
rna <- read.delim("GSM2267479_Sample-1.genes.results.txt", header=T, sep="\t")
# Add the annotation columns to every expression row; rows without a matching
# gene_id get NA and are removed by na.omit() on the next line.
rna.id <- left_join(rna, gene.id, by="gene_id")
rna.id.idf <- na.omit(rna.id[,c(8,11,12,1,3:7)])
write.table(rna.id.idf, "GSM2267479.fpkm.coord.txt", quote=F, row.names=F, sep="\t")
####
# Shell step (run outside R).
# NOTE(review): this reads GSM2267479.fpkm.coord.bed while the R step above
# writes GSM2267479.fpkm.coord.txt - the conversion between them is not shown.
bedtools intersect -wo -a ecoli.500bp.windows.bed -b GSM2267479.fpkm.coord.bed > ecoli.rnaseq.windows500.bed
####
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# NOTE(review): the file read here ("ecoli.rbs.rnaseq...") is not the file the
# bedtools step above writes ("ecoli.rnaseq...") - confirm which is intended.
window <- read.delim("ecoli.rbs.rnaseq.windows500.bed", header=F, sep="\t")
# Mean of column V12 per window (V1, V2, V3), then one row per window.
window.df <- window %>% group_by(V1, V2, V3) %>% mutate(avg.fpkm = mean(V12))
window.uniq <- unique(window.df[,c(1:3,14)])
write.table(window.uniq, "ecoli.rbs.rnaseq.average.windows500.bed", quote=F, row.names=F, sep="\t")
# structure
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cp /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction/ecoli_sgRNA_fullseq.fasta /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py --file ecoli_sgRNA_fullseq.fasta --type AAC --out ecoli.gRNA.rbs.fullseq.structure.txt
# melting temp & gc content
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
def nucleotide_row(name, seq):
    """Return one TSV line with the A/C/G/T counts, length and GC fraction of a sequence.

    name: label written in the first column.
    seq:  the sequence (anything str() turns into the bases); counting is
          case-insensitive so soft-masked (lowercase) bases are included.
    The GC fraction of an empty sequence is reported as 0 instead of raising
    ZeroDivisionError.
    """
    bases = str(seq).upper()
    a_count, c_count, g_count, t_count = (bases.count(b) for b in "ACGT")
    length = len(bases)
    cg_fraction = float(c_count + g_count) / length if length else 0.0
    return '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % (
        name, a_count, c_count, g_count, t_count, length, cg_fraction)

def write_nucleotide_counts(fasta_path, tsv_path):
    """Write one nucleotide_row() line per FASTA record, preceded by a header line."""
    from Bio import SeqIO  # Biopython; imported here so the helper above has no dependency
    # 'with' closes both files even if parsing fails part-way through
    with open(fasta_path, 'r') as input_file, open(tsv_path, 'w') as output_file:
        output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
        for cur_record in SeqIO.parse(input_file, "fasta"):
            output_file.write(nucleotide_row(cur_record.name, cur_record.seq))

# When pasting into an interactive session, follow this block with a blank line.
if __name__ == "__main__":
    write_nucleotide_counts('ecoli_sgRNA_fullseq.fasta', 'gRNA_rbs_fullseq_nuc_counts.tsv')
exit()
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Per-sequence nucleotide counts written by the Python step above.
df <- read.delim("gRNA_rbs_fullseq_nuc_counts.tsv", header=T, sep="\t")
# Append the melting temperature: 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
df.melt <- df
df.melt$MeltingTemp <- with(df, 64.9 + 41 * (G+C-16.4) / (A+T+G+C))
write.table(df.melt, "gRNA_rbs_fullseq_nuc_counts_temp.txt", quote=F, row.names=F, sep="\t")
q()
#python path/to/encode_sequences.py path/to/data.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
python encode_sequences.py rbs.fullseq.txt
# separate nucleotide sequence values into individual columns in data frame so each position counts as one feature
#######################################
# Print the space-separated header line for an encoded-sequence table:
# "sgRNAID" followed by one name per feature, bases in the order A C T G.
# Arguments:
#   $1 - number of sequence positions; 0 for position-independent features
#        (names "A", "AC", ...), otherwise names are "p<position>.<k-mer>"
#   $2 - k-mer length: 1 (single nucleotides) or 2 (dinucleotides)
# Outputs: the header line on stdout
#######################################
encoding_header() {
  awk -v npos="$1" -v k="$2" 'BEGIN {
    split("A C T G", base, " ")
    n = 0
    for (i = 1; i <= 4; i++) {
      if (k == 1) {
        kmer[++n] = base[i]
      } else {
        for (j = 1; j <= 4; j++) kmer[++n] = base[i] base[j]
      }
    }
    printf "sgRNAID"
    if (npos == 0) {
      for (m = 1; m <= n; m++) printf " %s", kmer[m]
    } else {
      for (p = 1; p <= npos; p++)
        for (m = 1; m <= n; m++) printf " p%d.%s", p, kmer[m]
    }
    printf "\n"
  }'
}

#######################################
# Turn an encode_sequences.py output file into a table with one column per
# encoded character: drop its first two lines, put a space after every
# character of field 2, prepend the generated header, keep the first N fields.
# Arguments:
#   $1 - input file    $2 - output file
#   $3 - number of positions for the header (0 = position-independent)
#   $4 - k-mer length (1 or 2)
#   $5 - number of space-separated fields to keep (ID column included)
# Returns: 1 if the input file cannot be read
#######################################
split_encoding() {
  local input=$1 output=$2 npos=$3 k=$4 nfields=$5
  if [[ ! -r "$input" ]]; then
    printf 'split_encoding: cannot read %s\n' "$input" >&2
    return 1
  fi
  {
    encoding_header "$npos" "$k"
    sed '1,2d' "$input" | awk '{gsub(/./,"& ",$2);print $0}'
  } | cut -d ' ' -f "1-${nfields}" > "$output"
}

split_encoding rbs.fullseq_independent1.txt rbs.fullseq_ind1.txt 0 1 5
split_encoding rbs.fullseq_independent2.txt rbs.fullseq_ind2.txt 0 2 17
split_encoding rbs.fullseq_dependent1.txt rbs.fullseq_dep1.txt 32 1 129
# The header is generated for 32 positions; the field limit of 497 keeps the
# ID plus the dinucleotide features of positions 1-31.
split_encoding rbs.fullseq_dependent2.txt rbs.fullseq_dep2.txt 32 2 497
# thermal tensors
# Purpose: attach the per-base values from the nucleotide "tensor" table to every
# position of every sgRNA sequence, then write the result in long and wide form.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# Drop the first line of rbs.fullseq.txt, put a space after every character of
# field 2 (one column per base), insert a header (sgRNAID, p1..p32) and keep
# fields 1-33 (ID + 32 positions).
sed '1d' rbs.fullseq.txt | awk '{gsub(/./,"& ",$2);print $0}' | sed '1i sgRNAID p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 p16 p17 p18 p19 p20 p21 p22 p23 p24 p25 p26 p27 p28 p29 p30 p31 p32' | cut -d ' ' -f 1-33 > rbs.fullseq_sequence.txt
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Everything from here to the write.table() calls is typed into the R session.
R
library(dplyr)
library(reshape2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
tensor <- read.delim("protein_rna_dna-vector_lee_nucleotide_dna_data.txt", header=T, sep="\t", stringsAsFactors = F)
seq <- read.delim("rbs.fullseq_sequence.txt", header=T, sep=" ", stringsAsFactors = F)
# tensor.features is not used again in this block.
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
# Columns 2:5 are transposed so that each of the four columns becomes one row.
tensor.df <- tensor[,2:5]
tensor.t <- as.data.frame(t(tensor.df))
# NOTE(review): assumes columns 2:5 of the tensor file are ordered A, C, G, T -- confirm
# against the file header, the labels are assigned by position only.
tensor.t$base <- c("A", "C", "G", "T")
rownames(seq) <- seq[,1]
# seq.df is not used again in this block (the melt below works on seq itself).
seq.df <- seq[,2:32]
# Long format: one row per (sgRNAID, position) holding the base at that position.
seq.melt <- melt(seq, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# Look up the tensor values of each base, then melt again to one row per
# (sgRNAID, position, base, tensor variable).
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# Wide format: one row per sgRNAID, one column per position_variable combination.
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "ecoli.sgRNA.fullseq.thermal.tensors.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "ecoli.sgRNA.fullseq.thermal.tensors.melt.txt", quote=F, row.names=F, sep="\t")
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R session: load the 500 bp window tracks and the sgRNA scores, and reduce each
# track to one vector (one value per window) for the wavelet step that follows.
R
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gene <- read.delim("ecoli.gene.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.delim("500bp.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.delim("ecoli.500bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.delim("sgRNA.rbs.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNAID", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
seq <- read.delim("rbs.fullseq.score.txt", header=T, sep="\t", stringsAsFactors = F)
# 3644
# Keep only rows that have a full sequence.
# NOTE(review): the comparison is against the string "NA"; rows whose fullseq was read
# as a real NA are dropped because subset() discards NA conditions -- !is.na() would
# state the intent directly.
fullseq.df <- subset(seq, seq$fullseq != "NA")
# 1772
score.fullseq <- inner_join(score, fullseq.df, by="sgRNAID")
colnames(score.fullseq) <- c("chr", "start", "end", "sgRNAID", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality", "RBS.ID", "fullseq", "score")
# Columns 1:4 and 8 = chr, start, end, sgRNAID, cut.score (per the names set above).
score.df <- score.fullseq[,c(1:4,8)]
# 1763
colnames(window) <- c("chr", "start", "end")
# Count the rows of the gene bed per (V1, V2, V3) window, then keep one row per window.
# NOTE(review): column 14 is taken as gene.count, i.e. this assumes the bed file has
# exactly 13 columns before mutate() appends the count -- confirm.
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
colnames(gene.count) <- c("chr", "start", "end", "gene.count")
# Windows without a match get NA from the left join and are set to 0.
gene.window <- left_join(window, gene.count, by=c("chr", "start", "end"))
gene.window[is.na(gene.window)] <- 0
gene.df <- gene.window$gene.count
colnames(rnaseq) <- c("chr", "start", "end", "avg.fpkm")
rnaseq.window <- left_join(window, rnaseq, by=c("chr", "start", "end"))
rnaseq.window[is.na(rnaseq.window)] <- 0
rna.df <- rnaseq.window$avg.fpkm
# Single columns picked by position; the later column names ("structure", "gc", "temp")
# suggest their meaning -- TODO confirm against the file headers.
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
# Run DWT instead of CWT
# The same four statements were pasted once per track; they are folded into one helper
# so that every track is guaranteed to be processed identically. The intermediate
# objects (*.modwt, *.modwt.df, *.modwt.label) are no longer left in the workspace;
# only the *.modwt.name tables used by the joins further down are created.
#
# modwt.long(signal, value.name)
#   signal     : numeric vector, one value per window
#   value.name : name given to the coefficient column of the result
#   returns    : data frame with columns scale, window, <value.name>, obtained by
#                running wavMODWT(), converting the result with as.matrix() and
#                splitting each row label at its non-alphanumeric separator into
#                its two parts
modwt.long <- function(signal, value.name) {
  coef.mat <- as.matrix(wavMODWT(signal))
  labelled <- data.frame(label = row.names(coef.mat), coef.mat)
  long <- labelled %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
  colnames(long) <- c("scale", "window", value.name)
  long
}
temp.modwt.name <- modwt.long(temp.df, "temp.dwt")
gc.modwt.name <- modwt.long(gc.df, "gc.dwt")
structure.modwt.name <- modwt.long(structure.df, "structure.dwt")
rna.modwt.name <- modwt.long(rna.df, "rna.dwt")
gene.modwt.name <- modwt.long(gene.df, "gene.dwt")
# Join the sgRNA scores to the 500 bp windows they overlap, attach the wavelet
# coefficients of every track, and build the long table `df`
# (cut.score, scale, sgRNAID, variable, value).
colnames(window) <- c("chr", "start", "end")
# Zero-based window index stored as character; it is the join key against the
# "window" column of the *.modwt.name tables.
# NOTE(review): presumably this matches the numbering in the MODWT row labels -- confirm.
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
library(tidygenomics)
window.score <- genome_intersect(window, score.df, by=c("chr", "start", "end"))
#window.score.df <- left_join(window, window.score[,2:4], by=c("window"))
# Columns are picked and reordered by position; the colnames() call further down names
# the result chr, start, end, window, sgRNA, cut.score.
window.score.df <- left_join(window.score[,2:4], window, by=c("window"))
window.score.df <- window.score.df[,c(4:6,1:3)]
# The first join adds one row per scale for each window; the following joins match
# on both window and scale.
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name, by=c("window", "scale"))
# NOTE(review): compares against the string "NA"; rows with a real NA cut.score are
# dropped because subset() discards NA conditions.
window.temp.gc.structure.rna.gene.sgRNA <- subset(window.temp.gc.structure.rna.gene, window.temp.gc.structure.rna.gene$cut.score != "NA")
df.melt <- melt(window.temp.gc.structure.rna.gene.sgRNA[,5:12], id=c("cut.score", "scale", "sgRNAID"))
df <- na.omit(df.melt)
## add 500bp raw data
# Give every raw track the same zero-based character window index and the scale
# label "raw", so it can be joined and stacked like the wavelet features.
structure$window <- seq.int(nrow(structure))
structure$window <- as.character(structure$window-1)
gene.count$window <- seq.int(nrow(gene.count))
gene.count$window <- as.character(gene.count$window-1)
nuc$window <- seq.int(nrow(nuc))
nuc$window <- as.character(nuc$window-1)
rnaseq$window <- seq.int(nrow(rnaseq))
rnaseq$window <- as.character(rnaseq$window-1)
structure$scale <- "raw"
gene.count$scale <- "raw"
nuc$scale <- "raw"
rnaseq$scale <- "raw"
# NOTE(review): the positional picks below (nuc[,8:10], gene.count[,4:6], rnaseq[,4:6],
# structure[,c(2,22,23)]) assume window and scale were appended as the last two columns
# of tables with 8, 4, 4 and 21 original columns respectively -- confirm.
window.score.temp <- left_join(window.score.df, nuc[,8:10], by="window")
window.temp.gc <- left_join(window.score.temp, nuc[,c(7,9:10)], by=c("window", "scale"))
window.temp.gc.gene <- left_join(window.temp.gc, gene.count[,c(4:6)], by=c("window", "scale"))
window.temp.gc.gene.rna <- left_join(window.temp.gc.gene, rnaseq[,c(4:6)], by=c("window", "scale"))
window.temp.gc.gene.rna.structure <- left_join(window.temp.gc.gene.rna, structure[,c(2,22,23)], by=c("window", "scale"))
colnames(window.temp.gc.gene.rna.structure) <- c("chr", "start", "end", "window", "sgRNA", "cut.score", "temp", "scale", "gc", "gene", "rna", "structure")
df2.melt <- melt(window.temp.gc.gene.rna.structure[,5:12], id=c("cut.score", "scale", "sgRNA"))
# Rename so the columns line up with df before stacking.
colnames(df2.melt) <- c("cut.score", "scale", "sgRNAID", "variable", "value")
df2 <- na.omit(df2.melt)
df <- rbind(df, df2)
# add sgRNA raw data
# Per-sgRNA (not per-window) structure / temperature / GC values plus the one-hot
# sequence encodings are joined onto the scored sgRNAs, melted and stacked onto df.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# structure and nuc are overwritten here with the per-sgRNA tables.
structure <- read.delim("ecoli.gRNA.rbs.fullseq.structure.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("gRNA_rbs_fullseq_nuc_counts_temp.txt", header=T, sep="\t", stringsAsFactors = F)
structure$scale <- "sgRNA.raw"
nuc$scale <- "sgRNA.raw"
# The first column of each table is copied into sgRNA and sgRNAID to serve as join key.
structure$sgRNA <- structure[,1]
nuc$sgRNA <- nuc[,1]
structure$sgRNAID <- structure$sgRNA
nuc$sgRNAID <- nuc$sgRNA
# NOTE(review): structure[,c(2,22,24)] and nuc[,c(8,9,11)] / nuc[,c(7,9,11)] pick the
# value, scale and sgRNAID columns by position, assuming 21 and 8 original columns --
# confirm against the files.
window.score.structure <- left_join(window.score.df, structure[,c(2,22,24)], by="sgRNAID")
window.score.structure.temp <- left_join(window.score.structure, nuc[,c(8,9,11)], by=c("sgRNAID", "scale"))
window.score.structure.temp.gc <- left_join(window.score.structure.temp, nuc[,c(7,9,11)], by=c("sgRNAID", "scale"))
colnames(window.score.structure.temp.gc) <- c("chr", "start", "end", "window", "sgRNAID", "cut.score", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
## add one-hot encoding of sequence
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
onehot.ind1 <- read.delim("rbs.fullseq_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("rbs.fullseq_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("rbs.fullseq_dep1.txt", header=T, sep=" ")
onehot.dep2 <- read.delim("rbs.fullseq_dep2.txt", header=T, sep=" ")
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
# Drops the first row of the merged table.
# NOTE(review): the reason is not visible here (possibly a leftover title row) -- confirm
# that no real sgRNA is lost.
onehot <- onehot[2:nrow(onehot),]
onehot$scale <- "sgRNA.raw"
# Same names as already assigned above (repeated assignment, no effect).
colnames(window.score.structure.temp.gc) <- c("chr", "start", "end", "window", "sgRNAID", "cut.score", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
data.onehot <- left_join(window.score.structure.temp.gc, onehot, by=c("sgRNAID", "scale"))
# Columns 5:370 = sgRNAID, cut.score, the sgRNA-level values and the one-hot columns.
df2.melt <- melt(data.onehot[,c(5:370)], id=c("cut.score", "scale", "sgRNAID"))
df2 <- na.omit(df2.melt)
colnames(df) <- c("cut.score", "scale", "sgRNAID", "variable", "value")
df <- rbind(df, df2)
# Feature name = variable immediately followed by scale (no separator).
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# Values that cannot be converted become NA and are removed together with empty ones.
df.id$value <- as.numeric(df.id$value)
df.id <- df.id[!(is.na(df.id$value) | df.id$value==""), ]
colnames(df.id) <- c("cut.score", "feature.scale", "sgRNAID", "value")
# Append the per-position thermal-tensor features to the long feature table, cast it
# to one row per sgRNA and write the long and wide tables.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
tensor <- read.delim("ecoli.sgRNA.fullseq.thermal.tensors.melt.txt", header=T, sep="\t")
tensor[is.na(tensor)] <- 0
tensor$scale <- "raw"
# Feature name = position, variable and scale pasted together without separator.
tensor.id <- tensor %>% unite(feature.scale, c(position, variable, scale), sep = "")
tensor.id$value <- as.numeric(tensor.id$value)
#tensor.id.na <- tensor.id[!(is.na(tensor.id$value) | tensor.id$value==""), ]
tensor.id[is.na(tensor.id)] <- 0
# One (cut.score, sgRNAID) pair per scored sgRNA; the inner join keeps only tensor rows
# of sgRNAs that are present in df.id.
df.score <- unique(df.id[,c(1,3)])
tensor.score <- inner_join(tensor.id, df.score, by="sgRNAID")
# Reorder by position so the columns line up with df.id for rbind().
# NOTE(review): expected order is cut.score, feature.scale, sgRNAID, value -- the two
# head() calls below are the visual check.
tensor.score.order <- tensor.score[,c(5,2,1,4)]
head(df.id)
head(tensor.score.order)
# tensor.df is overwritten here with the combined long table of all features.
tensor.df <- rbind(df.id, tensor.score.order)
# Wide table; duplicate (sgRNAID, cut.score, feature) entries are averaged.
df.dcast <- tensor.df %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
# 1763
# df.dcast.na (rows without any NA) is only computed here; it is not written to a file.
df.dcast.na <- na.omit(df.dcast)
# 735
write.table(tensor.df, "ecoli.features.sgRNA.fullseq.DWT.raw.onehot.sgRNAraw.thermaltensor.txt", quote=F, row.names=F, sep="\t")
# The "dcast.na" file receives df.dcast with its NA values still in place ...
write.table(df.dcast, "ecoli.features.sgRNA.fullseq.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.na.txt", quote=F, row.names=F, sep="\t")
# ... and the "dcast" file receives the same table with NA replaced by 0.
df.dcast[is.na(df.dcast)] <- 0
write.table(df.dcast, "ecoli.features.sgRNA.fullseq.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", quote=F, row.names=F, sep="\t")
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J caret.gbm
#SBATCH -N 4
#SBATCH -t 10:00:00
# Batch script: runs the caret GBM R script of every feature set, one after another.
# A failing R script does not stop the remaining ones.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
for feature_set in raw dwt onehot.ind onehot.dep onehot thermal raw.onehot raw.onehot.thermal all; do
  R CMD BATCH "caret.gbm.sgRNAfullseq.${feature_set}.R"
done
# Submit with:
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/caret.gbm.sgRNAfullseq.sh
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R session: caret GBM trained on the one-hot sequence features only.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.fullseq.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", header=T, sep="\t")
df.dcast <- na.omit(df.dcast)
#df.dcast[is.na(df.dcast)] <- 0
#colnames(df.dcast)
# Feature groups are selected by hard-coded column position. Column 2 (cut.score, the
# response) is the first column of every group.
# NOTE(review): the positions depend on the column order of the dcast file -- re-check
# them with colnames(df.dcast) whenever that file is regenerated.
df.raw <- df.dcast[,c(2,28,44,2386:2389,2404,2421)]
df.dwt <- df.dcast[,c(2,14:27,30:43,2372:2385,2390:2403,2407:2420)]
df.onehot.ind <- df.dcast[,c(2,3:13,29,45:47,2405:2406,2422:2424)]
df.onehot.dep <- df.dcast[,c(2,48:87,150:169,232:251,314:333,396:403,466:469,532:535,598:601,664:667,730:733,858:881,944:947,1010:1013,1076:1079,1142:1145,1208:1211,1274:1277,1340:1343,1406:1409,1472:1475,1600:1623,1686:1689,1752:1755,1880:1899,1962:1981,2044:2063,2126:2145,2208:2227,2290:2309)]
df.thermal.sgRNA <- df.dcast[,c(2,88:149,170:231,252:313,334:395,404:465,470:531,536:597,602:663,668:729,734:857,882:943,948:1009,1014:1075,1080:1141,1146:1207,1212:1273,1278:1339,1344:1405,1410:1471,1476:1599,1624:1685,1690:1751,1756:1879,1900:1961,1982:2043,2064:2125,2146:2207,2228:2289,2310:2371)]
# onehot
# Keep the response exactly once. cbind() of the groups as they are would carry one
# cut.score copy per group; duplicated column names may be made unique (cut.score.1, ...)
# when the rows are subset below, and `cut.score ~ .` would then use copies of the
# response as predictors.
data = cbind(df.onehot.ind, df.onehot.dep[, -1, drop = FALSE])
library(caret)
set.seed(998)
# 75 % of the rows for training, the remainder held out for testing.
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing  <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated ten times
                           repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score ~ ., data = training,
                 method = "gbm",
                 trControl = fitControl,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)
gbmFit1
head(summary(gbmFit1))
# Predict the held-out rows and correlate predicted with measured score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
library(ggplot2)
pdf("gbmFit.onehot.cor.pdf")
ggplot(pred.df, aes(x=pred.score, y=exp.score)) + geom_point() + theme_classic()
dev.off()
# A caret train object is not a table, so write.table() cannot serialise it; write its
# resampling summary (one row per tuning-parameter combination) instead.
write.table(gbmFit1$results, "gbmFit.onehot.txt", quote=F, row.names=F)
# test with features from manuscript and GBR criteria
## min_samples_split = 250, min_samples_leaf = 20, max_features = 86, subsample = 0.95, max_depth = 7, learning_rate = 0.05, n_estimators = 500
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R session: caret GBM trained on the sgRNA raw features plus the one-hot features.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.fullseq.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", header=T, sep="\t")
df.dcast <- na.omit(df.dcast)
# Feature groups by hard-coded column position; column 2 (cut.score, the response) is
# the first column of every group.
# NOTE(review): re-check the positions with colnames(df.dcast) if the file is rebuilt.
df.raw <- df.dcast[,c(2,28,44,2386:2389,2404,2421)]
df.onehot.ind <- df.dcast[,c(2,3:13,29,45:47,2405:2406,2422:2424)]
df.onehot.dep <- df.dcast[,c(2,48:87,150:169,232:251,314:333,396:403,466:469,532:535,598:601,664:667,730:733,858:881,944:947,1010:1013,1076:1079,1142:1145,1208:1211,1274:1277,1340:1343,1406:1409,1472:1475,1600:1623,1686:1689,1752:1755,1880:1899,1962:1981,2044:2063,2126:2145,2208:2227,2290:2309)]
# sgRNA raw + sgRNA onehot
# Keep the response exactly once: extra cut.score copies may be renamed
# (cut.score.1, ...) when rows are subset below and would then enter `cut.score ~ .`
# as predictors.
data = cbind(df.raw, df.onehot.ind[, -1, drop = FALSE], df.onehot.dep[, -1, drop = FALSE])
library(caret)
set.seed(998)
# 80 % of the rows for training, the remainder held out for testing.
inTraining <- createDataPartition(data$cut.score, p = .80, list = FALSE)
training <- data[ inTraining,]
testing  <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated ten times
                           repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score ~ ., data = training,
                 method = "gbm",
                 trControl = fitControl,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)
gbmFit1
head(summary(gbmFit1))
# Predict the held-out rows and correlate predicted with measured score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
#
# -> Next: try the 20 bp window (positions p5-p25 of the one-hot and thermal-tensor features) with the same sample set.
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R session: caret GBM on one-hot + thermal-tensor + raw features, using the reduced
# column ranges for the position-dependent one-hot and thermal-tensor groups.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.fullseq.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", header=T, sep="\t")
df.dcast <- na.omit(df.dcast)
#df.dcast[is.na(df.dcast)] <- 0
#colnames(df.dcast)
# Feature groups by hard-coded column position; column 2 (cut.score, the response) is
# the first column of every group.
# NOTE(review): re-check the positions with colnames(df.dcast) if the file is rebuilt.
df.raw <- df.dcast[,c(2,28,44,2386:2389,2404,2421)]
df.dwt <- df.dcast[,c(2,14:27,30:43,2372:2385,2390:2403,2407:2420)]
df.onehot.ind <- df.dcast[,c(2,3:13,29,45:47,2405:2406,2422:2424)]
df.onehot.dep <- df.dcast[,c(2,68:87,150:169,232:251,314:333,396:403,466:469,532:535,598:601,664:667,730:733,858:881,944:947,1010:1013,1076:1079,1142:1145,1208:1211,1962:1981,2044:2063,2126:2145,2208:2227,2290:2309)]
df.thermal.sgRNA <- df.dcast[,c(2,88:149,170:231,252:313,334:395,404:465,470:531,536:597,602:663,668:729,734:795,882:943,948:1009,1014:1075,1080:1141,1146:1207,1212:1273,1982:2043,2064:2125,2146:2207,2228:2289,2310:2371)]
# onehot
#data = cbind(df.onehot.ind, df.onehot.dep)
#data = cbind(df.thermal.sgRNA)
# Keep the response exactly once: extra cut.score copies may be renamed
# (cut.score.1, ...) when rows are subset below and would then enter `cut.score ~ .`
# as predictors.
data = cbind(df.onehot.ind,
             df.onehot.dep[, -1, drop = FALSE],
             df.thermal.sgRNA[, -1, drop = FALSE],
             df.raw[, -1, drop = FALSE])
library(caret)
set.seed(998)
# 80 % of the rows for training, the remainder held out for testing.
inTraining <- createDataPartition(data$cut.score, p = .80, list = FALSE)
training <- data[ inTraining,]
testing  <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated ten times
                           repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score ~ ., data = training,
                 method = "gbm",
                 trControl = fitControl,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)
gbmFit1
head(summary(gbmFit1))
# Predict the held-out rows and correlate predicted with measured score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
#
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J caret.rf
#SBATCH -N 4
#SBATCH -t 10:00:00
# Batch script: runs the caret random-forest R script of every feature set, one after
# another. A failing R script does not stop the remaining ones.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
for feature_set in raw dwt onehot.ind onehot.dep onehot thermal raw.onehot raw.onehot.thermal all; do
  R CMD BATCH "caret.rf.sgRNAfullseq.${feature_set}.R"
done
# Submit with:
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/caret.rf.sgRNAfullseq.sh
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R session: caret conditional random forest (party::cforest) on the one-hot features.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.fullseq.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", header=T, sep="\t")
df.dcast <- na.omit(df.dcast)
#df.dcast[is.na(df.dcast)] <- 0
#colnames(df.dcast)
# Feature groups by hard-coded column position; column 2 (cut.score, the response) is
# the first column of every group.
# NOTE(review): re-check the positions with colnames(df.dcast) if the file is rebuilt.
df.raw <- df.dcast[,c(2,28,44,2386:2389,2404,2421)]
df.dwt <- df.dcast[,c(2,14:27,30:43,2372:2385,2390:2403,2407:2420)]
df.onehot.ind <- df.dcast[,c(2,3:13,29,45:47,2405:2406,2422:2424)]
df.onehot.dep <- df.dcast[,c(2,48:87,150:169,232:251,314:333,396:403,466:469,532:535,598:601,664:667,730:733,858:881,944:947,1010:1013,1076:1079,1142:1145,1208:1211,1274:1277,1340:1343,1406:1409,1472:1475,1600:1623,1686:1689,1752:1755,1880:1899,1962:1981,2044:2063,2126:2145,2208:2227,2290:2309)]
df.thermal.sgRNA <- df.dcast[,c(2,88:149,170:231,252:313,334:395,404:465,470:531,536:597,602:663,668:729,734:857,882:943,948:1009,1014:1075,1080:1141,1146:1207,1212:1273,1278:1339,1344:1405,1410:1471,1476:1599,1624:1685,1690:1751,1756:1879,1900:1961,1982:2043,2064:2125,2146:2207,2228:2289,2310:2371)]
# onehot
# Keep the response exactly once: an extra cut.score copy may be renamed (cut.score.1)
# when rows are subset below and would then enter `cut.score ~ .` as a predictor.
data = cbind(df.onehot.ind, df.onehot.dep[, -1, drop = FALSE])
library(caret)
library(party)
set.seed(998)
# 75 % of the rows for training, the remainder held out for testing.
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing  <- data[-inTraining,]
# Fixed seeds handed to trainControl(): a list of nrow(training) + 1 elements,
# each holding 1:20.
seeds <- vector(mode = "list", length = nrow(training) + 1)
seeds <- lapply(seeds, function(x) 1:20)
rctrl1 <- trainControl(method = "cv", number = 5, returnResamp = "all", seeds = seeds)
set.seed(849)
rfFit1 <- train(cut.score ~ ., data = training,
                method = "cforest",
                trControl = rctrl1,
                preProc = c("center", "scale"),
                controls = party::cforest_unbiased(ntree = 20))
rfFit1
# Predict the held-out rows and correlate predicted with measured score.
pred <- predict(rfFit1, testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
#
library(ggplot2)
pdf("rfFit.onehot.cor.pdf")
ggplot(pred.df, aes(x=pred.score, y=exp.score)) + geom_point() + theme_classic()
dev.off()
# A caret train object is not a table, so write.table() cannot serialise it; write its
# resampling summary (one row per tuning-parameter combination) instead.
write.table(rfFit1$results, "rfFit.onehot.txt", quote=F, row.names=F)
# -> Next: try the 20 bp window (positions p5-p25 of the one-hot and thermal-tensor features) with the same sample set.
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R session: caret conditional random forest (party::cforest) on one-hot +
# thermal-tensor + raw features, using the reduced column ranges for the
# position-dependent one-hot and thermal-tensor groups.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.fullseq.DWT.raw.onehot.sgRNAraw.thermaltensor.dcast.txt", header=T, sep="\t")
df.dcast <- na.omit(df.dcast)
#df.dcast[is.na(df.dcast)] <- 0
#colnames(df.dcast)
# Feature groups by hard-coded column position; column 2 (cut.score, the response) is
# the first column of every group.
# NOTE(review): re-check the positions with colnames(df.dcast) if the file is rebuilt.
df.raw <- df.dcast[,c(2,28,44,2386:2389,2404,2421)]
df.dwt <- df.dcast[,c(2,14:27,30:43,2372:2385,2390:2403,2407:2420)]
df.onehot.ind <- df.dcast[,c(2,3:13,29,45:47,2405:2406,2422:2424)]
df.onehot.dep <- df.dcast[,c(2,68:87,150:169,232:251,314:333,396:403,466:469,532:535,598:601,664:667,730:733,858:881,944:947,1010:1013,1076:1079,1142:1145,1208:1211,1962:1981,2044:2063,2126:2145,2208:2227,2290:2309)]
df.thermal.sgRNA <- df.dcast[,c(2,88:149,170:231,252:313,334:395,404:465,470:531,536:597,602:663,668:729,734:795,882:943,948:1009,1014:1075,1080:1141,1146:1207,1212:1273,1982:2043,2064:2125,2146:2207,2228:2289,2310:2371)]
# onehot
#data = cbind(df.onehot.ind, df.onehot.dep)
#data = cbind(df.thermal.sgRNA)
# Keep the response exactly once: extra cut.score copies may be renamed
# (cut.score.1, ...) when rows are subset below and would then enter `cut.score ~ .`
# as predictors.
data = cbind(df.onehot.ind,
             df.onehot.dep[, -1, drop = FALSE],
             df.thermal.sgRNA[, -1, drop = FALSE],
             df.raw[, -1, drop = FALSE])
library(caret)
library(party)
set.seed(998)
# 75 % of the rows for training, the remainder held out for testing.
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing  <- data[-inTraining,]
# Fixed seeds handed to trainControl(): a list of nrow(training) + 1 elements,
# each holding 1:20.
seeds <- vector(mode = "list", length = nrow(training) + 1)
seeds <- lapply(seeds, function(x) 1:20)
rctrl1 <- trainControl(method = "cv", number = 5, returnResamp = "all", seeds = seeds)
set.seed(849)
rfFit1 <- train(cut.score ~ ., data = training,
                method = "cforest",
                trControl = rctrl1,
                preProc = c("center", "scale"),
                controls = party::cforest_unbiased(ntree = 20))
rfFit1
# Predict the held-out rows and correlate predicted with measured score.
pred <- predict(rfFit1, testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
### dataset --> DataS4... save each sheet as a dataframe, add column declaring Cas9 type, intersect with DataS1 for sequence, create new sgRNAID using both the ID and Cas9 type, merge files
# R session. The setwd() paths below are local /Users/... paths, unlike the cluster
# environment lines above.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli")
seq <- read.delim("DataS1.txt", header=T, sep="\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/DataS4.tables")
Cas9 <- read.delim("DataS4.Cas9.txt", header=T, sep="\t")
eSpCas9 <- read.delim("DataS4.eSpCas9.txt", header=T, sep="\t")
recAcas9 <- read.delim("DataS4.recACas9.txt", header=T, sep="\t")
# > nrow(seq)
# [1] 55671
# > nrow(Cas9)
# [1] 44163
# > nrow(eSpCas9)
# [1] 45071
# > nrow(recAcas9)
# [1] 48112
library(dplyr)
library(tidyr)
# Attach the DataS1 columns to every row of each DataS4 table by sgRNAID
# (rows without a match keep NA in the added columns).
Cas9.seq <- left_join(Cas9, seq, by="sgRNAID")
eSpCas9.seq <- left_join(eSpCas9, seq, by="sgRNAID")
recAcas9.seq <- left_join(recAcas9, seq, by="sgRNAID")
# Replace sgRNAID by "<sgRNAID>_<type>" so the same guide stays distinguishable
# across the three tables after stacking.
# NOTE(review): requires a `type` column in the joined tables -- presumably the
# "column declaring Cas9 type" mentioned above; confirm it exists in the files.
Cas9.seq.id <- Cas9.seq %>% unite(sgRNAID, c(sgRNAID, type), sep="_")
eSpCas9.seq.id <- eSpCas9.seq %>% unite(sgRNAID, c(sgRNAID, type), sep="_")
recAcas9.seq.id <- recAcas9.seq %>% unite(sgRNAID, c(sgRNAID, type), sep="_")
# Stack the three tables.
df <- rbind(Cas9.seq.id, eSpCas9.seq.id)
df2 <- rbind(df, recAcas9.seq.id)
# 137346
# Drop every row containing an NA in any column.
df.na <- na.omit(df2)
# 126182
write.table(df.na, "Ecoli.allCas9.txt", quote=F, row.names=F, sep="\t")
# Write one FASTA record per data row of Ecoli.allCas9.txt (header row skipped):
# ">" + field 1 as the record name, field 3 as the sequence.
awk 'NR > 1 { print ">"$1"\n"$3 }' Ecoli.allCas9.txt > Ecoli.allCas9.fasta
# Copy both files from the local DataS4.tables directory to the cluster:
# cd /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/DataS4.tables
# scp Ecoli.allCas9.txt noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/.
# scp Ecoli.allCas9.fasta noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/.
### melting temp
# count nucleotides (the statements after `python` are typed into the interpreter)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
# Count A/C/G/T per record of Ecoli.allCas9.fasta and write one TSV row per record:
# name, the four counts, the sequence length and the C+G fraction.
# The loop body is indented again (it had lost its indentation and could not be run as
# written) and both files are managed by `with`, so they are closed even if a record
# fails. Only upper-case A/C/G/T are counted, as before.
from Bio import SeqIO
with open('Ecoli.allCas9.fasta', 'r') as input_file, \
        open('Ecoli.allCas9_nuc_counts.tsv', 'w') as output_file:
    # The last column is headed 'CG%' but holds a fraction between 0 and 1.
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        # An empty record would otherwise raise ZeroDivisionError; report 0.0 for it.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)

exit()
# Melting temperature (°C) = 64.9 + 41 * (nG + nC - 16.4) / (nA + nT + nG + nC)
# R session: append a MeltingTemp column, computed per row from the A/C/G/T counts,
# to the nucleotide-count table and write the result.
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
nuc.counts <- read.delim("Ecoli.allCas9_nuc_counts.tsv", header=TRUE, sep="\t")
nuc.counts$MeltingTemp <- with(nuc.counts, 64.9 + 41 * (G + C - 16.4) / (A + T + G + C))
write.table(nuc.counts, "Ecoli.allCas9.nuc.count.txt", quote=FALSE, row.names=FALSE, sep="\t")
q()
### structure
# Run iFeature with descriptor type AAC on the sgRNA FASTA and write the result to
# Ecoli.allCas9.structure.txt.
# NOTE(review): AAC is presumably iFeature's amino-acid-composition descriptor, while
# the input here is nucleotide sequence -- confirm this is the intended descriptor.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py --file Ecoli.allCas9.fasta --type AAC --out Ecoli.allCas9.structure.txt
### onehot encoding
# import os, sys
# import numpy as np
#
# onehot_dict = {
# 'A': '1000',
# 'C': '0100',
# 'T': '0010',
# 'G': '0001',
# 'AA': '1000000000000000',
# 'AC': '0100000000000000',
# 'AT': '0010000000000000',
# 'AG': '0001000000000000',
# 'CA': '0000100000000000',
# 'CC': '0000010000000000',
# 'CT': '0000001000000000',
# 'CG': '0000000100000000',
# 'TA': '0000000010000000',
# 'TC': '0000000001000000',
# 'TT': '0000000000100000',
# 'TG': '0000000000010000',
# 'GA': '0000000000001000',
# 'GC': '0000000000000100',
# 'GT': '0000000000000010',
# 'GG': '0000000000000001',
# }
#
# # open input and output files
# input_path = sys.argv[1]
# input_file = open(input_path, 'r')
# dep1_file = open(input_path[:-4]+'_dependent1.txt', 'w')
# dep2_file = open(input_path[:-4]+'_dependent2.txt', 'w')
# indep1_file = open(input_path[:-4]+'_independent1.txt', 'w')
# indep2_file = open(input_path[:-4]+'_independent2.txt', 'w')
#
# # loop over nucleotide sequences
# for idx, line in enumerate(input_file):
#
# # if first iteration, write title line
# if idx == 0:
#
# dep1_file.writelines(line+': first-order position-dependent features'+ '\n')
# dep2_file.writelines(line+': second-order position-dependent features'+ '\n')
# indep1_file.writelines(line+': first-order position-independent features'+ '\n')
# indep2_file.writelines(line+': second-order position-independent features'+ '\n')
#
# # otherwise encode sequence
# else:
#
# # split line by tab
# line = line.split('\t')
#
# # extract sequence (also remove \n)
# seq = line[-1][:-1]
#
# # compute position-dependent features as one-hot vectors
# pos_dep1 = ''.join([onehot_dict[seq[i]] for i in range(len(seq))])
# pos_dep2 = ''.join([onehot_dict[seq[i:i+2]] for i in range(len(seq)-1)])
#
# # compute position-independent features as sum over position-dependent features
# pos_indep1 = list(np.array([int(o) for o in pos_dep1]).reshape([-1, 4]).sum(axis=0))
# pos_indep2 = list(np.array([int(o) for o in pos_dep2]).reshape([-1, 16]).sum(axis=0))
# pos_indep1 = ''.join([str(p) for p in pos_indep1])
# pos_indep2 = ''.join([str(p) for p in pos_indep2])
#
# # write features to file
# dep1_file.writelines(line[0] + '\t' + pos_dep1 + '\n')
# dep2_file.writelines(line[0] + '\t' + pos_dep2 + '\n')
# indep1_file.writelines(line[0] + '\t' + pos_indep1 + '\n')
# indep2_file.writelines(line[0] + '\t' + pos_indep2 + '\n')
#
# if idx % 10000 == 0:
# print('{0:,}'.format(idx)+' lines processed...')
#
# print('Done!')
#
# input_file.close()
# dep1_file.close()
# dep2_file.close()
# indep1_file.close()
# indep2_file.close()
#python path/to/encode_sequences.py path/to/data.txt
# One-hot encode the sgRNA sequences and turn three of the four encoder outputs into
# space-separated tables with one column per digit.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# Keep fields 1 and 3 of Ecoli.allCas9.txt as the encoder input.
cut -f 1,3 Ecoli.allCas9.txt > Ecoli.allCas9.noscore.txt
python encode_sequences.py Ecoli.allCas9.noscore.txt
# separate nucleotide sequence values into individual columns in data frame so each position counts as one feature
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# Each pipeline: drop the first two lines, put a space after every character of
# field 2, insert the header row, and cut to ID + the expected number of columns
# (which also removes the empty field left by the trailing space).
# NOTE(review): if encode_sequences.py matches the commented-out listing above, the
# position-independent counts are joined without a separator; any count >= 10 is then
# split into two digits here and shifts the following columns -- verify the ind1/ind2
# outputs.
sed '1d' Ecoli.allCas9.noscore_independent1.txt | sed '1d' | awk '{gsub(/./,"& ",$2);print $0}' | sed '1i sgRNAID A C T G' | cut -d ' ' -f 1-5 > Ecoli.allCas9_ind1.txt
sed '1d' Ecoli.allCas9.noscore_independent2.txt | sed '1d' | awk '{gsub(/./,"& ",$2);print $0}' | sed '1i sgRNAID AA AC AT AG CA CC CT CG TA TC TT TG GA GC GT GG' | cut -d ' ' -f 1-17 > Ecoli.allCas9_ind2.txt
# First-order position-dependent features: 20 positions x 4 bases = 80 columns + ID.
sed '1d' Ecoli.allCas9.noscore_dependent1.txt | sed '1d' | awk '{gsub(/./,"& ",$2);print $0}' | sed '1i sgRNAID p1.A p1.C p1.T p1.G p2.A p2.C p2.T p2.G p3.A p3.C p3.T p3.G p4.A p4.C p4.T p4.G p5.A p5.C p5.T p5.G p6.A p6.C p6.T p6.G p7.A p7.C p7.T p7.G p8.A p8.C p8.T p8.G p9.A p9.C p9.T p9.G p10.A p10.C p10.T p10.G p11.A p11.C p11.T p11.G p12.A p12.C p12.T p12.G p13.A p13.C p13.T p13.G p14.A p14.C p14.T p14.G p15.A p15.C p15.T p15.G p16.A p16.C p16.T p16.G p17.A p17.C p17.T p17.G p18.A p18.C p18.T p18.G p19.A p19.C p19.T p19.G p20.A p20.C p20.T p20.G' | cut -d ' ' -f 1-81 > Ecoli.allCas9_dep1.txt
# Second-order position-dependent one-hot features of the 20-nt sgRNAs: expand the
# digit string in field 2 into one space-separated column per digit and put a header
# row on top.
#
# Two fixes compared with the hand-typed header:
# 1. The header text was broken by a literal newline after "p19.AC". A one-line
#    `sed '1i text'` ends at the newline, so the rest of the header was parsed as a
#    new sed command instead of being inserted. The header is now generated.
# 2. A 20-nt sequence has 19 overlapping dinucleotides (the encoder listing above
#    iterates over range(len(seq)-1)), i.e. 19 x 16 = 304 digits. The header and the
#    cut range covered 20 positions (321 fields), which left 16 p20.* columns without
#    data. Header and cut now cover p1..p19 (305 fields), the same rule as the 32-nt
#    table, which keeps 31 x 16 + 1 = 497 fields.
# Column order is unchanged: first base A C T G, second base A C T G.
dep2_header="sgRNAID"
for pos in {1..19}; do
  for first_base in A C T G; do
    for second_base in A C T G; do
      dep2_header+=" p${pos}.${first_base}${second_base}"
    done
  done
done
# sed '1,2d' drops the two title lines (same as the former `sed '1d' | sed '1d'`);
# cut also removes the empty field left by awk's trailing space.
# Note: the header is printed with printf, so it is written even when the input has
# no data rows.
{
  printf '%s\n' "${dep2_header}"
  sed '1,2d' Ecoli.allCas9.noscore_dependent2.txt | awk '{gsub(/./,"& ",$2);print $0}'
} | cut -d ' ' -f 1-305 > Ecoli.allCas9_dep2.txt
#### chemical tensors
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
sed '1d' Ecoli.allCas9.noscore.txt | awk '{gsub(/./,"& ",$2);print $0}' | sed '1i sgRNAID p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 p16 p17 p18 p19 p20' | cut -d ' ' -f 1-21 > Ecoli.allCas9.sequence.txt
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
# Attach per-base chemical descriptor values to every position of every sgRNA
# and write the result in wide (one row per sgRNA) and long form.
#
# Descriptor table: column 1 is used as the row label, columns 2:5 are the
# per-base values (see the NOTE on base order below).
tensor <- read.delim("protein_rna_dna-vector_lee_nucleotide_dna_data.txt", header=T, sep="\t", stringsAsFactors = F)
# Per-position base table (sgRNAID, p1..p20). Every column is read as
# character: with automatic type conversion a position column that happened to
# contain only "T" would be parsed as logical TRUE and no longer match `base`.
# Named sgrna.seq rather than `seq` so base::seq() is not shadowed.
sgrna.seq <- read.delim("Ecoli.allCas9.sequence.txt", header=T, sep=" ", colClasses="character")
# Row labels become the column names of the transposed table below.
rownames(tensor) <- tensor[,1]
tensor.df <- tensor[,2:5]
# Transpose so that each row is one base and each column one descriptor.
tensor.t <- as.data.frame(t(tensor.df))
# NOTE(review): assumes columns 2:5 of the descriptor file are ordered
# A, C, G, T -- confirm against the file header.
tensor.t$base <- c("A", "C", "G", "T")
# Long form: one row per (sgRNA, position) holding the base at that position.
seq.melt <- melt(sgrna.seq, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# Look up the descriptor values of each base; bases without a match get NA.
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# Wide form: one column per position/descriptor combination.
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "Ecoli.allCas9.tensors.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "Ecoli.allCas9.tensors.melt.txt", quote=F, row.names=F, sep="\t")
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Per-sgRNA input tables (tab-separated, first line is the header).
structure <- read.delim("Ecoli.allCas9.structure.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
nuc <- read.delim("Ecoli.allCas9.nuc.count.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
score <- read.delim("Ecoli.allCas9.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
# First two score columns, relabelled as ID and response.
score.df <- score[, 1:2]
names(score.df) <- c("sgRNAID", "cut.score")
# One signal vector per feature, selected by column position.
# NOTE(review): columns 7 and 8 of the nuc table are taken to be GC content and
# melting temperature going by the variable names -- confirm against the header.
structure.df <- structure[[2]]
gc.df <- nuc[[7]]
temp.df <- nuc[[8]]
# make CWT figure
# Scales 1, 51, ..., 1951 are shared by all three transforms and by the plots.
scales <- seq(from=1, to=2000, by=50)
wCoefs.temp <- cwt(temp.df, scales=scales, wavelet='mexh')
wCoefs.gc <- cwt(gc.df, scales=scales, wavelet='mexh')
wCoefs.structure <- cwt(structure.df, scales=scales, wavelet='mexh')
# Draw one coefficient heat map: x = position in the signal, y = scale.
draw.cwt.panel <- function(signal, coefs, title) {
  image(1:length(signal), scales, coefs, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main=title)
}
# Three stacked panels in a single PDF.
pdf(file='features.sgRNA.cwt.pdf')
par(mfrow=c(3,1))
draw.cwt.panel(temp.df, wCoefs.temp, 'CWT sgRNA Melting Temp')
draw.cwt.panel(gc.df, wCoefs.gc, 'CWT GC Content coefficients')
draw.cwt.panel(structure.df, wCoefs.structure, 'CWT sgRNA Secondary Structure')
dev.off()
# Run DWT instead of CWT
# Decompose one per-sgRNA signal with wavMODWT() and return it in long form.
# The row names of the coefficient matrix are split on runs of
# non-alphanumeric characters into `scale` and `window`, and the coefficient
# column is named `value.name`.
modwt.long <- function(signal, value.name) {
  coefs <- as.matrix(wavMODWT(signal))
  labelled <- data.frame(label = row.names(coefs), coefs)
  long <- labelled %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
  colnames(long) <- c("scale", "window", value.name)
  long
}
temp.modwt.name <- modwt.long(temp.df, "temp.dwt")
gc.modwt.name <- modwt.long(gc.df, "gc.dwt")
structure.modwt.name <- modwt.long(structure.df, "structure.dwt")
# Map every sgRNA to its zero-based row index in score.df, stored as character
# so it can be joined against the `window` labels parsed above.
# NOTE(review): assumes the MODWT window labels are zero-based row positions in
# the same order as score.df -- confirm.
window <- data.frame(sgRNAID = score.df[,1],
                     window = as.character(seq.int(nrow(score.df)) - 1),
                     stringsAsFactors = FALSE)
library(tidygenomics)
# Attach the window index to the scores, then the three coefficient tables
# (one row per sgRNA and scale after the first MODWT join).
window.score.df <- left_join(score.df, window, by=c("sgRNAID"))
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
# Keep only sgRNAs that have a cut score (is.na() instead of comparing against
# the string "NA").
window.temp.gc.structure.sgRNA <- window.temp.gc.structure[!is.na(window.temp.gc.structure$cut.score), ]
# Columns 1:2 and 4:7 = sgRNAID, cut.score, scale and the three coefficient
# columns; column 3 (window) is dropped before melting.
df.melt <- melt(window.temp.gc.structure.sgRNA[,c(1:2,4:7)], id=c("cut.score", "scale", "sgRNAID"))
df <- na.omit(df.melt)
# add sgRNA raw data
# Rebuild the three per-sgRNA signals as one-column data frames so they can be
# joined by sgRNAID; each is tagged with scale "sgRNA.raw" to mark it as an
# untransformed value.
structure.df <- data.frame(structure[,2])
gc.df <- data.frame(nuc[,7])
temp.df <- data.frame(nuc[,8])
structure.df$scale <- "sgRNA.raw"
gc.df$scale <- "sgRNA.raw"
temp.df$scale <- "sgRNA.raw"
# IDs come from column 1 of the table each signal was read from.
structure.df$sgRNAID <- structure[,1]
gc.df$sgRNAID <- nuc[,1]
temp.df$sgRNAID <- nuc[,1]
# Join order fixes the column order that the positional rename below relies on:
# structure first (adds its value and `scale`), then temp, then gc.
window.score.structure <- left_join(window.score.df, structure.df, by="sgRNAID")
window.score.structure.temp <- left_join(window.score.structure, temp.df, by=c("sgRNAID", "scale"))
window.score.structure.temp.gc <- left_join(window.score.structure.temp, gc.df, by=c("sgRNAID", "scale"))
# Positional rename of all seven columns. NOTE(review): column 3 appears to be
# the `window` index carried over from window.score.df; it is labelled "seq"
# here and is dropped again before melting below -- confirm.
colnames(window.score.structure.temp.gc) <- c("sgRNAID", "cut.score", "seq", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
## add one-hot encoding of sequence
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Four space-separated one-hot tables, each keyed by sgRNAID.
onehot.ind1 <- read.delim("Ecoli.allCas9_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("Ecoli.allCas9_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("Ecoli.allCas9_dep1.txt", header=T, sep=" ")
onehot.dep2 <- read.delim("Ecoli.allCas9_dep2.txt", header=T, sep=" ")
# Keep only the first 305 columns of dep2 (the ID plus 304 feature columns).
onehot.dep2 <- onehot.dep2[,1:305]
# Merge the four tables on sgRNAID, keeping IDs present in any of them.
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
# Same scale tag as the raw features so both can be joined on (sgRNAID, scale).
onehot$scale <- "sgRNA.raw"
data.onehot <- left_join(window.score.structure.temp.gc, onehot, by=c("sgRNAID", "scale"))
# Drop column 3 ("seq") and melt everything else into long form.
df2.melt <- melt(data.onehot[,c(1,2,4:ncol(data.onehot))], id=c("cut.score", "scale", "sgRNAID"))
df2 <- na.omit(df2.melt)
# Give df (the long table already held in `df` at this point in the session)
# explicit column names, then stack the raw/one-hot rows underneath it.
colnames(df) <- c("cut.score", "scale", "sgRNAID", "variable", "value")
df <- rbind(df, df2)
# Fuse feature name and scale into a single feature.scale label (no separator).
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# Force values to numeric and drop rows whose value is NA or empty.
df.id$value <- as.numeric(df.id$value)
df.id <- df.id[!(is.na(df.id$value) | df.id$value==""), ]
colnames(df.id) <- c("cut.score", "feature.scale", "sgRNAID", "value")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Append the long-form chemical tensor features to df.id (the long table of
# DWT/raw/one-hot features held in the session), then cast everything to one
# row per sgRNA.
tensor <- read.delim("Ecoli.allCas9.tensors.melt.txt", header=T, sep="\t")
# Missing values are replaced by 0 before the feature label is built.
tensor[is.na(tensor)] <- 0
tensor$scale <- "raw"
# Feature label = position + descriptor name + scale tag, with no separator.
tensor.id <- tensor %>% unite(feature.scale, c(position, variable, scale), sep = "")
tensor.id$value <- as.numeric(tensor.id$value)
tensor.id[is.na(tensor.id)] <- 0
# One (cut.score, sgRNAID) pair per sgRNA, used to attach the score to every
# tensor row; the inner join keeps only sgRNAs present in df.id.
df.score <- unique(df.id[,c(1,3)])
tensor.score <- inner_join(tensor.id, df.score, by="sgRNAID")
# Reorder by position so the columns line up with df.id for rbind()
# (presumably cut.score, feature.scale, sgRNAID, value -- compare the two
# head() outputs below before stacking).
tensor.score.order <- tensor.score[,c(5,2,1,4)]
head(df.id)
head(tensor.score.order)
tensor.df <- rbind(df.id, tensor.score.order)
write.table(tensor.df, "Ecoli.allCas9.DWT.raw.onehot.tensor.txt", quote=F, row.names=F, sep="\t")
# Wide table: one row per (sgRNAID, cut.score), one column per feature;
# duplicate cells are averaged.
df.dcast <- tensor.df %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
write.table(df.dcast, "Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast)
# 126182
df.dcast.na <- na.omit(df.dcast)
# Write the NA-filtered table to the ".na" file (previously the unfiltered
# df.dcast was written here, so the file still contained the incomplete row).
write.table(df.dcast.na, "Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.na.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast.na)
# 126181
# Summit
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Split the wide feature table into the X (features) and Y (score) files read
# by the iRF set-up script, with and without the sample-ID column.
df <- na.omit(read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.na.txt", header=TRUE, sep="\t"))
# Tab-separated output without quotes or row names, as used for every file below.
write.tsv <- function(x, file) write.table(x, file, quote=F, row.names=F, sep="\t")
# All output goes to the iRF run directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run")
## sample ID
# Column 1 = sgRNAID, column 2 = cut.score, columns 3.. = features.
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
# The same two tables are written under both the "_overlap" and the plain names.
for (suffix in c("_overlap", "")) {
  write.tsv(df.features, paste0("Ecoli.allCas9.features", suffix, ".txt"))
  write.tsv(df.score, paste0("Ecoli.allCas9.score", suffix, ".txt"))
}
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- df[,c(3:ncol(df))]
df.score <- data.frame(cut.score = df[,2])
write.tsv(df.features, "Ecoli.allCas9.features_overlap_noSampleIDs.txt")
write.tsv(df.score, "Ecoli.allCas9.score_overlap_noSampleIDs.txt")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
# -p creates both levels in one call and does not fail when this section is
# re-run after the directories already exist.
mkdir -p Ecoli.allCas9/all.features
module load python/3.7-anaconda3
# The set-up script is run from inside the run directory; its two positional
# arguments are the feature table and the score table.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Submits/submit_full_Ecoli.allCas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Submits/submit_train_Ecoli.allCas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Submits/submit_test_Ecoli.allCas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
# Post-processing is run from inside the run directory; the last argument is
# the run name given to the set-up script above.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Ecoli.allCas9
# pearson correlation
# Held-out predictions vs. observed cut scores for the all-features run
# (fold9 / Set4); the two files are compared row by row.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cut.score/foldRuns/fold9/Runs/Set4")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=TRUE, sep="\t")
pred <- read.delim("Ecoli.allCas9_Set4_test.prediction", header=TRUE, sep="\t")
observed <- y$cut.score
predicted <- pred$Predictions.
cor(observed, predicted)
# 0.7133888
# spearman correlation
cor(observed, predicted, method="spearman")
# 0.6882686
–> split the data by features
# feature columns (1-based column numbers in Ecoli.allCas9.features.txt)
# sgRNAID = 1
# ind onehot = 2:12,30:33,1618:1619,1637:1639
# dep onehot = 34:69,132:147,210:225,288:303,366:381,444:459,522:537,600:615,678:693,756:771,896:915,1040:1059,1122:1141,1204:1223,1286:1301,1364:1379,1442:1457,1520:1535
# dwt = 13:29,1601:1617,1620:1636
# tensor = 70:131,148:209,226:287,304:365,382:443,460:521,538:599,616:677,694:755,772:895,916:1039,1060:1121,1142:1203,1224:1285,1302:1363,1380:1441,1458:1519,1536:1597
# raw = 1598:1600
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run

# Column lists in cut(1) syntax, one per feature family, defined once so the
# three output files of a family cannot drift apart.
# The tensor list previously contained 1050-1121, which overlapped the one-hot
# block 1040-1059 and leaked ten one-hot columns into the tensor files; with
# 1060-1121 the families partition columns 2..1639 exactly.
raw_cols=1598-1600
dwt_cols=13-29,1601-1617,1620-1636
ind_onehot_cols=2-12,30-33,1618-1619,1637-1639
dep_onehot_cols=34-69,132-147,210-225,288-303,366-381,444-459,522-537,600-615,678-693,756-771,896-915,1040-1059,1122-1141,1204-1223,1286-1301,1364-1379,1442-1457,1520-1535
onehot_cols="${ind_onehot_cols},${dep_onehot_cols}"
tensor_cols=70-131,148-209,226-287,304-365,382-443,460-521,538-599,616-677,694-755,772-895,916-1039,1060-1121,1142-1203,1224-1285,1302-1363,1380-1441,1458-1519,1536-1597

#######################################
# Write the three iRF input files for one feature family.
# Reads Ecoli.allCas9.features.txt from the current directory. cut prints the
# selected columns in file order, whatever order the list is written in.
# Arguments: $1 - family label used in the output file names
#            $2 - cut(1) column list of the family's feature columns
# Outputs:   Ecoli.allCas9.<label>.features.txt                      (ID + features)
#            Ecoli.allCas9.<label>.features_overlap.txt              (ID + features)
#            Ecoli.allCas9.<label>.features_overlap_noSampleIDs.txt  (features only)
#######################################
write_feature_set() {
  local label=$1
  local cols=$2
  cut -f "1,${cols}" Ecoli.allCas9.features.txt > "Ecoli.allCas9.${label}.features.txt"
  cut -f "1,${cols}" Ecoli.allCas9.features.txt > "Ecoli.allCas9.${label}.features_overlap.txt"
  cut -f "${cols}" Ecoli.allCas9.features.txt > "Ecoli.allCas9.${label}.features_overlap_noSampleIDs.txt"
}

write_feature_set raw "${raw_cols}"
write_feature_set dwt "${dwt_cols}"
write_feature_set onehot "${onehot_cols}"
write_feature_set tensor "${tensor_cols}"
write_feature_set raw.onehot "${raw_cols},${onehot_cols}"
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
#
# The same steps are run for five feature families (raw, dwt, onehot, tensor,
# raw.onehot). Each family has its own run directory <family>.features, run
# name Ecoli.allCas9.<family> and feature table Ecoli.allCas9.<family>.features.txt;
# the score table is shared. Every section below is self-contained so it can be
# pasted into a fresh login session on the machine named in its heading.
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9
# -p: do not fail when a directory is left over from an earlier attempt.
mkdir -p raw.features dwt.features onehot.features tensor.features raw.onehot.features
module load python/3.7-anaconda3
for feature_set in raw dwt onehot tensor raw.onehot; do
  # The set-up script is run from inside the run directory; skip the family
  # rather than run it in the wrong directory if cd fails.
  cd "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/${feature_set}.features" || continue
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
    --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 60 --Account SYB105 \
    --NumTrees 1000 --NumIterations 5 --RunName "Ecoli.allCas9.${feature_set}" \
    --bypass --Prediction \
    "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.${feature_set}.features.txt" \
    /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.txt
done
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
module load python/3.7.0-anaconda3-5.3.0
# full
for feature_set in dwt raw onehot tensor raw.onehot; do
  bsub "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/${feature_set}.features/Submits/submit_full_Ecoli.allCas9.${feature_set}_0.sh"
done
# train
for feature_set in dwt raw onehot tensor raw.onehot; do
  bsub "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/${feature_set}.features/Submits/submit_train_Ecoli.allCas9.${feature_set}_0.sh"
done
# once the train submissions are done run the test submissions
# test
for feature_set in dwt raw onehot tensor raw.onehot; do
  bsub "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/${feature_set}.features/Submits/submit_test_Ecoli.allCas9.${feature_set}_0.sh"
done
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9
module load python/3.7-anaconda3
for feature_set in dwt raw onehot tensor raw.onehot; do
  # Post-processing is run from inside the run directory; the last argument is
  # the run name given to the set-up script.
  cd "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/${feature_set}.features" || continue
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py \
    --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 \
    /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt \
    "Ecoli.allCas9.${feature_set}"
done
# correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions., method=c("pearson"))
# 0.1594092
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.156682
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.1594092
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.1953999
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.2022733
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.1597929
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/dwt.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.dwt_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions., method=c("pearson"))
# 0.1594092
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.156682
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/onehot.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.onehot_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions., method=c("pearson"))
# 0.3061615
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.3572923
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.7328605
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6809697
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.5033779
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/tensor.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.tensor_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions., method=c("pearson"))
# 0.3135828
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.3627849
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.7355619
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6896463
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.5092788
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.onehot.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.raw.onehot_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions., method=c("pearson"))
# 0.3068015
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.3580278
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.7324255
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6821168
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.5034997
–> test for each Cas9 type Spearman correlation coefficients of 0.542, 0.682 and 0.328 for Cas9, eSpCas9 and Cas9 (ΔrecA)
# Raw Features
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.1594092
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("spearman"))
# 0.156682
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.1953999
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("spearman"))
# 0.1899011
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.2022733
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("spearman"))
# 0.2043874
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.1597929
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("spearman"))
# 0.1323311
# Raw + Onehot
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.onehot.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.raw.onehot_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.3068602
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("spearman"))
# 0.3580532
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.7324255
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("spearman"))
# 0.7316238
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6821168
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("spearman"))
# 0.6801732
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.5034997
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("spearman"))
# 0.4910574
# Onehot
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/onehot.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.onehot_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.3061915
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.7328605
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6809697
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.5033779
# Tensor
# Tensor feature run, fold9/Set4: Pearson r between observed cut.score and
# the test-set predictions, overall and within each Cas9 variant.
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/tensor.features/cut.score/foldRuns/fold9/Runs/Set4")
id <- read.delim("set4_test_SampleIDs.txt", header = FALSE, sep = "\t")
names(id) <- "sgRNAID"
pred <- read.delim("Ecoli.allCas9.tensor_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
# IDs, predictions and observed scores are matched by row position.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
id.pred.y.group <- separate(id.pred.y, sgRNAID, into = c("sgRNA", "ID", "group"), sep = "_")
with(id.pred.y.group, cor(cut.score, Predictions., method = "pearson"))
# 0.3135828
pred.Cas9 <- subset(id.pred.y.group, group == "Cas9")
#nrow=8052
with(pred.Cas9, cor(cut.score, Predictions., method = "pearson"))
# 0.7355619
pred.eSpCas9 <- subset(id.pred.y.group, group == "eSpCas9")
#nrow=8336
with(pred.eSpCas9, cor(cut.score, Predictions., method = "pearson"))
# 0.6896463
pred.recACas9 <- subset(id.pred.y.group, group == "recACas9")
#nrow=8848
with(pred.recACas9, cor(cut.score, Predictions., method = "pearson"))
# 0.5092788
# DWT
# DWT feature run, fold9/Set4: Pearson r between observed cut.score and the
# test-set predictions, overall and within each Cas9 variant.
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/dwt.features/cut.score/foldRuns/fold9/Runs/Set4")
id <- read.delim("set4_test_SampleIDs.txt", header = FALSE, sep = "\t")
names(id) <- "sgRNAID"
pred <- read.delim("Ecoli.allCas9.dwt_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
# IDs, predictions and observed scores are matched by row position.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
id.pred.y.group <- separate(id.pred.y, sgRNAID, into = c("sgRNA", "ID", "group"), sep = "_")
with(id.pred.y.group, cor(cut.score, Predictions., method = "pearson"))
# 0.4887379 (spearman=0.4365595)
# Recorded within-variant correlations are all close to zero for this run.
pred.Cas9 <- subset(id.pred.y.group, group == "Cas9")
with(pred.Cas9, cor(cut.score, Predictions., method = "pearson"))
# 0.006071799
pred.eSpCas9 <- subset(id.pred.y.group, group == "eSpCas9")
with(pred.eSpCas9, cor(cut.score, Predictions., method = "pearson"))
# 0.001608477
pred.recACas9 <- subset(id.pred.y.group, group == "recACas9")
with(pred.recACas9, cor(cut.score, Predictions., method = "pearson"))
# 0.008588094
# All
# All-features run, fold9/Set4: Pearson r between observed cut.score and the
# test-set predictions (overall and per Cas9 variant), followed by three
# scatter-plot PDFs written into the Set4 directory.
library(tidyr)
library(ggplot2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cut.score/foldRuns/fold9/Runs/Set4")
id <- read.delim("set4_test_SampleIDs.txt", header = FALSE, sep = "\t")
names(id) <- "sgRNAID"
pred <- read.delim("Ecoli.allCas9_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
# IDs, predictions and observed scores are matched by row position.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
id.pred.y.group <- separate(id.pred.y, sgRNAID, into = c("sgRNA", "ID", "group"), sep = "_")
# 25236
with(id.pred.y.group, cor(cut.score, Predictions., method = "pearson"))
# 0.6551865 (spearman=0.6117463)
pred.Cas9 <- subset(id.pred.y.group, group == "Cas9")
with(pred.Cas9, cor(cut.score, Predictions., method = "pearson"))
# 0.4612111
pred.eSpCas9 <- subset(id.pred.y.group, group == "eSpCas9")
with(pred.eSpCas9, cor(cut.score, Predictions., method = "pearson"))
# 0.5312701
pred.recACas9 <- subset(id.pred.y.group, group == "recACas9")
with(pred.recACas9, cor(cut.score, Predictions., method = "pearson"))
# 0.304202
# Observed vs. predicted, coloured by variant: one combined panel, then two
# faceted-by-variant layouts.
scatter <- ggplot(id.pred.y.group, aes(x=cut.score, y=Predictions., color=group)) + geom_point() + theme_classic()
pdf("all.features.cor.plot.pdf")
print(scatter)
dev.off()
pdf("all.features.cor.plot.split.pdf")
print(scatter + facet_wrap(group ~ .))
dev.off()
pdf("all.features.cor.plot.split2.pdf")
print(scatter + facet_grid(group ~ ., scales = "free", space = "free"))
dev.off()
# Bar plots of the 25 largest feature scores for the all-features run:
# first from the fold9/Set4 importance file (score in V2), then from the
# normalized edge file (score in V3).  Uses ggplot2/tidyr already attached
# earlier in the session.
rotated.x.labels <- theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cut.score/foldRuns/fold9/Runs/Set4")
feature <- read.delim("Ecoli.allCas9_cut.score.importance4", header = FALSE, sep = " ")
feature.order <- feature[order(-feature$V2), ]
feature.order.top <- feature.order[seq_len(25), ]
# Keep the full feature name for the axis before V1 is split into two columns.
feature.order.top$id <- feature.order.top$V1
df <- separate(feature.order.top, V1, into = c("feature", "type"))
pdf("all.features.importance.pdf")
print(ggplot(df, aes(x = reorder(id, -V2), y = V2)) + geom_bar(stat = "identity") + theme_classic() + rotated.x.labels)
dev.off()
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/normalizedEdgeFiles")
feature <- read.delim("cut.score_Normalize.txt", header = FALSE, sep = "\t")
feature.order <- feature[order(-feature$V3), ]
feature.order.top <- feature.order[seq_len(25), ]
pdf("all.features.importance.normalized.pdf")
print(ggplot(feature.order.top, aes(x = reorder(V1, -V3), y = V3)) + geom_bar(stat = "identity") + theme_classic() + rotated.x.labels)
dev.off()
# Top-25 feature bar plot built from cut.score_Normalize.txt and written to
# all.features.importance.pdf inside normalizedEdgeFiles (the copy fetched by
# the scp line below).
library(tidyr)
library(ggplot2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/normalizedEdgeFiles")
# NOTE(review): the section just above reads this same file with sep="\t" and
# ranks by V3; here it is read with sep=" " and ranked by V2.  Only one of the
# two can match the file's real layout - confirm before trusting this plot.
feature <- read.delim("cut.score_Normalize.txt", header=F, sep=" ")
# Sort descending by V2 and keep the first 25 rows.
feature.order <- feature[order(-feature$V2),]
feature.order.top <- feature.order[1:25,]
# Preserve the unsplit name for the x axis; separate() replaces V1 with two columns.
feature.order.top$id <- feature.order.top$V1
df <- separate(feature.order.top, V1, c("feature", "type"))
pdf("all.features.importance.pdf")
ggplot(df, aes(x = reorder(id, -V2), y = V2)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
dev.off()
# Run from the laptop; the destination is quoted because it contains spaces and parentheses.
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/normalizedEdgeFiles/all.features.importance.pdf "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/."
# Observed vs. predicted scatter with both shape and colour mapped to the Cas9
# variant.  Uses whatever id.pred.y.group is currently in the session and
# writes the PDF to the current working directory.
library(ggplot2)
library(RColorBrewer)
variant.scatter <- ggplot(id.pred.y.group, aes(x=cut.score, y=Predictions., shape=group, color=group)) +
  geom_point() +
  scale_colour_brewer(palette = "RdYlBu") +
  theme_classic()
pdf("all.features.correlation.pdf")
print(variant.scatter)
dev.off()
# Run from the laptop; the destination is quoted because it contains spaces and parentheses.
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/normalizedEdgeFiles/all.features.correlation.pdf "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/."
## add classification??
# Re-load the all-features fold9/Set4 predictions, attach columns 1, 2 and 5
# of DataS6.txt (joined on sgRNA + ID), and facet the observed-vs-predicted
# scatter by the Quality column.
library(tidyr)
library(dplyr)
library(ggplot2)
library(RColorBrewer)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cut.score/foldRuns/fold9/Runs/Set4")
id <- read.delim("set4_test_SampleIDs.txt", header = FALSE, sep = "\t")
names(id) <- "sgRNAID"
pred <- read.delim("Ecoli.allCas9_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
id.pred.y.group <- separate(id.pred.y, sgRNAID, into = c("sgRNA", "ID", "group"), sep = "_")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
ds6 <- read.delim("DataS6.txt", header = TRUE, sep = "\t")
ds6.id <- separate(ds6, sgRNA, into = c("sgRNA", "ID"), sep = "_")
# Columns are picked by position; column 5 is presumably Quality - TODO confirm.
ds6.keep <- ds6.id[, c(1, 2, 5)]
# inner_join keeps only samples present in both tables.
id.pred.y.group.ds6 <- inner_join(id.pred.y.group, ds6.keep, by = c("sgRNA", "ID"))
pdf("all.features.correlation.class.pdf")
print(
  ggplot(id.pred.y.group.ds6, aes(x=cut.score, y=Predictions., shape=group, color=group)) +
    geom_point() +
    scale_colour_brewer(palette = "RdYlBu") +
    theme_classic() +
    facet_grid(. ~ Quality)
)
dev.off()
# Run from the laptop; the destination is quoted because it contains spaces and parentheses.
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/all.features.correlation.class.pdf "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/."
***Shows that the model (incorporating all Cas9 types) is better at predicting for Cas9 and eSpCas9 than for recACas9… want to be able to have three separate test sets for each run (a test set with each Cas9 type that is not used in the model training to eliminate bias)… then I can get summary accuracy statistics for each type of Cas9… use the --groupFile option?
***Ashley: Try making six groups and then having 6 k-folds instead of 5. It might just work. You’ll also need to change --sampleSize to something much closer to the number of samples in each new group
# split each Cas9 group into two groups...
# add --group tag
# add --sampleSize tag
# Build the iRF group file: each sample is labelled with one of six groups
# (first or second half of its Cas9 variant, in feature-table order).  The
# output has exactly one row per row of Ecoli.allCas9.features.txt, in the
# same order, under the header "groupID".
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/")
df <- read.delim("Ecoli.allCas9.features.txt")
df.id <- data.frame(df$sgRNAID)
library(tidyr)
# Sample IDs look like <sgRNA>_<ID>_<Cas9 variant>.
df.sep <- separate(df.id, df.sgRNAID, c("sgRNA", "ID", "Cas9"), sep="_")
# Variant name as it appears in the sample ID -> prefix used in the group label.
# Recorded sample counts: Cas9 = 40518, recACas9 = 44198, eSpCas9 = 41465.
group.prefix <- c(Cas9 = "Cas9", recACas9 = "recA.Cas9", eSpCas9 = "eSp.Cas9")
# Labels are assigned in place, so row order and row count always match the
# feature table (a join on sgRNA/ID/Cas9 would duplicate rows if any key
# repeated).  Samples of any other variant keep NA.
df.order <- df.sep
df.order$group <- NA_character_
for (cas in names(group.prefix)) {
  idx <- which(df.order$Cas9 == cas)
  # First floor(n/2) samples -> group1, the rest -> group2; the split point is
  # derived from the data (20259 / 22099 / 20732 for the recorded counts).
  n.first <- length(idx) %/% 2
  half <- ifelse(seq_along(idx) <= n.first, "group1", "group2")
  df.order$group[idx] <- paste(group.prefix[[cas]], half, sep = ".")
}
df.group <- data.frame(groupID = df.order$group)
write.table(df.group, "Ecoli.allCas9.groupfile.txt", quote=F, row.names=F, sep="\t")
# iRF run that uses the six-group file (Ecoli.allCas9.groupfile.txt).
# The steps alternate between two hosts, so each "# Andes" / "# Summit"
# section is pasted into a separate login session.
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9
# Plain mkdir: reports an error if the directory is left over from an earlier attempt.
mkdir group.features
module load python/3.7-anaconda3
# The builder is run from inside the run directory; the bsub lines below expect
# its Submits/ scripts under group.features/.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/group.features
# Unlike the other runs in this notebook: --NumIterations 6, plus --sampleSize
# 20000 and --groupFile.  The last two arguments are the feature table and the
# score (Y) table.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 120 --Account SYB105 --NumTrees 1000 --NumIterations 6 --RunName Ecoli.allCas9.group --bypass --Prediction --sampleSize 20000 --groupFile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.groupfile.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/group.features/Submits/submit_full_Ecoli.allCas9.group_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/group.features/Submits/submit_train_Ecoli.allCas9.group_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/group.features/Submits/submit_test_Ecoli.allCas9.group_0.sh
# Andes
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/group.features
# Post-processing; --Iterations 6 matches the --NumIterations 6 given to the builder above.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 6 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Ecoli.allCas9.group
# correlation
# Group-file run, fold9/Set4: overall Pearson and Spearman correlation between
# observed cut.score and the test-set predictions.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/group.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.group_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.4296021
cor(y$cut.score, pred$Predictions., method = "spearman")
# 0.417005
# correlation - by Cas9 type
# With the group file each test set holds a single group, so per the recorded
# values only one variant has samples here and the others give NA.
library(tidyr)
id <- read.delim("set4_test_SampleIDs.txt", header = FALSE, sep = "\t")
names(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
id.pred.y.group <- separate(id.pred.y, sgRNAID, into = c("sgRNA", "ID", "group"), sep = "_")
# 20000
with(id.pred.y.group, cor(cut.score, Predictions., method = "pearson"))
# 0.4296021
pred.Cas9 <- subset(id.pred.y.group, group == "Cas9")
with(pred.Cas9, cor(cut.score, Predictions., method = "pearson"))
# 0.4296021
pred.eSpCas9 <- subset(id.pred.y.group, group == "eSpCas9")
with(pred.eSpCas9, cor(cut.score, Predictions., method = "pearson"))
# NA
pred.recACas9 <- subset(id.pred.y.group, group == "recACas9")
with(pred.recACas9, cor(cut.score, Predictions., method = "pearson"))
# NA
# Group-file run, fold9, Set3 down to Set0: for each test set print the overall
# Pearson r and then the r within Cas9, eSpCas9 and recACas9 (in that order).
# Each set has 20000 test rows.  Recorded values:
#   Set   overall     Cas9        eSpCas9     recACas9
#   Set3  0.5120277   NA          0.5120277   NA
#   Set2  0.5001313   NA          0.5001313   NA
#   Set1  0.276328    NA          NA          0.276328
#   Set0  0.6965256   0.4400045   NA          0.2754579
# After the loop the working directory and all objects (pred, y, id, id.pred,
# id.pred.y, id.pred.y.group, pred.Cas9, pred.eSpCas9, pred.recACas9) hold the
# Set0 state.
library(tidyr)
runs.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/group.features/cut.score/foldRuns/fold9/Runs"
for (k in 3:0) {
  setwd(file.path(runs.dir, paste0("Set", k)))
  pred <- read.delim(paste0("Ecoli.allCas9.group_Set", k, "_test.prediction"), header = TRUE, sep = "\t")
  y <- read.delim(paste0("set", k, "_Y_test_noSampleIDs.txt"), header = TRUE, sep = "\t")
  id <- read.delim(paste0("set", k, "_test_SampleIDs.txt"), header = FALSE, sep = "\t")
  names(id) <- "sgRNAID"
  # IDs, predictions and observed scores are matched by row position.
  id.pred <- cbind(id, pred)
  id.pred.y <- cbind(id.pred, y)
  id.pred.y.group <- separate(id.pred.y, sgRNAID, into = c("sgRNA", "ID", "group"), sep = "_")
  # Values are printed explicitly because nothing auto-prints inside a loop.
  print(with(id.pred.y.group, cor(cut.score, Predictions., method = "pearson")))
  pred.Cas9 <- subset(id.pred.y.group, group == "Cas9")
  print(with(pred.Cas9, cor(cut.score, Predictions., method = "pearson")))
  pred.eSpCas9 <- subset(id.pred.y.group, group == "eSpCas9")
  print(with(pred.eSpCas9, cor(cut.score, Predictions., method = "pearson")))
  pred.recACas9 <- subset(id.pred.y.group, group == "recACas9")
  print(with(pred.recACas9, cor(cut.score, Predictions., method = "pearson")))
}
# feature columns
# sgRNAID = 1
# ind onehot = 2:12,30:33,1618:1619,1637:1639
# dep onehot = 34:69,132:147,210:225,288:303,366:381,444:459,522:537,600:615,678:693,756:771,896:915,1040:1059,1122:1141,1204:1223,1286:1301,1364:1379,1442:1457,1520:1535
# dwt = 13:29,1601:1617,1620:1636
# tensor = 70:131,148:209,226:287,304:365,382:443,460:521,538:599,616:677,694:755,772:895,916:1039,1060:1121,1142:1203,1224:1285,1302:1363,1380:1441,1458:1519,1536:1597
# raw = 1598:1600
# NOTE: the tensor block that follows dep-onehot 1040:1059 starts at 1060 (62
# columns, like the neighbouring tensor blocks).  It used to be written as
# 1050, which pulled dep-onehot columns 1050-1059 into the raw+tensor subset.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
# Column lists are defined once so all six output files stay in agreement.
raw_cols='1598-1600'
tensor_cols='70-131,148-209,226-287,304-365,382-443,460-521,538-599,616-677,694-755,772-895,916-1039,1060-1121,1142-1203,1224-1285,1302-1363,1380-1441,1458-1519,1536-1597'
onehot_cols='2-12,30-33,1618-1619,1637-1639,34-69,132-147,210-225,288-303,366-381,444-459,522-537,600-615,678-693,756-771,896-915,1040-1059,1122-1141,1204-1223,1286-1301,1364-1379,1442-1457,1520-1535'
# cut writes the selected fields in input-file order, whatever the list order.
# raw + tensor (with sample IDs, identical "_overlap" copy, and without IDs)
cut -f "1,${raw_cols},${tensor_cols}" Ecoli.allCas9.features.txt > Ecoli.allCas9.raw.tensor.features.txt
cp Ecoli.allCas9.raw.tensor.features.txt Ecoli.allCas9.raw.tensor.features_overlap.txt
cut -f "${raw_cols},${tensor_cols}" Ecoli.allCas9.features.txt > Ecoli.allCas9.raw.tensor.features_overlap_noSampleIDs.txt
# raw + tensor + onehot (with sample IDs, identical "_overlap" copy, and without IDs)
cut -f "1,${raw_cols},${tensor_cols},${onehot_cols}" Ecoli.allCas9.features.txt > Ecoli.allCas9.raw.tensor.onehot.features.txt
cp Ecoli.allCas9.raw.tensor.onehot.features.txt Ecoli.allCas9.raw.tensor.onehot.features_overlap.txt
cut -f "${raw_cols},${tensor_cols},${onehot_cols}" Ecoli.allCas9.features.txt > Ecoli.allCas9.raw.tensor.onehot.features_overlap_noSampleIDs.txt
# R
# Re-write the two cut-derived feature tables through a matrix: for each one,
# check that every feature column is numeric, then write the table with the
# sample-ID column, an identical "_overlap" copy, and a copy without IDs.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run")
df <- read.delim("Ecoli.allCas9.raw.tensor.features.txt", header=T, sep="\t")
# is.numeric() on a data.frame is always FALSE, so test the columns one by one.
all(vapply(df[, -1, drop = FALSE], is.numeric, logical(1)))
df.mat <- as.matrix(df[, -1, drop = FALSE])
# The ID column is taken by position.  Elsewhere in this notebook the same
# source table is read with the ID column named `sgRNAID`, so `df$df.sgRNAID`
# would be NULL and the IDs would be lost (or cbind would fail).
df.mat.id <- cbind(df[, 1, drop = FALSE], df.mat)
write.table(df.mat.id, "Ecoli.allCas9.raw.tensor.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "Ecoli.allCas9.raw.tensor.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "Ecoli.allCas9.raw.tensor.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
df <- read.delim("Ecoli.allCas9.raw.tensor.onehot.features.txt", header=T, sep="\t")
all(vapply(df[, -1, drop = FALSE], is.numeric, logical(1)))
df.mat <- as.matrix(df[, -1, drop = FALSE])
df.mat.id <- cbind(df[, 1, drop = FALSE], df.mat)
write.table(df.mat.id, "Ecoli.allCas9.raw.tensor.onehot.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "Ecoli.allCas9.raw.tensor.onehot.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "Ecoli.allCas9.raw.tensor.onehot.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# iRF runs for the two reduced feature sets (raw+tensor and raw+tensor+onehot).
# The steps alternate between two hosts, so each "# Andes" / "# Summit"
# section is pasted into a separate login session.
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9
# One run directory per feature set; plain mkdir reports an error if it already exists.
mkdir raw.tensor.features
mkdir raw.tensor.onehot.features
module load python/3.7-anaconda3
# The builder is run from inside each run directory; the last two arguments are
# the feature table and the score (Y) table.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 60 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9.raw.tensor --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.raw.tensor.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.onehot.features
# Same as above except for the feature table, --RunName and --RunTime (90 instead of 60).
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9.raw.tensor.onehot --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.raw.tensor.onehot.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.features/Submits/submit_full_Ecoli.allCas9.raw.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.onehot.features/Submits/submit_full_Ecoli.allCas9.raw.tensor.onehot_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.features/Submits/submit_train_Ecoli.allCas9.raw.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.onehot.features/Submits/submit_train_Ecoli.allCas9.raw.tensor.onehot_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.features/Submits/submit_test_Ecoli.allCas9.raw.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.onehot.features/Submits/submit_test_Ecoli.allCas9.raw.tensor.onehot_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9
module load python/3.7-anaconda3
# Post-processing is run from inside each run directory with that run's name as the last argument.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Ecoli.allCas9.raw.tensor
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.onehot.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Ecoli.allCas9.raw.tensor.onehot
# Switch to the conda environment and start an interactive R session for the
# correlation checks that follow.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# correlation
# raw+tensor run, fold9/Set4: overall Pearson and Spearman correlation between
# observed cut.score and the test-set predictions, then Pearson r within each
# Cas9 variant.
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.raw.tensor_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.3146719
cor(y$cut.score, pred$Predictions., method = "spearman")
# 0.364033
id <- read.delim("set4_test_SampleIDs.txt", header = FALSE, sep = "\t")
names(id) <- "sgRNAID"
# IDs, predictions and observed scores are matched by row position.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
id.pred.y.group <- separate(id.pred.y, sgRNAID, into = c("sgRNA", "ID", "group"), sep = "_")
pred.Cas9 <- subset(id.pred.y.group, group == "Cas9")
with(pred.Cas9, cor(cut.score, Predictions., method = "pearson"))
# 0.7360972
pred.eSpCas9 <- subset(id.pred.y.group, group == "eSpCas9")
with(pred.eSpCas9, cor(cut.score, Predictions., method = "pearson"))
# 0.689881
pred.recACas9 <- subset(id.pred.y.group, group == "recACas9")
with(pred.recACas9, cor(cut.score, Predictions., method = "pearson"))
# 0.5117508
# raw+tensor+onehot run, fold9/Set4: overall Pearson and Spearman correlation
# between observed cut.score and the test-set predictions, then Pearson r
# within each Cas9 variant.
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/raw.tensor.onehot.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.raw.tensor.onehot_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.3142851
cor(y$cut.score, pred$Predictions., method = "spearman")
# 0.3637944
id <- read.delim("set4_test_SampleIDs.txt", header = FALSE, sep = "\t")
names(id) <- "sgRNAID"
# IDs, predictions and observed scores are matched by row position.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
id.pred.y.group <- separate(id.pred.y, sgRNAID, into = c("sgRNA", "ID", "group"), sep = "_")
pred.Cas9 <- subset(id.pred.y.group, group == "Cas9")
with(pred.Cas9, cor(cut.score, Predictions., method = "pearson"))
# 0.7367916
pred.eSpCas9 <- subset(id.pred.y.group, group == "eSpCas9")
with(pred.eSpCas9, cor(cut.score, Predictions., method = "pearson"))
# 0.6891104
pred.recACas9 <- subset(id.pred.y.group, group == "recACas9")
with(pred.recACas9, cor(cut.score, Predictions., method = "pearson"))
# 0.5126078
# Pairwise correlation matrix of all features, computed in an interactive R
# session; the salloc line below is the recorded allocation request.
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t")
# Drop every row that contains at least one NA.
df <- na.omit(df)
## sample ID
# Columns 1-2 are left out of the matrix (presumably sample-ID columns - TODO confirm).
df.features <- as.matrix(df[,c(3:ncol(df))])
# cor() on a matrix returns the column-by-column correlation matrix (Pearson by default).
df.cor <- cor(df.features)
# Written without row names: only the header carries the feature names, so
# readers of this file must restore row names from the column names.
write.table(df.cor, "Ecoli.allCas9.DWT.raw.onehot.tensor.features.correlation.txt", quote=F, row.names=F, sep="\t")
# Plot the saved feature-correlation matrix twice: a clustered corrplot
# (features.cor.pdf) and a pheatmap (features.cor.heatmap.pdf).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.cor <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.features.correlation.txt", header=T, sep="\t")
# NA correlations are replaced by 0 so clustering and plotting do not fail
# (presumably these come from constant feature columns - TODO confirm).
df.cor[is.na(df.cor)] <- 0
# The file has no row-name column, so the symmetric matrix gets its row names
# from the column names.
df.num <- as.matrix(df.cor)
features <- colnames(df.num)
rownames(df.num) <- features
library(corrplot)
pdf("features.cor.pdf")
plot.new()
# Pass the named matrix prepared above, not the data.frame df.cor (whose row
# names are just 1..n), so rows are labelled with feature names.
corrplot(df.num, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)
dev.off()
library(pheatmap)
library(gplots)
pdf("features.cor.heatmap.pdf")
plot.new()
# greenred() comes from gplots; rows are clustered, columns keep file order.
pheatmap(df.num, col=greenred(256), main="Feature Correlation", cluster_cols=F, fontsize_row=2, border_color=NA)
dev.off()
# Melt the correlation matrix
# Long format: one row per (Var1, Var2) feature pair with its correlation in
# `value`.  Expects df.num (the named correlation matrix) in the session.
library(reshape2)
library(ggplot2)
melted_cormat <- melt(df.num, na.rm = TRUE)
# Heatmap
# Diverging blue-white-red fill centred on 0 over [-1, 1], square tiles.
cor.fill <- scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                                 midpoint = 0, limit = c(-1,1), space = "Lab",
                                 name="Pearson\nCorrelation")
tiny.x.labels <- theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                                  size = 2, hjust = 1))
cor.heatmap <- ggplot(data = melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  cor.fill +
  theme_minimal() +
  tiny.x.labels +
  coord_fixed()
pdf("features.cor.heatmap2.pdf")
print(cor.heatmap)
dev.off()
######### find highly correlated variables using caret
# Reload the saved correlation matrix and list the features caret would drop
# at several absolute-correlation cutoffs.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.cor <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.features.correlation.txt", header=T, sep="\t")
# NA correlations are treated as 0 (uncorrelated).
df.cor[is.na(df.cor)] <- 0
df.num <- as.matrix(df.cor)
# The file has no row-name column; restore row names from the column names.
features <- colnames(df.num)
rownames(df.num) <- features
library(caret)
# With names = TRUE each call prints the names of the features flagged for
# removal; the number recorded under each call is presumably how many were
# flagged - TODO confirm.
findCorrelation(df.num, cutoff = .4, exact = TRUE, names = TRUE)
# 1204
findCorrelation(df.num, cutoff = .6, exact = TRUE, names = TRUE)
# 1132
findCorrelation(df.num, cutoff = .8, exact = TRUE, names = TRUE)
# 1100
findCorrelation(df.num, cutoff = .9, exact = TRUE, names = TRUE)
# 986
### remove from iRF run???
# Drop the features flagged at cutoff 0.9 from the full feature table and write
# the reduced table three ways (with IDs, identical "_overlap" copy, no IDs).
# Expects df.num (the named correlation matrix) and caret in the session.
df.num.remove <- findCorrelation(df.num, cutoff = .9, exact = TRUE, names = TRUE)
# Saved in the current working directory, one feature name per line under the header "x".
write.table(df.num.remove, "ecoli.allCas9.features.highlycorrelated.txt", quote=F, row.names=F, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run")
df <- read.delim("Ecoli.allCas9.features.txt", header=T, sep="\t")
# Logical mask instead of -which(...): if no name matches, -which() yields
# integer(0) and would select zero columns instead of keeping them all.
df.subset <- df[, !(names(df) %in% df.num.remove), drop = FALSE]
# Column 1 of the subset is the sample ID; everything else is a feature.
df.mat <- as.matrix(df.subset[, -1, drop = FALSE])
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
write.table(df.mat.id, "Ecoli.allCas9.noncor.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "Ecoli.allCas9.noncor.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "Ecoli.allCas9.noncor.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# iRF run on the feature table with highly correlated features removed
# (Ecoli.allCas9.noncor.features.txt).  The steps alternate between two hosts,
# so each "# Andes" / "# Summit" section is pasted into a separate login session.
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9
# Plain mkdir: reports an error if the directory is left over from an earlier attempt.
mkdir noncor.features
module load python/3.7-anaconda3
# The builder is run from inside the run directory; the last two arguments are
# the feature table and the score (Y) table.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/noncor.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 120 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9.noncor --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.noncor.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/noncor.features/Submits/submit_full_Ecoli.allCas9.noncor_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/noncor.features/Submits/submit_train_Ecoli.allCas9.noncor_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/noncor.features/Submits/submit_test_Ecoli.allCas9.noncor_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/noncor.features
# Post-processing; --Iterations 5 matches the --NumIterations 5 given to the builder above.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Ecoli.allCas9.noncor
# correlation
# Compare observed vs predicted cut.score on the held-out test set of one
# fold/set (fold9 / Set4) of the non-correlated-feature run. The numbers in
# the "# ..." lines are the values recorded when this was run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/noncor.features/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.noncor_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions., method=c("pearson"))
# 0.6446967
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.5998552
# correlation - by Cas9 type
# IDs, predictions and observed scores are combined column-wise, so the three
# files are assumed to list the same samples in the same row order.
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
# Split "sgRNA_ID_group" on "_" so the third piece (group) is the Cas9 type.
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
# 25236
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.6446967
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.4463958
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.5049007
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.2918939
# Plot 1: bar chart of the 25 highest-weighted features from the normalized
# edge file of the non-correlated-feature run.
library(tidyr)
library(ggplot2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/noncor.features/normalizedEdgeFiles")
# Space-separated, no header: V1 = feature name, V2 = weight used for ranking.
feature <- read.delim("cut.score_Normalize.txt", header=F, sep=" ")
feature.order <- feature[order(-feature$V2),]
# head() instead of [1:25,]: with fewer than 25 features the index form pads
# the table with all-NA rows, which then show up as an NA bar.
feature.order.top <- head(feature.order, 25)
# Keep the full feature name for the axis before V1 is split below.
feature.order.top$id <- feature.order.top$V1
df <- separate(feature.order.top, V1, c("feature", "type"))
pdf("all.features.noncorrelated.importance.pdf")
# print() so the plot is drawn even when this is source()d rather than pasted.
print(ggplot(df, aes(x = reorder(id, -V2), y = V2)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)))
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/noncor.features/normalizedEdgeFiles/all.features.noncorrelated.importance.pdf /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/.
# Plot 2: observed vs predicted cut.score, coloured/shaped by Cas9 type.
# Uses id.pred.y.group from the correlation step earlier in this session; the
# PDF lands in the normalizedEdgeFiles directory set above.
library(ggplot2)
library(RColorBrewer)
pdf("all.features.noncorrelated.correlation.pdf")
print(ggplot(id.pred.y.group, aes(x=cut.score, y=Predictions., shape=group, color=group)) + geom_point() + scale_colour_brewer(palette = "RdYlBu") + theme_classic())
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/noncor.features/normalizedEdgeFiles/all.features.noncorrelated.correlation.pdf /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/.
# Min-max scale cut.score across all Cas9 types at once and write an
# ID + normalized-score table for the iRF run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/")
df <- read.delim("Ecoli.allCas9.score.txt", header=T, sep="\t")
# normalize cut.score values: (x - min(x)) / (max(x) - min(x))
score.min <- min(df$cut.score)
score.max <- max(df$cut.score)
df$cut.score.norm <- (df$cut.score - score.min) / (score.max - score.min)
# Columns 1 and 3: the ID column and the cut.score.norm column just added
# (the input is expected to have exactly two columns).
df.norm <- df[, c(1, 3)]
write.table(df.norm, "Ecoli.allCas9.score.normalized.txt", quote=F, row.names=F, sep="\t")
# iRF runs against the globally normalized score, once per feature encoding
# (all features, one-hot, tensor). Same three-stage workflow as before:
# set up on Andes, submit on Summit, post-process on Andes.
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
mkdir Ecoli.allCas9.norm
mkdir Ecoli.allCas9.norm/all.features
mkdir Ecoli.allCas9.norm/onehot.features
mkdir Ecoli.allCas9.norm/tensor.features
module load python/3.7-anaconda3
# Each setup call is run from inside its own run directory; the last two
# arguments are the feature table and the normalized score table.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.normalized.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/onehot.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9.onehot --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.onehot.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.normalized.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/tensor.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9.tensor --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.tensor.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.normalized.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.features/Submits/submit_full_Ecoli.allCas9_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/onehot.features/Submits/submit_full_Ecoli.allCas9.onehot_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/tensor.features/Submits/submit_full_Ecoli.allCas9.tensor_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.features/Submits/submit_train_Ecoli.allCas9_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/onehot.features/Submits/submit_train_Ecoli.allCas9.onehot_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/tensor.features/Submits/submit_train_Ecoli.allCas9.tensor_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.features/Submits/submit_test_Ecoli.allCas9_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/onehot.features/Submits/submit_test_Ecoli.allCas9.onehot_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/tensor.features/Submits/submit_test_Ecoli.allCas9.tensor_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm
module load python/3.7-anaconda3
# NOTE(review): all three post-processing calls read YNames.txt from
# Ecoli.allCas9.norm/ rather than from the per-encoding run directory -
# confirm that file exists and names the cut.score.norm response.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/YNames.txt Ecoli.allCas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/onehot.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/YNames.txt Ecoli.allCas9.onehot
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/tensor.features
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/YNames.txt Ecoli.allCas9.tensor
# correlation
# Observed vs predicted normalized cut score on the fold9 / Set4 test set of
# the all-features run. The "# ..." numbers are the values recorded at run time.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.features/cut.score.norm/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score.norm, pred$Predictions., method=c("pearson"))
# 0.6549382
cor(y$cut.score.norm, pred$Predictions., method=c("spearman"))
# 0.611766
# correlation - by Cas9 type
# IDs, predictions and observed scores are bound column-wise, so the three
# files are assumed to share one row order.
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
# 25236
# The response column here is cut.score.norm. It is spelled out in full
# because `$cut.score` only resolved through `$` partial matching, which
# breaks silently (returns NULL) as soon as a second cut.score* column exists.
cor(id.pred.y.group$cut.score.norm, id.pred.y.group$Predictions., method=c("pearson"))
# 0.6549382
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score.norm, pred.Cas9$Predictions., method=c("pearson"))
# 0.4608008
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score.norm, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.5306271
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score.norm, pred.recACas9$Predictions., method=c("pearson"))
# 0.3039527
# Observed vs predicted normalized cut score on the fold9 / Set4 test set of
# the one-hot-feature run. The "# ..." numbers are the values recorded at run
# time.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/onehot.features/cut.score.norm/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.onehot_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score.norm, pred$Predictions., method=c("pearson"))
# 0.3062494
cor(y$cut.score.norm, pred$Predictions., method=c("spearman"))
# 0.3573005
# correlation - by Cas9 type (files are assumed to share one row order)
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
# 25236
# Use the full column name cut.score.norm; `$cut.score` only worked through
# `$` partial matching.
cor(id.pred.y.group$cut.score.norm, id.pred.y.group$Predictions., method=c("pearson"))
# 0.3062494
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score.norm, pred.Cas9$Predictions., method=c("pearson"))
# 0.733117
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score.norm, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6803328
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score.norm, pred.recACas9$Predictions., method=c("pearson"))
# 0.5036643
# Observed vs predicted normalized cut score on the fold9 / Set4 test set of
# the tensor-feature run. The "# ..." numbers are the values recorded at run
# time.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/tensor.features/cut.score.norm/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9.tensor_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score.norm, pred$Predictions., method=c("pearson"))
# 0.3137147
cor(y$cut.score.norm, pred$Predictions., method=c("spearman"))
# 0.3629264
# correlation - by Cas9 type (files are assumed to share one row order)
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
# 25236
# Use the full column name cut.score.norm; `$cut.score` only worked through
# `$` partial matching.
cor(id.pred.y.group$cut.score.norm, id.pred.y.group$Predictions., method=c("pearson"))
# 0.3137147
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score.norm, pred.Cas9$Predictions., method=c("pearson"))
# 0.7356844
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score.norm, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6895006
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score.norm, pred.recACas9$Predictions., method=c("pearson"))
# 0.5094097
# --> Should I normalize within each Cas9 type and then combine the three sets? Probably, because otherwise the ranking across Cas9 types will still be off.
# Min-max scale cut.score separately within each Cas9 type and write an
# ID + normalized-score table for the iRF run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/")
df <- read.delim("Ecoli.allCas9.score.txt", header=T, sep="\t")
library(tidyr)
# Split a copy of the ID ("sgRNA_ID_group") so the original sgRNAID is kept.
df$sgRNAID.group <- df$sgRNAID
df.group <- df %>% separate(sgRNAID.group, c("sgRNA", "ID", "group"), "_")
# Keep the same rows as before (the three known Cas9 types) but in their
# ORIGINAL order. Stacking per-type subsets with rbind() regroups the rows by
# type, so the score table no longer lines up row-for-row with
# Ecoli.allCas9.features.txt. That misalignment is the likely cause of
# near-zero correlations if the iRF setup (run with --bypass) pairs X and Y
# by position - TODO confirm against the builder script.
cas9.types <- c("Cas9", "eSpCas9", "recACas9")
df.all <- df.group[df.group$group %in% cas9.types, ]
# normalize cut.score values within each type: (x - min(x)) / (max(x) - min(x))
minmax <- function(x) (x - min(x)) / (max(x) - min(x))
df.all$cut.score.norm <- ave(df.all$cut.score, df.all$group, FUN = minmax)
# Per-type views, kept for interactive inspection.
df.cas9 <- subset(df.all, df.all$group == "Cas9")
df.eSp <- subset(df.all, df.all$group == "eSpCas9")
df.recA <- subset(df.all, df.all$group == "recACas9")
# Select by name rather than by position c(1,6).
df.norm <- df.all[, c("sgRNAID", "cut.score.norm")]
write.table(df.norm, "Ecoli.allCas9.score.Cas9normalized.txt", quote=F, row.names=F, sep="\t")
# iRF run on the full feature set against the per-Cas9-type normalized score.
# Same three-stage workflow: set up on Andes, submit on Summit, post-process
# on Andes.
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
mkdir Ecoli.allCas9.norm/all.cas9norm.features
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.cas9norm.features
# Last two arguments: feature table, then the Cas9-normalized score table.
# NOTE(review): --bypass is set, so check that both tables list samples in
# the same row order before trusting the results.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 120 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.score.Cas9normalized.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.cas9norm.features/Submits/submit_full_Ecoli.allCas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.cas9norm.features/Submits/submit_train_Ecoli.allCas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.cas9norm.features/Submits/submit_test_Ecoli.allCas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.cas9norm.features
# NOTE(review): YNames.txt is read from Ecoli.allCas9.norm/, not from this
# run's own directory - confirm it is the intended file.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/YNames.txt Ecoli.allCas9
# correlation
# Observed vs predicted per-Cas9-normalized cut score on the fold9 / Set4
# test set. The "# ..." numbers are the values recorded at run time.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9.norm/all.cas9norm.features/cut.score.norm/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score.norm, pred$Predictions., method=c("pearson"))
# 0.009126348
cor(y$cut.score.norm, pred$Predictions., method=c("spearman"))
# 0.006658865
# correlation - by Cas9 type (files are assumed to share one row order)
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
#
# Use the full column name cut.score.norm; `$cut.score` only worked through
# `$` partial matching.
cor(id.pred.y.group$cut.score.norm, id.pred.y.group$Predictions., method=c("pearson"))
# 0.009126348
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score.norm, pred.Cas9$Predictions., method=c("pearson"))
# 0.01767534
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score.norm, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.007529625
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score.norm, pred.recACas9$Predictions., method=c("pearson"))
# 0.003470348
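############### Why did this get so bad??? ############### --> What other normalization methods should I try? (First check that the rows of the normalized score file are still in the same order as the rows of the feature file.)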
### Look into the actual range of cut scores
# Re-read the raw (un-normalized) scores, split them by Cas9 type, and print
# the five-number summary and row count per type. The "# ..." lines hold the
# output recorded at run time.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/")
df <- read.delim("Ecoli.allCas9.score.txt", header=T, sep="\t")
library(tidyr)
# Split a copy of the ID ("sgRNA_ID_group") so the original sgRNAID is kept.
df$sgRNAID.group <- df$sgRNAID
df.group <- df %>% separate(sgRNAID.group, c("sgRNA", "ID", "group"), "_")
df.cas9 <- subset(df.group, df.group$group == "Cas9")
df.eSp <- subset(df.group, df.group$group == "eSpCas9")
df.recA <- subset(df.group, df.group$group == "recACas9")
summary(df.cas9$cut.score)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00 17.24 27.18 24.56 32.69 48.38
summary(df.eSp$cut.score)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00 8.67 16.71 16.91 25.11 45.17
summary(df.recA$cut.score)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.01 10.90 13.13 12.43 14.87 22.03
nrow(df.cas9)
# 40518
nrow(df.eSp)
# 41465
nrow(df.recA)
# 44198
# --> Next: run the model on the full feature set, but separately for each Cas9 type.
# Split the combined score and feature tables into one table per Cas9 variant
# (Cas9, eSpCas9, recACas9). For each variant and each table this writes:
#   Ecoli.<type>.<kind>.txt                       header + matching rows
#   Ecoli.<type>.<kind>_overlap.txt               identical copy
#   Ecoli.<type>.<kind>_overlap_noSampleIDs.txt   same rows without the ID column

# extract_cas9_rows TYPE FILE
#   Print FILE's header line, then every row whose first tab-separated column
#   (the sgRNAID) ends in "_TYPE". Matching on the ID column only, instead of
#   a regex over the whole line, means a value in some other column can never
#   pull a row into the wrong variant, and the header is printed exactly once.
extract_cas9_rows() {
  awk -F'\t' -v suffix="_$1" '
    NR == 1 { print; next }
    {
      n = length(suffix)
      if (length($1) >= n && substr($1, length($1) - n + 1) == suffix) print
    }
  ' "$2"
}

# Only touch files if the run directory is actually reachable.
if cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/; then
  for cas9_type in Cas9 eSpCas9 recACas9; do
    # Scores: the ID is column 1 and the score is column 2.
    extract_cas9_rows "$cas9_type" Ecoli.allCas9.score.txt > "Ecoli.${cas9_type}.score.txt"
    cp -- "Ecoli.${cas9_type}.score.txt" "Ecoli.${cas9_type}.score_overlap.txt"
    cut -f 2 "Ecoli.${cas9_type}.score.txt" > "Ecoli.${cas9_type}.score_overlap_noSampleIDs.txt"

    # Features: everything except the ID column (GNU cut --complement).
    extract_cas9_rows "$cas9_type" Ecoli.allCas9.features.txt > "Ecoli.${cas9_type}.features.txt"
    cp -- "Ecoli.${cas9_type}.features.txt" "Ecoli.${cas9_type}.features_overlap.txt"
    cut -f 1 --complement "Ecoli.${cas9_type}.features.txt" > "Ecoli.${cas9_type}.features_overlap_noSampleIDs.txt"
  done
fi
# One independent iRF run per Cas9 type, each using that type's own feature
# and score tables. Same three-stage workflow: set up on Andes, submit on
# Summit, post-process on Andes.
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
mkdir Ecoli.Cas9
mkdir Ecoli.eSpCas9
mkdir Ecoli.recACas9
module load python/3.7-anaconda3
# Each setup call is run from inside its own run directory; the last two
# arguments are the per-type feature table and score table.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.Cas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 60 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.Cas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.Cas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.Cas9.score.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.eSpCas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 60 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.eSpCas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.eSpCas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.eSpCas9.score.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.recACas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 60 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.recACas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.recACas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.recACas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.Cas9/Submits/submit_full_Ecoli.Cas9_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.eSpCas9/Submits/submit_full_Ecoli.eSpCas9_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.recACas9/Submits/submit_full_Ecoli.recACas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.Cas9/Submits/submit_train_Ecoli.Cas9_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.eSpCas9/Submits/submit_train_Ecoli.eSpCas9_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.recACas9/Submits/submit_train_Ecoli.recACas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.Cas9/Submits/submit_test_Ecoli.Cas9_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.eSpCas9/Submits/submit_test_Ecoli.eSpCas9_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.recACas9/Submits/submit_test_Ecoli.recACas9_0.sh
# Andes
module load python/3.7-anaconda3
# Post-process each run from its own directory, using that run's YNames.txt.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.Cas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.Cas9/YNames.txt Ecoli.Cas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.eSpCas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.eSpCas9/YNames.txt Ecoli.eSpCas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.recACas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.recACas9/YNames.txt Ecoli.recACas9
# Observed vs predicted cut.score on the fold9 / Set4 test set of each
# per-Cas9-type run. pred and y are overwritten for every run; the "# ..."
# numbers are the values recorded at run time.
# correlation - Cas9
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.Cas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.Cas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions., method=c("pearson"))
# 0.4665774
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.4479432
# correlation - eSpCas9
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.eSpCas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.eSpCas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions., method=c("pearson"))
# 0.5523305
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.5507119
# correlation - recA
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.recACas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.recACas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions., method=c("pearson"))
# 0.3022364
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.2739521
# Y. lipolytica dataset: input files on the local machine.
# /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/Y.Lipolytica.SupTable1.txt
# /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/GSM552919_Ylip.fsa.txt
# Cluster environment setup (Andes); the R code below uses a local /Users path.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
### dataset --> Data S4... save each sheet as a dataframe, add column declaring Cas9 type, intersect with Data S1 for sequence, create new sgRNAID using both the ID and Cas9 type, merge files
# Build a three-column table (sgRNAID, cut.score, nucleotide.sequence) from
# the supplementary table and drop rows with missing values.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration")
df <- read.delim("Y.Lipolytica.SupTable1.txt", header=T, sep="\t")
library(dplyr)
library(tidyr)
# sgRNAID = "<Number>_<Gene.target>"; the two source columns are removed.
df2 <- unite(df, sgRNAID,c("Number", "Gene.target"), sep="_", remove=TRUE)
# Reorder by position to ID, score, sequence - assumes that after unite()
# column 2 is the sequence and column 3 the score; TODO confirm against the
# input file's column layout.
df3 <- df2[,c(1,3,2)]
colnames(df3) <- c("sgRNAID", "cut.score", "nucleotide.sequence")
df.na <- na.omit(df3)
# 46711
write.table(df.na, "Y.Lipolytica.txt", quote=F, row.names=F, sep="\t")
# table_to_fasta FILE
#   Convert a tab-separated table with a header row into FASTA on stdout:
#   column 1 becomes the ">" record name, column 3 the sequence.
#   The field separator is set to TAB explicitly because the table is written
#   with sep="\t"; awk's default whitespace splitting would shift the columns
#   for any ID that contains a space.
table_to_fasta() {
  awk -F'\t' 'NR > 1 { print ">" $1 "\n" $3 }' "$1"
}
table_to_fasta Y.Lipolytica.txt > Y.Lipolytica.fasta
# cd /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/
# scp Y.Lipolytica.txt noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/.
# scp Y.Lipolytica.fasta noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/.
### melting temp
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
# Count A/C/G/T per FASTA record and write one tab-separated row per record:
# record name, the four counts, sequence length, and the C+G fraction.
# The loop body is indented (it had lost its indentation and could not run),
# and both files are opened in a `with` block so they are closed even if
# parsing fails part-way through.
from Bio import SeqIO

with open('Y.Lipolytica.fasta', 'r') as input_file, \
        open('Y.Lipolytica_nuc_counts.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        # Counts are case-sensitive: only upper-case bases are counted.
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        # Despite the "CG%" header this is a fraction in [0, 1]. An empty
        # record is reported as 0.0 instead of raising ZeroDivisionError.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)
exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
# Append a MeltingTemp column, computed from the per-sequence base counts, to
# the nucleotide count table and write the result out.
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("Y.Lipolytica_nuc_counts.tsv", header=T, sep="\t")
df.melt <- df
# [[ ]] does exact name lookup, so "C" can never resolve to the CG column.
df.melt$MeltingTemp <- 64.9 + 41 * (df[["G"]] + df[["C"]] - 16.4) / (df[["A"]] + df[["T"]] + df[["G"]] + df[["C"]])
write.table(df.melt, "Y.Lipolytica.nuc.count.txt", quote=F, row.names=F, sep="\t")
q()
### structure
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# NOTE(review): --type AAC is run on the sgRNA nucleotide FASTA; confirm this
# descriptor type is the intended one for nucleotide input.
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py --file Y.Lipolytica.fasta --type AAC --out Y.Lipolytica.structure.txt
### onehot encoding
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# Keep columns 1 and 3 (sgRNAID and sequence), dropping the score column.
cut -f 1,3 Y.Lipolytica.txt > Y.Lipolytica.noscore.txt
# Presumably writes the Y.Lipolytica.noscore_{independent,dependent}{1,2}.txt
# files consumed by the next step - confirm against encode_sequences.py.
python encode_sequences.py Y.Lipolytica.noscore.txt
# separate nucleotide sequence values into individual columns in data frame so each position counts as one feature

# positional_header N_POSITIONS LABEL...
#   Print "sgRNAID p1.<L1> p1.<L2> ... pN.<Lk>": one space-separated column
#   name per (position, label) pair, positions outermost. Generating the
#   header avoids the hand-typed 320-name list, which had been broken across
#   two lines inside the quoted sed script.
positional_header() {
  local n_pos=$1 hdr=sgRNAID pos label
  shift
  for ((pos = 1; pos <= n_pos; pos++)); do
    for label in "$@"; do
      hdr+=" p${pos}.${label}"
    done
  done
  printf '%s\n' "$hdr"
}

# split_encoding FILE HEADER LAST_FIELD
#   Drop the first two lines of FILE, put a space after every character of
#   column 2 so each encoded character becomes its own field, prepend HEADER
#   (GNU sed one-line "1i" form), and keep fields 1..LAST_FIELD.
split_encoding() {
  sed '1,2d' "$1" \
    | awk '{gsub(/./,"& ",$2);print $0}' \
    | sed "1i $2" \
    | cut -d ' ' -f "1-$3"
}

# Only touch files if the data directory is actually reachable.
if cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/; then
  bases=(A C T G)
  # Dinucleotides in the same order as the original header: AA AC AT AG CA ...
  dinucs=()
  for first in "${bases[@]}"; do
    for second in "${bases[@]}"; do
      dinucs+=("${first}${second}")
    done
  done

  split_encoding Y.Lipolytica.noscore_independent1.txt "sgRNAID ${bases[*]}" 5 > Y.Lipolytica_ind1.txt
  split_encoding Y.Lipolytica.noscore_independent2.txt "sgRNAID ${dinucs[*]}" 17 > Y.Lipolytica_ind2.txt
  # 20 positions x 4 bases = 80 features + ID = 81 fields. This was cut to
  # fields 1-21, which kept only p1..p5 and silently discarded p6..p20.
  split_encoding Y.Lipolytica.noscore_dependent1.txt "$(positional_header 20 "${bases[@]}")" 81 > Y.Lipolytica_dep1.txt
  # 20 positions x 16 dinucleotides = 320 features + ID = 321 fields.
  split_encoding Y.Lipolytica.noscore_dependent2.txt "$(positional_header 20 "${dinucs[@]}")" 321 > Y.Lipolytica_dep2.txt
fi
#### chemical tensors
# Build a space-separated table with one column per sequence position
# (sgRNAID, p1..p20) from the ID/sequence table: skip the header row, put a
# space after every character of column 2, prepend the new header, and keep
# the first 21 fields.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
awk 'NR > 1 { gsub(/./, "& ", $2); print }' Y.Lipolytica.noscore.txt \
  | sed '1i sgRNAID p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 p16 p17 p18 p19 p20' \
  | cut -d ' ' -f 1-21 \
  > Y.Lipolytica.sequence.txt
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# Map every base at every sequence position to its per-base tensor values and
# reshape to one row per sgRNA with one column per (position, tensor feature).
library(dplyr)
library(reshape2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
tensor <- read.delim("protein_rna_dna-vector_lee_nucleotide_dna_data.txt", header=T, sep="\t", stringsAsFactors = F)
seq <- read.delim("Y.Lipolytica.sequence.txt", header=T, sep=" ", stringsAsFactors = F)
# Column 1 of the tensor table holds the feature names; columns 2-5 hold the
# values for the four bases.
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
tensor.df <- tensor[,2:5]
# Transpose so each row is a base and each column a tensor feature.
tensor.t <- as.data.frame(t(tensor.df))
# Assumes tensor columns 2-5 are ordered A, C, G, T - TODO confirm against the
# header of the tensor file; a different order would mislabel every base.
tensor.t$base <- c("A", "C", "G", "T")
rownames(seq) <- seq[,1]
seq.df <- seq[,2:21]
# Long format: one row per (sgRNAID, position) with the base at that position.
seq.melt <- melt(seq, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# Attach the tensor values for each base; bases missing from tensor.t get NA.
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# Wide format: columns are named "<position>_<tensor feature>".
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "Y.Lipolytica.tensors.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "Y.Lipolytica.tensors.melt.txt", quote=F, row.names=F, sep="\t")
# Build the long-format wavelet feature table (temperature, GC, structure) for
# the Y. lipolytica guides.
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
structure <- read.delim("Y.Lipolytica.structure.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("Y.Lipolytica.nuc.count.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("Y.Lipolytica.txt", header=T, sep="\t", stringsAsFactors = F)
# Only the first two columns of the score table are used (renamed to ID and score).
score.df <- score[,c(1:2)]
colnames(score.df) <- c("sgRNAID", "cut.score")
# Columns are selected by position.
# NOTE(review): presumably nuc[,7] is the GC fraction and nuc[,8] the melting
# temperature - confirm against the nuc.count file header.
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
# Run DWT
# Each vector is transformed with wavMODWT(); the row names of the resulting
# matrix are split on non-alphanumeric characters into a "scale" part and a
# "window" number.
temp.modwt <- wavMODWT(temp.df)
temp.modwt.df <- as.matrix(temp.modwt)
temp.modwt.label <- data.frame(label = row.names(temp.modwt.df), temp.modwt.df)
temp.modwt.name <- temp.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(temp.modwt.name) <- c("scale", "window", "temp.dwt")
gc.modwt <- wavMODWT(gc.df)
gc.modwt.df <- as.matrix(gc.modwt)
gc.modwt.label <- data.frame(label = row.names(gc.modwt.df), gc.modwt.df)
gc.modwt.name <- gc.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gc.modwt.name) <- c("scale", "window", "gc.dwt")
structure.modwt <- wavMODWT(structure.df)
structure.modwt.df <- as.matrix(structure.modwt)
structure.modwt.label <- data.frame(label = row.names(structure.modwt.df), structure.modwt.df)
structure.modwt.name <- structure.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(structure.modwt.name) <- c("scale", "window", "structure.dwt")
# Give every guide a zero-based row index (stored as character) so it can be
# joined to the "window" number parsed from the wavelet row names.
window <- data.frame(score.df[,1])
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
colnames(window) <- c("sgRNAID", "window")
library(tidygenomics)
window.score.df <- left_join(score.df, window, by=c("sgRNAID"))
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
# Keep rows with a score; rows where the comparison evaluates to NA are dropped by subset().
window.temp.gc.structure.sgRNA <- subset(window.temp.gc.structure, window.temp.gc.structure$cut.score != "NA")
# Column 3 ("window") is left out; the three *.dwt columns become variable/value pairs.
df.melt <- melt(window.temp.gc.structure.sgRNA[,c(1:2,4:7)], id=c("cut.score", "scale", "sgRNAID"))
df <- na.omit(df.melt)
# add sgRNA raw data
# The untransformed structure / GC / temperature values are tagged with
# scale "sgRNA.raw" and joined to the score table by sgRNAID.
structure.df <- data.frame(structure[,2])
gc.df <- data.frame(nuc[,7])
temp.df <- data.frame(nuc[,8])
structure.df$scale <- "sgRNA.raw"
gc.df$scale <- "sgRNA.raw"
temp.df$scale <- "sgRNA.raw"
structure.df$sgRNAID <- structure[,1]
gc.df$sgRNAID <- nuc[,1]
temp.df$sgRNAID <- nuc[,1]
window.score.structure <- left_join(window.score.df, structure.df, by="sgRNAID")
window.score.structure.temp <- left_join(window.score.structure, temp.df, by=c("sgRNAID", "scale"))
window.score.structure.temp.gc <- left_join(window.score.structure.temp, gc.df, by=c("sgRNAID", "scale"))
# Positional rename: the third column is the "window" index carried over from
# window.score.df and is labelled "seq" here.
colnames(window.score.structure.temp.gc) <- c("sgRNAID", "cut.score", "seq", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
## add one-hot encoding of sequence
# Merge the four one-hot tables by sgRNAID, attach them to the raw features and
# stack everything onto the long-format wavelet table (df).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
onehot.ind1 <- read.delim("Y.Lipolytica_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("Y.Lipolytica_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("Y.Lipolytica_dep1.txt", header=T, sep=" ")
onehot.dep2 <- read.delim("Y.Lipolytica_dep2.txt", header=T, sep=" ")
# Keep sgRNAID plus the first 304 dinucleotide columns.
# NOTE(review): presumably because a 20-nt guide has only 19 dinucleotide positions - confirm.
onehot.dep2 <- onehot.dep2[,1:305]
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
onehot$scale <- "sgRNA.raw"
data.onehot <- left_join(window.score.structure.temp.gc, onehot, by=c("sgRNAID", "scale"))
# Column 3 (labelled "seq") is left out before melting.
df2.melt <- melt(data.onehot[,c(1,2,4:ncol(data.onehot))], id=c("cut.score", "scale", "sgRNAID"))
df2 <- na.omit(df2.melt)
colnames(df) <- c("cut.score", "scale", "sgRNAID", "variable", "value")
df <- rbind(df, df2)
# Feature name and scale are concatenated into a single feature identifier.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.id$value <- as.numeric(df.id$value)
# Drop rows whose value is missing (including anything as.numeric could not convert).
df.id <- df.id[!(is.na(df.id$value) | df.id$value==""), ]
colnames(df.id) <- c("cut.score", "feature.scale", "sgRNAID", "value")
# Append the long-format tensor features to df.id and write the combined table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
tensor <- read.delim("Y.Lipolytica.tensors.melt.txt", header=T, sep="\t")
# Missing tensor values are replaced by 0 (before and after the numeric conversion).
tensor[is.na(tensor)] <- 0
tensor$scale <- "raw"
tensor.id <- tensor %>% unite(feature.scale, c(position, variable, scale), sep = "")
tensor.id$value <- as.numeric(tensor.id$value)
tensor.id[is.na(tensor.id)] <- 0
# One (cut.score, sgRNAID) pair per guide, used to attach the score to the tensor rows.
df.score <- unique(df.id[,c(1,3)])
tensor.score <- inner_join(tensor.id, df.score, by="sgRNAID")
# Reorder by position so the columns line up with df.id for rbind();
# head() of both tables is printed to check this by eye.
tensor.score.order <- tensor.score[,c(5,2,1,4)]
head(df.id)
head(tensor.score.order)
tensor.df <- rbind(df.id, tensor.score.order)
write.table(tensor.df, "Y.Lipolytica.DWT.raw.onehot.tensor.txt", quote=F, row.names=F, sep="\t")
# Cast the long feature table to one row per guide (sgRNAID + cut.score) and
# one column per feature; duplicated cells are averaged.
df.dcast <- tensor.df %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
write.table(df.dcast, "Y.Lipolytica.DWT.raw.onehot.tensor.dcast.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast)
# 46711
# Complete cases only. The ".na.txt" file must hold the na.omit() result;
# previously the unfiltered df.dcast was written under this name.
df.dcast.na <- na.omit(df.dcast)
write.table(df.dcast.na, "Y.Lipolytica.DWT.raw.onehot.tensor.dcast.na.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast.na)
# 46711
# Split the wide yeast table into feature and score files, each written with
# and without the sgRNAID column, in the Yeast prediction directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("Y.Lipolytica.DWT.raw.onehot.tensor.dcast.na.txt", header=TRUE, sep="\t")
# Column 1 = sgRNAID, column 2 = cut.score, remaining columns = features.
feature.cols <- 3:ncol(df.dcast)
df.features <- df.dcast[, c(1, feature.cols)]
df.features.nolabel <- df.dcast[, feature.cols]
df.scores <- df.dcast[, 1:2]
df.scores.nolabel <- data.frame(cut.score = df.dcast[, 2])
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/")
write.table(df.features, file="Yeast_test.txt", sep="\t", quote=FALSE, row.names=FALSE)
write.table(df.scores, file="Y_test.txt", sep="\t", quote=FALSE, row.names=FALSE)
write.table(df.features.nolabel, file="Yeast_test_noSampleIDs.txt", sep="\t", quote=FALSE, row.names=FALSE)
write.table(df.scores.nolabel, file="Y_test_noSampleIDs.txt", sep="\t", quote=FALSE, row.names=FALSE)
# LSF batch script (submitted as Yeast_test_submit.sh, see the bsub line at the
# end): runs ranger in prediction mode on the yeast feature/score tables using
# the forest saved from the E. coli all-features run (fold0/Set0).
#!/bin/bash -l
#BSUB -P SYB105
#BSUB -W 04:15
#BSUB -nnodes 50
#BSUB -J Yeast.test_0
#BSUB -o Yeast.test_0.o%J
#BSUB -e Yeast.test_0.e%J
#mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast
# NOTE(review): 50 nodes are requested but jsrun starts a single resource set
# (-n 1 -a 1 -c 40) - confirm the allocation size is intended.
# Elapsed seconds are reported by /usr/bin/time; ranger's stdout goes to Yeast_test.o.
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Yeast_test_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Y_test_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cut.score/foldRuns/fold0/Runs/Set0/Ecoli.allCas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix Yeast_test --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Yeast_test.o
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Yeast_test_submit.sh
### Summit
# LSF batch script (submitted as Yeast_tensor_test_submit.sh, see the bsub line
# at the end): same yeast input tables as the job above, but predicted with the
# forest from the E. coli tensor.features run; outputs use the prefix
# Yeast_tensor_test.
#!/bin/bash -l
#BSUB -P SYB105
#BSUB -W 02:15
#BSUB -nnodes 50
#BSUB -J Yeast.tensor.test_0
#BSUB -o Yeast.tensor.test_0.o%J
#BSUB -e Yeast.tensor.test_0.e%J
#mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast
# NOTE(review): the input is the all-features yeast table while the forest was
# trained on tensor.features - confirm the feature columns match.
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Yeast_test_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Y_test_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/tensor.features/cut.score/foldRuns/fold0/Runs/Set0/Ecoli.allCas9.tensor_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix Yeast_tensor_test --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Yeast_tensor_test.o
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Yeast_tensor_test_submit.sh
# ### why is the dataframe not numeric when all columns are??
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/")
# df <- read.delim("Yeast_test_noSampleIDs.txt", header=T, sep="\t")
#
# library(dplyr)
# non_num_cols <- subset(num_cols, num_cols == "FALSE")
# df_num <- select_if(df, is.numeric)
# # still not a numeric matrix ...
#
# df_mat <- as.matrix(df)
# # now it's numeric
# write.table(df_mat, "Yeast_test_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
#
# df <- read.delim("Y_test_noSampleIDs.txt", header=T, sep="\t")
# df_mat <- as.matrix(df)
# write.table(df_mat, "Y_test_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# ## now try to re-run
# Compare the observed yeast cut scores with the ranger predictions:
# correlation first, then a scatter plot written to PDF.
library(ggplot2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/")
score <- read.delim(file="Y_test_noSampleIDs.txt", header=TRUE, sep="\t")
predict <- read.delim(file="Yeast_test.prediction", header=TRUE, sep="\t")
# Rows are paired by position; both files are assumed to be in the same guide order.
score.predict <- cbind(score, predict)
with(score.predict, cor(cut.score, Predictions.))
# -0.05784664
pdf("Yeast.prediction.scatter.pdf")
scatter.plot <- ggplot(score.predict, aes(x=cut.score, y=Predictions.)) +
  geom_point() +
  theme_classic()
print(scatter.plot)
dev.off()
# Build the combined E. coli + yeast iRF input tables: each E. coli table
# (with its header) followed by the matching yeast table without its header.
#
# All outputs live in Ecoli.allCas9/all.features/multi.species, which is also
# where the later R and iRF steps look for them. The working directory used to
# be iRF.run/multi.species, so the relative "cp" commands could not find the
# files that had just been written.
# NOTE(review): appending rows under the E. coli header assumes both tables
# have the same columns in the same order - confirm.
irf_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
yeast_dir="${irf_dir}/Ecoli.allCas9/all.features/Yeast"
multi_dir="${irf_dir}/Ecoli.allCas9/all.features/multi.species"
mkdir -p "${multi_dir}"
cd "${multi_dir}"
# features
sed '1d' "${yeast_dir}/Yeast_test_noSampleIDs.txt" \
  | cat "${irf_dir}/Ecoli.allCas9.features_overlap_noSampleIDs.txt" - \
  > "${multi_dir}/Ecoli.Yeast.features_overlap_noSampleIDs.txt"
sed '1d' "${yeast_dir}/Yeast_test.txt" \
  | cat "${irf_dir}/Ecoli.allCas9.features.txt" - \
  > "${multi_dir}/Ecoli.Yeast.features.txt"
cp "${multi_dir}/Ecoli.Yeast.features.txt" "${multi_dir}/Ecoli.Yeast.features_overlap.txt"
# scores
sed '1d' "${yeast_dir}/Y_test_noSampleIDs.txt" \
  | cat "${irf_dir}/Ecoli.allCas9.score_overlap_noSampleIDs.txt" - \
  > "${multi_dir}/Ecoli.Yeast.scores_overlap_noSampleIDs.txt"
sed '1d' "${yeast_dir}/Y_test.txt" \
  | cat "${irf_dir}/Ecoli.allCas9.score.txt" - \
  > "${multi_dir}/Ecoli.Yeast.scores.txt"
cp "${multi_dir}/Ecoli.Yeast.scores.txt" "${multi_dir}/Ecoli.Yeast.scores_overlap.txt"
module load r
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/multi.species/")
# Read each combined table, convert it with as.matrix() and write it back over
# the same file, one file after the other.
for (table.file in c("Ecoli.Yeast.features_overlap_noSampleIDs.txt",
                     "Ecoli.Yeast.features.txt",
                     "Ecoli.Yeast.features_overlap.txt",
                     "Ecoli.Yeast.scores_overlap_noSampleIDs.txt",
                     "Ecoli.Yeast.scores.txt",
                     "Ecoli.Yeast.scores_overlap.txt")) {
  df <- read.delim(table.file, header=T, sep="\t")
  df.mat <- as.matrix(df)
  write.table(df.mat, table.file, quote=F, row.names=F, sep="\t")
}
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]

# Andes: generate the run directories and submit scripts for the Ecoli.Yeast run
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/multi.species
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
  --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 60 --Account SYB105 \
  --NumTrees 1000 --NumIterations 5 --RunName Ecoli.Yeast --bypass --Prediction \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/multi.species/Ecoli.Yeast.features.txt \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/multi.species/Ecoli.Yeast.scores.txt

# Summit: submit the generated jobs
module load python/3.7.0-anaconda3-5.3.0
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/multi.species
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/multi.species/Submits/submit_full_Ecoli.Yeast_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/multi.species/Submits/submit_train_Ecoli.Yeast_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/multi.species/Submits/submit_test_Ecoli.Yeast_0.sh

# Andes: post-process the finished run
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/multi.species
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py \
  --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt \
  Ecoli.Yeast
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# correlation
# Observed vs. predicted cut score for one fold/set of the Ecoli.Yeast run.
# NOTE(review): this path has no "all.features" component, unlike the directory
# the run was set up in - confirm it is the intended location.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/multi.species/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim(file="Ecoli.Yeast_Set4_test.prediction", header=TRUE, sep="\t")
y <- read.delim(file="set4_Y_test_noSampleIDs.txt", header=TRUE, sep="\t")
cor(y$cut.score, pred$Predictions., method="pearson")
#
cor(y$cut.score, pred$Predictions., method="spearman")
#
# Import the Doench 2014 table: clean it in R, then derive a fasta file from it.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
# scp /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/human/doench.2014.TableS7.txt noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/.
# (R) The three input columns are renamed by position and reordered to
# sgRNAID, cut.score, nucleotide.sequence; rows containing NA are dropped.
# NOTE(review): assumes the source table's columns are ID, sequence, score in that order - confirm.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/")
df <- read.delim("doench.2014.TableS7.txt", header=T, sep="\t")
colnames(df) <- c("sgRNAID", "nucleotide.sequence", "cut.score")
df2 <- df[,c(1,3,2)]
df.na <- na.omit(df2)
write.table(df.na, "Doench2014.txt", quote=F, row.names=F, sep="\t")
# (shell) One fasta record per guide: column 1 is the record name, column 3 the sequence.
# awk splits on any whitespace here, so IDs containing spaces would shift the columns.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/
sed '1d' Doench2014.txt | awk '{print ">"$1"\n"$3}' > Doench2014.fasta
### melting temp
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
python
# Tabulate, for every record in Doench2014.fasta, the A/C/G/T counts, the
# sequence length and the GC fraction, one tab-separated row per record.
# The loop body is indented (it was flush-left, which Python rejects) and both
# files are managed by "with" so they are closed even if a record fails.
from Bio import SeqIO

with open('Doench2014.fasta', 'r') as input_file, \
        open('Doench2014_nuc_counts.tsv', 'w') as output_file:
    # The last column is labelled CG% but holds a fraction between 0 and 1.
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        cg_percentage = float(C_count + G_count) / length
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)

exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
df <- read.delim(file="Doench2014_nuc_counts.tsv", header=TRUE, sep="\t")
# Append the melting temperature computed from the per-guide base counts
# (the formula in the comment at the top of this block) as a new last column.
df.melt <- df
df.melt$MeltingTemp <- with(df, 64.9 + 41 * (G+C-16.4) / (A+T+G+C))
write.table(df.melt, file="Doench2014.nuc.count.txt", sep="\t", quote=FALSE, row.names=FALSE)
q()
### structure
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py \
  --file Doench2014.fasta \
  --type AAC \
  --out Doench2014.structure.txt
### onehot encoding
# The encoder input keeps column 1 (sgRNAID) and column 3 (sequence) only.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/
cut -f 1,3 Doench2014.txt \
  > Doench2014.noscore.txt
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/encode_sequences.py \
  Doench2014.noscore.txt
# separate nucleotide sequence values into individual columns in data frame so each position counts as one feature
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/

#######################################
# Turn one encoder output file into a space-separated table with one column
# per encoded character.
# Arguments:
#   $1 - encoder output file; its first two lines are discarded
#   $2 - header line: space-separated column names, starting with the ID column
# Outputs:
#   stdout: the header followed by one row per guide, truncated to as many
#   fields as the header names
#######################################
expand_encoding() {
  local input=$1 header=$2
  local -a names
  read -r -a names <<<"$header"
  # awk appends a space to every character of column 2, so each digit of the
  # encoding becomes its own field.
  sed '1,2d' "$input" \
    | awk '{gsub(/./,"& ",$2);print $0}' \
    | sed "1i ${header}" \
    | cut -d ' ' -f "1-${#names[@]}"
}

# Headers are generated with brace expansion instead of being typed out.
# The literal dep2 header was broken across two physical lines inside the
# sed '1i' text, which cut the header short.
ind1_cols=(sgRNAID A C T G)
ind2_cols=(sgRNAID {A,C,T,G}{A,C,T,G})
dep1_cols=(sgRNAID p{1..20}.{A,C,T,G})
dep2_cols=(sgRNAID p{1..20}.{A,C,T,G}{A,C,T,G})

expand_encoding Doench2014.noscore_independent1.txt "${ind1_cols[*]}" > Doench2014_ind1.txt
expand_encoding Doench2014.noscore_independent2.txt "${ind2_cols[*]}" > Doench2014_ind2.txt
# dep1 now keeps all 81 columns named by its header. It used to be cut to
# fields 1-21, which kept only p1.A .. p5.G.
# NOTE(review): downstream feature tables gain the p6..p20 columns - confirm
# this matches what the model expects.
expand_encoding Doench2014.noscore_dependent1.txt "${dep1_cols[*]}" > Doench2014_dep1.txt
expand_encoding Doench2014.noscore_dependent2.txt "${dep2_cols[*]}" > Doench2014_dep2.txt
#### chemical tensors
# One column per nucleotide position: sgRNAID followed by p1..p20.
# NOTE(review): only the first 20 characters of each sequence are kept - confirm
# the Doench sequences are 20 nt long.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
sed '1d' Doench2014.noscore.txt \
  | awk '{gsub(/./,"& ",$2);print $0}' \
  | sed '1i sgRNAID p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 p16 p17 p18 p19 p20' \
  | cut -d ' ' -f 1-21 \
  > Doench2014.sequence.txt
# Attach per-base tensor values to every guide position (Doench 2014) and write
# the result in wide and long form.
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
# The tensor table is read from the e.coli directory; everything else in this
# block is read from and written to the human directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
tensor <- read.delim("protein_rna_dna-vector_lee_nucleotide_dna_data.txt", header=T, sep="\t", stringsAsFactors = F)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/")
seq <- read.delim("Doench2014.sequence.txt", header=T, sep=" ", stringsAsFactors = F)
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
tensor.df <- tensor[,2:5]
# Transpose so that each row is one base and each column one tensor feature.
tensor.t <- as.data.frame(t(tensor.df))
# NOTE(review): assumes columns 2:5 of the tensor file are ordered A, C, G, T - confirm.
tensor.t$base <- c("A", "C", "G", "T")
rownames(seq) <- seq[,1]
seq.df <- seq[,2:21]
# Long format: one row per (sgRNAID, position) holding the base at that position.
seq.melt <- melt(seq, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# Wide format: one row per guide, one column per position/feature combination.
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "Doench2014.tensors.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "Doench2014.tensors.melt.txt", quote=F, row.names=F, sep="\t")
# Build the long-format wavelet feature table (temperature, GC, structure) for
# the Doench 2014 guides.
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
structure <- read.delim("Doench2014.structure.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("Doench2014.nuc.count.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("Doench2014.txt", header=T, sep="\t", stringsAsFactors = F)
# Doench2014.txt is written with the columns sgRNAID, cut.score,
# nucleotide.sequence; only the first two are used here.
score.df <- score[,c(1:2)]
colnames(score.df) <- c("sgRNAID", "cut.score")
# nuc column 7 is the GC fraction ("CG%") and column 8 the MeltingTemp column
# appended when Doench2014.nuc.count.txt was created.
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
# Run DWT
# Each vector is transformed with wavMODWT(); the row names of the resulting
# matrix are split on non-alphanumeric characters into a "scale" part and a
# "window" number.
temp.modwt <- wavMODWT(temp.df)
temp.modwt.df <- as.matrix(temp.modwt)
temp.modwt.label <- data.frame(label = row.names(temp.modwt.df), temp.modwt.df)
temp.modwt.name <- temp.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(temp.modwt.name) <- c("scale", "window", "temp.dwt")
gc.modwt <- wavMODWT(gc.df)
gc.modwt.df <- as.matrix(gc.modwt)
gc.modwt.label <- data.frame(label = row.names(gc.modwt.df), gc.modwt.df)
gc.modwt.name <- gc.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gc.modwt.name) <- c("scale", "window", "gc.dwt")
structure.modwt <- wavMODWT(structure.df)
structure.modwt.df <- as.matrix(structure.modwt)
structure.modwt.label <- data.frame(label = row.names(structure.modwt.df), structure.modwt.df)
structure.modwt.name <- structure.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(structure.modwt.name) <- c("scale", "window", "structure.dwt")
# Give every guide a zero-based row index (stored as character) so it can be
# joined to the "window" number parsed from the wavelet row names.
window <- data.frame(score.df[,1])
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
colnames(window) <- c("sgRNAID", "window")
library(tidygenomics)
window.score.df <- left_join(score.df, window, by=c("sgRNAID"))
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
# Keep rows with a score; rows where the comparison evaluates to NA are dropped by subset().
window.temp.gc.structure.sgRNA <- subset(window.temp.gc.structure, window.temp.gc.structure$cut.score != "NA")
# Column 3 ("window") is left out; the three *.dwt columns become variable/value pairs.
df.melt <- melt(window.temp.gc.structure.sgRNA[,c(1:2,4:7)], id=c("cut.score", "scale", "sgRNAID"))
df <- na.omit(df.melt)
# add sgRNA raw data
# The untransformed structure / GC / temperature values are tagged with
# scale "sgRNA.raw" and joined to the score table by sgRNAID.
structure.df <- data.frame(structure[,2])
gc.df <- data.frame(nuc[,7])
temp.df <- data.frame(nuc[,8])
structure.df$scale <- "sgRNA.raw"
gc.df$scale <- "sgRNA.raw"
temp.df$scale <- "sgRNA.raw"
structure.df$sgRNAID <- structure[,1]
gc.df$sgRNAID <- nuc[,1]
temp.df$sgRNAID <- nuc[,1]
window.score.structure <- left_join(window.score.df, structure.df, by="sgRNAID")
window.score.structure.temp <- left_join(window.score.structure, temp.df, by=c("sgRNAID", "scale"))
window.score.structure.temp.gc <- left_join(window.score.structure.temp, gc.df, by=c("sgRNAID", "scale"))
# Positional rename: the third column is the "window" index carried over from
# window.score.df and is labelled "seq" here.
colnames(window.score.structure.temp.gc) <- c("sgRNAID", "cut.score", "seq", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
## add one-hot encoding of sequence
# Merge the four one-hot tables by sgRNAID, attach them to the raw features and
# stack everything onto the long-format wavelet table (df).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
onehot.ind1 <- read.delim("Doench2014_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("Doench2014_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("Doench2014_dep1.txt", header=T, sep=" ")
onehot.dep2 <- read.delim("Doench2014_dep2.txt", header=T, sep=" ")
# Keep sgRNAID plus the first 304 dinucleotide columns.
# NOTE(review): presumably because a 20-nt guide has only 19 dinucleotide positions - confirm.
onehot.dep2 <- onehot.dep2[,1:305]
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
onehot$scale <- "sgRNA.raw"
data.onehot <- left_join(window.score.structure.temp.gc, onehot, by=c("sgRNAID", "scale"))
# Column 3 (labelled "seq") is left out before melting.
df2.melt <- melt(data.onehot[,c(1,2,4:ncol(data.onehot))], id=c("cut.score", "scale", "sgRNAID"))
df2 <- na.omit(df2.melt)
colnames(df) <- c("cut.score", "scale", "sgRNAID", "variable", "value")
df <- rbind(df, df2)
# Feature name and scale are concatenated into a single feature identifier.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.id$value <- as.numeric(df.id$value)
# Drop rows whose value is missing (including anything as.numeric could not convert).
df.id <- df.id[!(is.na(df.id$value) | df.id$value==""), ]
colnames(df.id) <- c("cut.score", "feature.scale", "sgRNAID", "value")
# Append the long-format tensor features to df.id and write the combined table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
tensor <- read.delim("Doench2014.tensors.melt.txt", header=T, sep="\t")
# Missing tensor values are replaced by 0 (before and after the numeric conversion).
tensor[is.na(tensor)] <- 0
tensor$scale <- "raw"
tensor.id <- tensor %>% unite(feature.scale, c(position, variable, scale), sep = "")
tensor.id$value <- as.numeric(tensor.id$value)
tensor.id[is.na(tensor.id)] <- 0
# One (cut.score, sgRNAID) pair per guide, used to attach the score to the tensor rows.
df.score <- unique(df.id[,c(1,3)])
tensor.score <- inner_join(tensor.id, df.score, by="sgRNAID")
# Reorder by position so the columns line up with df.id for rbind();
# head() of both tables is printed to check this by eye.
tensor.score.order <- tensor.score[,c(5,2,1,4)]
head(df.id)
head(tensor.score.order)
tensor.df <- rbind(df.id, tensor.score.order)
write.table(tensor.df, "Doench2014.DWT.raw.onehot.tensor.txt", quote=F, row.names=F, sep="\t")
# Cast the long feature table to one row per guide (sgRNAID + cut.score) and
# one column per feature; duplicated cells are averaged.
df.dcast <- tensor.df %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
write.table(df.dcast, "Doench2014.DWT.raw.onehot.tensor.dcast.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast)
#
# Complete cases only. The ".na.txt" file must hold the na.omit() result;
# previously the unfiltered df.dcast was written under this name.
df.dcast.na <- na.omit(df.dcast)
write.table(df.dcast.na, "Doench2014.DWT.raw.onehot.tensor.dcast.na.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast.na)
#
# Split the wide Doench table into feature and score files, each written with
# and without the sgRNAID column.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
df.dcast <- read.delim("Doench2014.DWT.raw.onehot.tensor.dcast.na.txt", header=TRUE, sep="\t")
# Column 1 = sgRNAID, column 2 = cut.score, remaining columns = features.
feature.cols <- 3:ncol(df.dcast)
df.features <- df.dcast[, c(1, feature.cols)]
df.features.nolabel <- df.dcast[, feature.cols]
df.scores <- df.dcast[, 1:2]
df.scores.nolabel <- data.frame(cut.score = df.dcast[, 2])
# NOTE(review): the Doench files are written into the E. coli "Yeast" prediction
# directory - confirm this is the intended destination.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/")
write.table(df.features, file="Doench2014_features.txt", sep="\t", quote=FALSE, row.names=FALSE)
write.table(df.scores, file="Doench2014_score.txt", sep="\t", quote=FALSE, row.names=FALSE)
write.table(df.features.nolabel, file="Doench2014_features_noSampleIDs.txt", sep="\t", quote=FALSE, row.names=FALSE)
write.table(df.scores.nolabel, file="Doench2014_score_noSampleIDs.txt", sep="\t", quote=FALSE, row.names=FALSE)
# Summit
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# Write the iRF input tables for the Doench 2014 data: features and scores,
# with sample IDs (plain and "_overlap" names) and without sample IDs.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
df <- read.delim("Doench2014.DWT.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t")
df <- na.omit(df)
# The output directory is created here because the shell "mkdir iRF.run" step
# only appears later in these notes; without it setwd() fails and the tables
# are written to the previous working directory.
irf.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run"
dir.create(irf.dir, showWarnings=FALSE, recursive=TRUE)
## sample ID
# Column 1 = sgRNAID, column 2 = cut.score, remaining columns = features.
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
setwd(irf.dir)
write.table(as.matrix(df.features), "doench2014.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(as.matrix(df.score), "doench2014.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(as.matrix(df.features), "doench2014.features.txt", quote=F, row.names=F, sep="\t")
write.table(as.matrix(df.score), "doench2014.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- df[,c(3:ncol(df))]
df.score <- data.frame(df[,2])
colnames(df.score) <- "cut.score"
setwd(irf.dir)
write.table(as.matrix(df.features), "doench2014.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(as.matrix(df.score), "doench2014.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes: generate the iRF submission scripts for the doench2014 run.
# mkdir -p: iRF.run already exists by this point (the R step above writes the
# feature/score files into it), so a plain mkdir would fail with "File exists".
run_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run
mkdir -p "$run_dir"
module load python/3.7-anaconda3
# Only launch the builder if we really are inside the run directory, since it
# writes its outputs relative to the current directory.
cd "$run_dir" &&
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
    --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
    --NumTrees 1000 --NumIterations 5 --RunName doench2014 --bypass --Prediction \
    "$run_dir/doench2014.features.txt" \
    "$run_dir/doench2014.score.txt"
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/Submits/submit_full_doench2014_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/Submits/submit_train_doench2014_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/Submits/submit_test_doench2014_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt doench2014
# R2=0.4446724476508727
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("doench2014_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.6370642
# spearman correlation
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.6682296
# sort -k 3,3n cut.score_Normalize.txt | tail
# p17xx_quadrupoleraw cut.score 9.615995504128854e-05
# p3vibrational_energyraw cut.score 9.624203163538926e-05
# p2relativenum_Natomsraw cut.score 9.662886093540266e-06
# p11homo_energyraw cut.score 9.665703863936331e-05
# p7num_singlebondsraw cut.score 9.66946089113108e-06
# p9num_aromaticbondsraw cut.score 9.680500771041813e-05
# p12rot_temp_xraw cut.score 9.717825391211206e-05
# p13num_Natomsraw cut.score 9.829987103079123e-07
# p4num_Natomsraw cut.score 9.88929033233781e-06
# p18xx_polarizabilityraw cut.score 9.944272980322534e-05
# sort -k 3,3n /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/normalizedEdgeFiles/cut.score_Normalize.txt | tail
# p11relativenum_Catomsraw cut.score 9.739818907811671e-08
# p20tot_dipoleraw cut.score 9.741249959050039e-06
# p13.AAsgRNA.raw cut.score 9.784097316716445e-11
# p8relativenum_doublebondsraw cut.score 9.828628264075046e-05
# p20relativenum_Natomsraw cut.score 9.85261941718885e-05
# p13homo_energyraw cut.score 9.857165109357781e-06
# p19num_Oatomsraw cut.score 9.869118596172378e-05
# p16relativenum_Patomsraw cut.score 9.877873262571798e-07
# p2homo1_energyraw cut.score 9.920888979207425e-06
# p6.CAsgRNA.raw cut.score 9.972827721211694e-09
library(tidyr)
library(ggplot2)
# Bar plot of the 25 top-ranked features for cut.score (doench2014 run).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/normalizedEdgeFiles")
feature <- read.delim("cut.score_Normalize.txt", header=F, sep="\t")
# Rank features by the third column (the value being plotted), largest first.
feature.order <- feature[order(-feature$V3),]
# head() instead of [1:25,]: with fewer than 25 rows the index form pads the
# table with all-NA rows, which would then appear in the plot.
feature.order.top <- head(feature.order, 25)
# Keep the full feature name for the axis labels before V1 is split.
feature.order.top$id <- feature.order.top$V1
df <- separate(feature.order.top, V1, c("feature", "type"))
pdf("doench2014.importance.pdf")
# print() so the figure is drawn even when this is run through source().
print(ggplot(df, aes(x = reorder(id, -V3), y = V3)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)))
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/normalizedEdgeFiles/doench2014.importance.pdf /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/human/.
library(tidyr)
library(ggplot2)
# Bar plot of the 25 top-ranked features for cut.score (E. coli all-features run).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/normalizedEdgeFiles")
feature <- read.delim("cut.score_Normalize.txt", header=F, sep="\t")
# Rank features by the third column (the value being plotted), largest first.
feature.order <- feature[order(-feature$V3),]
# head() instead of [1:25,]: with fewer than 25 rows the index form pads the
# table with all-NA rows, which would then appear in the plot.
feature.order.top <- head(feature.order, 25)
# Keep the full feature name for the axis labels before V1 is split.
feature.order.top$id <- feature.order.top$V1
df <- separate(feature.order.top, V1, c("feature", "type"))
pdf("ecoli.allfeatures.importance.pdf")
# print() so the figure is drawn even when this is run through source().
print(ggplot(df, aes(x = reorder(id, -V3), y = V3)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)))
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/normalizedEdgeFiles/ecoli.allfeatures.importance.pdf /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/human/.
–> density
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# make 1kb windows of genome
# bedtools intersect to calculate density of feature per 1kb window
# Inputs shared by every step below.
genome_fa=genome/GCF_000005845.2_ASM584v2_genomic.fna
gene_gff=genome/GCF_000005845.2_ASM584v2_genomic.gene.gff
samtools_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/samtools/samtools
# Index the genome, turn the index into a sorted chromosome-size table, and
# tile the genome into 1 kb and 500 bp windows.
"$samtools_bin" faidx "$genome_fa"
cut -f1,2 "${genome_fa}.fai" | sort -k 1,1 -k 2,2n > ecoli.sizes.genome
bedtools makewindows -g ecoli.sizes.genome -w 1000 > ecoli.1kb.windows.bed
bedtools makewindows -g ecoli.sizes.genome -w 500 > ecoli.500bp.windows.bed
## genes
# One output row per overlapping window/gene pair.
bedtools intersect -wo -a ecoli.1kb.windows.bed -b "$gene_gff" > ecoli.gene.windows.bed
bedtools intersect -wo -a ecoli.500bp.windows.bed -b "$gene_gff" > ecoli.gene.windows500.bed
## GC content
# sed '1d' removes the first line of the bedtools nuc output.
bedtools nuc -fi "$genome_fa" -bed ecoli.1kb.windows.bed | sed '1d' > ecoli.GC.windows.bed
bedtools nuc -fi "$genome_fa" -bed ecoli.500bp.windows.bed | sed '1d' > ecoli.GC.windows500.bed
–> melting temp https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
Bio.SeqUtils.MeltingTemp.Tm_NN(seq, check=True, strict=True, c_seq=None, shift=0, nn_table=None, tmm_table=None, imm_table=None, de_table=None, dnac1=25, dnac2=25, selfcomp=False, Na=50, K=0, Tris=0, Mg=0, dNTPs=0, saltcorr=5)
https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/fasta_n
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
### 500bp windows
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
bedtools getfasta -fi genome/GCF_000005845.2_ASM584v2_genomic.fna -bed ecoli.500bp.windows.bed -fo ecoli.500bp.fa
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
# Count A/C/G/T in every record of ecoli.500bp.fa and write one TSV row per
# record: name, the four counts, sequence length, and the G+C fraction.
# NOTE: the 'CG%' column holds a fraction (0-1), not a percentage; the header
# text is left unchanged because the R step that follows reads this file.
from Bio import SeqIO
# 'with' closes both files even if parsing fails part-way through.
with open('ecoli.500bp.fa', 'r') as input_file, \
        open('nucleotide_counts_500bp.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        # Counts are case-sensitive: only upper-case bases are tallied.
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        # An empty record would otherwise raise ZeroDivisionError and abort
        # the whole run; report 0.0 for it instead.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)

exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Per-window nucleotide counts produced by the Python step above.
counts <- read.delim("nucleotide_counts_500bp.tsv", header=T, sep="\t")
# Tm (deg C) = 64.9 + 41 * (nG + nC - 16.4) / (nA + nT + nG + nC)
n.bases <- counts$A + counts$T + counts$G + counts$C
counts$MeltingTemp <- 64.9 + 41 * (counts$G + counts$C - 16.4) / n.bases
# Same table with the MeltingTemp column appended as the last column.
write.table(counts, "nucleotide_counts_500bp_temp.txt", quote=F, row.names=F, sep="\t")
q()
–> structure https://academic.oup.com/bioinformatics/article/34/14/2499/4924718 https://github.com/Superzchen/iFeature/ https://github.com/feliixx/gotranseq
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
#git clone https://github.com/Superzchen/iFeature
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py --file ecoli.1kb.fa --type AAC --out ecoli.structure.txt
# convert from nucleotide to protein sequence first
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
#git clone https://github.com/feliixx/gotranseq.git
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/gotranseq/transeq ecoli.1kb.fa ecoli.1kb.protein.fa
gotranseq --sequence file.fna --outseq test.protein.fa
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py --file test.protein.fa --type AAC --out test.txt
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py --file ecoli.500bp.fa --type AAC --out 500bp.protein.structure.fa
–> rnaseq
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/genome")
# sed '1d' GCF_000005845.2_ASM584v2_genomic.gff | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' > GCF_000005845.2_ASM584v2_genomic.txt
annotation <- read.delim("GCF_000005845.2_ASM584v2_genomic.txt", header=F, sep="\t")
gene <- subset(annotation, annotation$V3 == "gene")
gene.id <- separate(gene, V9, c("id1", "id2"), sep="EcoGene:")
gene.id$gene_id <- substr(gene.id$id2, 1, 7)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
rna <- read.delim("GSM2267479_Sample-1.genes.results.txt", header=T, sep="\t")
rna.id <- left_join(rna, gene.id, by="gene_id")
rna.id.idf <- na.omit(rna.id[,c(8,11,12,1,3:7)])
write.table(rna.id.idf, "GSM2267479.fpkm.coord.txt", quote=F, row.names=F, sep="\t")
# calculate density (avg fpkm per 1kb window)
#sed '1d' GSM2267479.fpkm.coord.txt > GSM2267479.fpkm.coord.bed
#bedtools intersect -wo -a ecoli.1kb.windows.bed -b GSM2267479.fpkm.coord.bed > ecoli.rnaseq.windows.bed
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.rnaseq.windows.bed", header=F, sep="\t")
window.df <- window %>% group_by(V1, V2, V3) %>% mutate(avg.fpkm = mean(V12))
window.uniq <- unique(window.df[,c(1:3,14)])
write.table(window.uniq, "ecoli.rnaseq.average.windows.bed", quote=F, row.names=F, sep="\t")
#bedtools intersect -wo -a ecoli.500bp.windows.bed -b GSM2267479.fpkm.coord.bed > ecoli.rnaseq.windows500.bed
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.rnaseq.windows500.bed", header=F, sep="\t")
window.df <- window %>% group_by(V1, V2, V3) %>% mutate(avg.fpkm = mean(V12))
window.uniq <- unique(window.df[,c(1:3,14)])
write.table(window.uniq, "ecoli.rnaseq.average.windows500.bed", quote=F, row.names=F, sep="\t")
–> run DWT on genome-wide patterns
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gene <- read.delim("ecoli.gene.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.delim("500bp.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.delim("ecoli.500bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
#score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
# Build one value per 500 bp window, in window-file order, for every signal so
# that position i of each vector refers to the same window. The later joins on
# the window index rely on this.
# `bedtools intersect -wo` only reports windows that overlap a feature, so the
# gene and RNA-seq tables are shorter than the window list. They are expanded
# back onto the full list here; windows with no gene / no expression record
# get 0 (assumption: "no overlapping gene" is best represented as 0).
# `window` is still in its raw V1/V2/V3 form at this point (it is renamed
# further down), which is what the join keys below use.
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
gene.all <- left_join(window, as.data.frame(gene.count), by=c("V1", "V2", "V3"))
gene.all$gene.count[is.na(gene.all$gene.count)] <- 0
gene.df <- gene.all$gene.count
# These three come from files computed over every window, so they are taken
# as-is: column 2 of the structure table, columns 7 (GC) and 8 (MeltingTemp)
# of the nucleotide table.
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.all <- left_join(window, rnaseq, by=c("V1", "V2", "V3"))
rna.all$avg.fpkm[is.na(rna.all$avg.fpkm)] <- 0
rna.df <- rna.all$avg.fpkm
# Run DWT instead of CWT
temp.modwt <- wavMODWT(temp.df)
temp.modwt.df <- as.matrix(temp.modwt)
temp.modwt.label <- data.frame(label = row.names(temp.modwt.df), temp.modwt.df)
temp.modwt.name <- temp.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(temp.modwt.name) <- c("scale", "window", "temp.dwt")
library(wavelets)
temp.mat <- as.matrix(temp.df)
temp.dwt <- modwt(temp.mat, filter="la8")
pdf("ecoli.500bp.temp.pdf")
plot.modwt(temp.dwt)
dev.off()
gc.modwt <- wavMODWT(gc.df)
gc.modwt.df <- as.matrix(gc.modwt)
gc.modwt.label <- data.frame(label = row.names(gc.modwt.df), gc.modwt.df)
gc.modwt.name <- gc.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gc.modwt.name) <- c("scale", "window", "gc.dwt")
gc.mat <- as.matrix(gc.df)
gc.dwt <- modwt(gc.mat, filter="la8")
pdf("ecoli.500bp.gc.pdf")
plot.modwt(gc.dwt)
dev.off()
structure.modwt <- wavMODWT(structure.df)
structure.modwt.df <- as.matrix(structure.modwt)
structure.modwt.label <- data.frame(label = row.names(structure.modwt.df), structure.modwt.df)
structure.modwt.name <- structure.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(structure.modwt.name) <- c("scale", "window", "structure.dwt")
structure.mat <- as.matrix(structure.df)
structure.dwt <- modwt(structure.mat, filter="la8")
pdf("ecoli.500bp.structure.pdf")
plot.modwt(structure.dwt)
dev.off()
rna.modwt <- wavMODWT(rna.df)
rna.modwt.df <- as.matrix(rna.modwt)
rna.modwt.label <- data.frame(label = row.names(rna.modwt.df), rna.modwt.df)
rna.modwt.name <- rna.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(rna.modwt.name) <- c("scale", "window", "rna.dwt")
rna.mat <- as.matrix(rna.df)
rna.dwt <- modwt(rna.mat, filter="la8")
pdf("ecoli.500bp.rna.pdf")
plot.modwt(rna.dwt)
dev.off()
gene.modwt <- wavMODWT(gene.df)
gene.modwt.df <- as.matrix(gene.modwt)
gene.modwt.label <- data.frame(label = row.names(gene.modwt.df), gene.modwt.df)
gene.modwt.name <- gene.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
colnames(gene.modwt.name) <- c("scale", "window", "gene.dwt")
gene.mat <- as.matrix(gene.df)
gene.dwt <- modwt(gene.mat, filter="la8")
pdf("ecoli.500bp.gene.pdf")
plot.modwt(gene.dwt)
dev.off()
colnames(window) <- c("chr", "start", "end")
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
library(tidygenomics)
window.score <- genome_intersect(window, score.df, by=c("chr", "start", "end"))
window.score.df <- left_join(window, window.score[,2:4], by=c("window"))
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name, by=c("window", "scale"))
# Keep only rows that carry an sgRNA cut score. is.na() is the proper
# missing-value test; comparing against the string "NA" only appeared to work
# because subset() drops rows where the comparison itself evaluates to NA.
window.temp.gc.structure.rna.gene.sgRNA <- subset(window.temp.gc.structure.rna.gene, !is.na(cut.score))
# Long format: one row per (sgRNA, scale, feature) with its value.
df.melt <- melt(window.temp.gc.structure.rna.gene.sgRNA[,5:12], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
write.table(df, "ecoli.500bp.dwt.features.txt", quote=F, row.names=F, sep="\t")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
# 40468
df.dcast.na <- na.omit(df.dcast)
# 40468
write.table(df.dcast.na, "ecoli.500bp.dwt.features.dcast.txt", quote=F, row.names=F, sep="\t")
–> other features
# salloc -A SYB105 -N 2 -t 4:00:00
#
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
structure <- read.delim("Ecoli.allCas9.structure.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("Ecoli.allCas9.nuc.count.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("Ecoli.allCas9.txt", header=T, sep="\t", stringsAsFactors = F)
score.df <- score[,c(1:2)]
colnames(score.df) <- c("sgRNAID", "cut.score")
# add sgRNA raw data
# One data frame per sgRNA-level feature, columns in the order
# (value, scale, sgRNAID), which the positional rename after the joins below
# depends on. The value columns are named up front; the rename below assigns
# the same names.
# The earlier bare-vector assignments to these three names were dead code
# (overwritten immediately), so each frame is now built in a single step.
structure.df <- data.frame(sgRNA.structure = structure[,2], scale = "sgRNA.raw", sgRNAID = structure[,1], stringsAsFactors = FALSE)
gc.df <- data.frame(sgRNA.gc = nuc[,7], scale = "sgRNA.raw", sgRNAID = nuc[,1], stringsAsFactors = FALSE)
temp.df <- data.frame(sgRNA.temp = nuc[,8], scale = "sgRNA.raw", sgRNAID = nuc[,1], stringsAsFactors = FALSE)
structure.temp <- left_join(structure.df, temp.df, by=c("sgRNAID", "scale"))
structure.temp.gc <- left_join(structure.temp, gc.df, by=c("sgRNAID", "scale"))
score.structure.temp.gc <- left_join(score, structure.temp.gc, by=c("sgRNAID"))
colnames(score.structure.temp.gc) <- c("sgRNAID", "cut.score", "seq", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
## add one-hot encoding of sequence
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
onehot.ind1 <- read.delim("Ecoli.allCas9_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("Ecoli.allCas9_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("Ecoli.allCas9_dep1.txt", header=T, sep=" ")
onehot.dep2 <- read.delim("Ecoli.allCas9_dep2.txt", header=T, sep=" ")
onehot.dep2 <- onehot.dep2[,1:305]
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
onehot$scale <- "sgRNA.raw"
data.onehot <- left_join(score.structure.temp.gc, onehot, by=c("sgRNAID", "scale"))
df.melt <- melt(data.onehot[,c(1,2,4:ncol(data.onehot))], id=c("cut.score", "scale", "sgRNAID"))
df <- na.omit(df.melt)
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.id$value <- as.numeric(df.id$value)
df.id <- df.id[!(is.na(df.id$value) | df.id$value==""), ]
colnames(df.id) <- c("cut.score", "feature.scale", "sgRNAID", "value")
write.table(df.id, "df.id.test.txt", quote=F, row.names=F, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
tensor <- read.delim("Ecoli.allCas9.tensors.melt.txt", header=T, sep="\t")
tensor[is.na(tensor)] <- 0
tensor$scale <- "raw"
tensor.id <- tensor %>% unite(feature.scale, c(position, variable, scale), sep = "")
tensor.id$value <- as.numeric(tensor.id$value)
tensor.id[is.na(tensor.id)] <- 0
write.table(tensor.id, "tensor.id.test", quote=F, row.names=F, sep="\t")
library(dplyr)
library(reshape2)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.id <- read.delim("df.id.test.txt", header=T, sep="\t")
tensor.id <- read.delim("tensor.id.test", header=T, sep="\t")
df.score <- unique(df.id[,c(1,3)])
tensor.score <- inner_join(tensor.id, df.score, by="sgRNAID")
tensor.score.order <- tensor.score[,c(5,2,1,4)]
head(df.id)
head(tensor.score.order)
tensor.df <- rbind(df.id, tensor.score.order)
write.table(tensor.df, "Ecoli.allCas9.raw.onehot.tensor.txt", quote=F, row.names=F, sep="\t")
df.dcast <- tensor.df %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
write.table(df.dcast, "Ecoli.allCas9.raw.onehot.tensor.dcast.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast)
# 126182
df.dcast.na <- na.omit(df.dcast)
write.table(df.dcast.na, "Ecoli.allCas9.raw.onehot.tensor.dcast.na.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast.na)
# 126181
–> combine to generate full feature matrix
# combine regional DWT with other features
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# df <- df.dcast.na
df <- read.delim("Ecoli.allCas9.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
# 126182
df.dcast.na <- read.delim("ecoli.500bp.dwt.features.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep <- df.dcast.na %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.sep.region <- inner_join(df.sep, df.dcast.sep[,c(1,2,4:ncol(df.dcast.sep))], by=c("sgRNA", "ID"))
df.sep.region.id <- df.sep.region %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
# 75267
write.table(df.sep.region.id, "Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional.txt", quote=F, row.names=F, sep="\t")
# Summit
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region")
write.table(df.features, "Ecoli.allCas9.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "Ecoli.allCas9.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.features, "Ecoli.allCas9.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "Ecoli.allCas9.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[,c(3:ncol(df))])
df.score <- as.matrix(data.frame(df[,2]))
colnames(df.score) <- "cut.score"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region")
write.table(df.features, "Ecoli.allCas9.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "Ecoli.allCas9.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/Ecoli.allCas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/Ecoli.allCas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/Submits/submit_full_Ecoli.allCas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/Submits/submit_train_Ecoli.allCas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/Submits/submit_test_Ecoli.allCas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Ecoli.allCas9
# R2 = -0.044506064566064034
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.305635
# spearman correlation
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.3603346
library(tidyr)
library(ggplot2)
# Bar plot of the 25 top-ranked features for cut.score (E. coli regional run).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/normalizedEdgeFiles")
feature <- read.delim("cut.score_Normalize.txt", header=F, sep="\t")
# Rank features by the third column (the value being plotted), largest first.
feature.order <- feature[order(-feature$V3),]
# head() instead of [1:25,]: with fewer than 25 rows the index form pads the
# table with all-NA rows, which would then appear in the plot.
feature.order.top <- head(feature.order, 25)
# Keep the full feature name for the axis labels before V1 is split.
feature.order.top$id <- feature.order.top$V1
df <- separate(feature.order.top, V1, c("feature", "type"))
pdf("ecoli.allfeatures.region.importance.pdf")
# print() so the figure is drawn even when this is run through source().
print(ggplot(df, aes(x = reorder(id, -V3), y = V3)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)))
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/normalizedEdgeFiles/ecoli.allfeatures.region.importance.pdf /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/.
library(tidyr)
library(ggplot2)
# Bar plot of the 25 features with the highest MedianImportance across the
# fold runs of the E. coli regional model.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/cut.score/foldRuns/results/")
feature <- read.delim("importanceScores.txt", header=T, sep="\t")
feature.order <- feature[order(-feature$MedianImportance),]
# head() instead of [1:25,]: with fewer than 25 rows the index form pads the
# table with all-NA rows, which would then appear in the plot.
feature.order.top <- head(feature.order, 25)
# Keep the full feature name for the axis labels before Feature is split.
feature.order.top$id <- feature.order.top$Feature
df <- separate(feature.order.top, Feature, c("feature", "type"))
pdf("ecoli.allfeatures.region.importanceScores.pdf")
# print() so the figure is drawn even when this is run through source().
print(ggplot(df, aes(x = reorder(id, -MedianImportance), y = MedianImportance)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)))
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/cut.score/foldRuns/results/ecoli.allfeatures.region.importanceScores.pdf /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/.
–> remove correlated features and re-run model…
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
df.features <- as.matrix(df[,c(3:ncol(df))])
df.cor <- cor(df.features)
write.table(df.cor, "Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional.correlation.txt", quote=F, row.names=F, sep="\t")
######### find highly correlated variables using caret
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.cor <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional.correlation.txt", header=T, sep="\t")
df.cor[is.na(df.cor)] <- 0
df.num <- as.matrix(df.cor)
features <- colnames(df.num)
rownames(df.num) <- features
# 1655
library(caret)
findCorrelation(df.num, cutoff = .4, exact = TRUE, names = TRUE)
# 1208
findCorrelation(df.num, cutoff = .6, exact = TRUE, names = TRUE)
#
findCorrelation(df.num, cutoff = .8, exact = TRUE, names = TRUE)
#
findCorrelation(df.num, cutoff = .9, exact = TRUE, names = TRUE)
#
### remove from iRF run???
df.num.remove <- findCorrelation(df.num, cutoff = .9, exact = TRUE, names = TRUE)
write.table(df.num.remove, "ecoli.allCas9.regional.features.highlycorrelated.txt", quote=F, row.names=F, sep="\t")
# Remove the highly correlated features (df.num.remove) from the iRF feature
# table and write the reduced matrices, with and without the sgRNA ID column.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region/")
df <- read.delim("Ecoli.allCas9.features.txt", header=T, sep="\t")
# Logical indexing instead of -which(): when no name matches, -which() yields
# an empty index and drops every column instead of none.
df.subset <- df[, !(names(df) %in% df.num.remove), drop=FALSE]
# Column 1 is the sgRNAID label; everything after it is a feature column.
# drop=FALSE keeps a matrix with its column name even if one feature remains.
df.mat <- as.matrix(df.subset[, -1, drop=FALSE])
# Name the ID column explicitly: as.data.frame(df$sgRNAID) would write the
# header as "df$sgRNAID" instead of "sgRNAID".
df.mat.id <- data.frame(sgRNAID = df$sgRNAID, df.mat, check.names = FALSE)
write.table(df.mat.id, "Ecoli.allCas9.noncor.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "Ecoli.allCas9.noncor.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "Ecoli.allCas9.noncor.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes: generate the iRF submission scripts for the de-correlated feature set.
# The inputs live in region/, the run itself goes into region.noncor/.
region_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region
noncor_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region.noncor
# No earlier step visible in these notes creates region.noncor, so make it
# here; -p keeps this harmless if it already exists.
mkdir -p "$noncor_dir"
module load python/3.7-anaconda3
# Only launch the builder if we really are inside the run directory, since it
# writes its outputs relative to the current directory.
cd "$noncor_dir" &&
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
    --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
    --NumTrees 1000 --NumIterations 5 --RunName Ecoli.allCas9 --bypass --Prediction \
    "$region_dir/Ecoli.allCas9.noncor.features.txt" \
    "$region_dir/Ecoli.allCas9.score.txt"
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region.noncor
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region.noncor/Submits/submit_full_Ecoli.allCas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region.noncor/Submits/submit_train_Ecoli.allCas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region.noncor/Submits/submit_test_Ecoli.allCas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region.noncor
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Ecoli.allCas9
# R2 = -0.04313654551715315
# pearson correlation
# Observed cut.score vs. predicted value for a single test split (fold9 / Set4).
# The two files are compared row by row, so they must be in the same order.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region.noncor/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Ecoli.allCas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.3050326
library(tidyr)
library(ggplot2)
# Bar plot of the 25 features with the highest MedianImportance.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region.noncor/cut.score/foldRuns/results/")
feature <- read.delim("importanceScores.txt", header=T, sep="\t")
feature.order <- feature[order(-feature$MedianImportance),]
# NOTE(review): 1:25 yields NA rows if the table has fewer than 25 features.
feature.order.top <- feature.order[1:25,]
# Keep the full feature name in `id`; separate() below replaces Feature with two pieces, and only `id` is plotted.
feature.order.top$id <- feature.order.top$Feature
df <- separate(feature.order.top, Feature, c("feature", "type"))
pdf("ecoli.allfeatures.region.noncor.importanceScores.pdf")
ggplot(df, aes(x = reorder(id, -MedianImportance), y = MedianImportance)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region.noncor/cut.score/foldRuns/results/ecoli.allfeatures.region.noncor.importanceScores.pdf "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/."
–> GATC motif
# Environment for the GATC motif search, followed by two attempts (seqkit, blastn) to locate GATC in the genome.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
#conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/envs/summit-test
## GATC motif
# find locations in genome
## try seqkit... can't get to load
# NOTE(review): git clone fetches the source repository and no build/install step is shown,
# so seqkit/seqkit below is presumably not an executable — likely why it does not load.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/
git clone https://github.com/shenwei356/seqkit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/seqkit/seqkit locate -ip "GATC" genome/GCF_000005845.2_ASM584v2_genomic.fna > ecoli.gatc.coord.txt
## try blastn... but really short sequences so not getting any output??
# NOTE(review): -db is given the FASTA path; this assumes a BLAST database with that name was built
# beforehand (not shown here). A 4-nt query is probably too short for -task blastn; BLAST+ offers
# -task blastn-short for short queries — confirm before retrying.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/ncbi-blast-2.11.0+/bin/blastn -query gatc.fasta -db genome/GCF_000005845.2_ASM584v2_genomic.fna -out ecoli.gatc.blast.tab -dust no -outfmt 6 -task blastn -num_threads 10 -perc_identity 100 -ungapped
## try fastaregex
# Third attempt: regex search for GATC with fastaRegexFinder.py, then overlap the hits with the 500 bp windows.
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/
#wget https://github.com/dariober/bioinformatics-cafe/blob/master/fastaRegexFinder/fastaRegexFinder.py?raw=true -O fastaRegexFinder.py
# The cd above is commented out, so the script is addressed by absolute path;
# a bare relative name would be resolved against whatever directory is current.
chmod a+x /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/fastaRegexFinder.py
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/fastaRegexFinder.py -q -f genome/GCF_000005845.2_ASM584v2_genomic.fna -r 'GATC' > ecoli.gatc.bed
# NOTE(review): the -b input is ecoli.gatc.coord.bed, but the command above writes ecoli.gatc.bed
# (and the seqkit attempt writes ecoli.gatc.coord.txt). Confirm which file is intended / how
# ecoli.gatc.coord.bed was produced.
bedtools intersect -wo -a ecoli.500bp.windows.bed -b ecoli.gatc.coord.bed > ecoli.gatc.windows500.bed
–> IPD ratios
# Convert the IPD-ratio GFF into a 4-column BED-like table, intersect it with the
# 500 bp windows (bedtools, shell), and average the value per window.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("GSM3264688_Ecoli.gff", header=F, sep="\t")
# Drop the first four rows (presumably GFF header/pragma lines — confirm).
df2 <- df[5:nrow(df),]
library(dplyr)
library(tidyr)
# V9 is split on ';' into three fields; the third is then split on '=' into its tag and value.
df.sep <- df2 %>% separate(V9, c("coverage", "context", "IPD"), sep=";")
df.ipd <- df.sep %>% separate(IPD, c("IPD", "IPD.value"), sep="=")
df.ipd$chr <- "NC_000913.3"
# Positional selection: with a 9-column GFF these are chr (13), V4, V5 and IPD.value (12).
# NOTE(review): V4/V5 are written unchanged; GFF coordinates are normally 1-based inclusive while
# BED is 0-based half-open — confirm whether the 1-bp shift matters here.
df.coord <- df.ipd[,c(13,4,5,12)]
write.table(df.coord, "GSM3264688_Ecoli.coord.bed", quote=F, row.names=F, col.names=F, sep="\t")
# (shell) overlap the per-site values with the 500 bp windows
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
bedtools intersect -wo -a ecoli.500bp.windows.bed -b GSM3264688_Ecoli.coord.bed > ecoli.ipd.windows500.bed
# (R) mean of V7 per window (V1, V2, V3), one row per window.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.ipd.windows500.bed", header=F, sep="\t")
# NOTE(review): the column is called avg.fpkm although V7 is presumably the IPD value here
# (name looks carried over from the RNA-seq step); the name ends up in the output header.
window.df <- window %>% group_by(V1, V2, V3) %>% mutate(avg.fpkm = mean(V7))
# Column 9 is the avg.fpkm column added above, assuming the intersect output has 8 columns.
window.uniq <- unique(window.df[,c(1:3,9)])
write.table(window.uniq, "ecoli.ipd.average.windows500.bed", quote=F, row.names=F, sep="\t")
–> run DWT on genome-wide patterns
# Load the per-window annotation tables, the window list and the sgRNA scores
# used to build the genome-wide DWT features.
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Files read with header=F get V1..Vn column names; the joins below rely on V1-V3 being the window coordinates.
gatc <- read.delim("ecoli.gatc.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
ipd <- read.delim("ecoli.ipd.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
gene <- read.delim("ecoli.gene.windows500.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.delim("500bp.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("nucleotide_counts_500bp_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.delim("ecoli.rnaseq.average.windows500.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.delim("ecoli.500bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
#score <- read.delim("window500.score.avg.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
# The file's own header is replaced; this requires exactly 11 columns.
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Keep chr, start, end, sgRNA and cut.score.
score.df <- score[,c(1:4,8)]
# Window coordinates renamed to V1-V3 so they can be joined with the header-less tables.
window.v <- window[,1:3]
colnames(window.v) <- c("V1", "V2", "V3")
# Project every annotation onto the full window list so each signal has one
# value per 500 bp window (left_join keeps windows that have no hit).
gatc.win <- left_join(window.v, gatc, by=c("V1", "V2", "V3"))
# Count hits per window BEFORE zero-filling: a window without any hit still has
# one (all-NA) row after the left join, so n() would report 1 instead of 0.
# V4 is the first column contributed by the hit table; it is NA for windows
# without a hit (assumes the hit table's own V4 is never NA).
gatc.bin <- gatc.win %>% group_by(V1, V2, V3) %>% mutate(gatc.count = sum(!is.na(V4)))
gatc.win[is.na(gatc.win)] <- 0
ipd.win <- left_join(window.v, ipd, by=c("V1", "V2", "V3"))
ipd.win[is.na(ipd.win)] <- 0
gene.win <- left_join(window.v, gene, by=c("V1", "V2", "V3"))
# Same counting rule as for GATC: windows with no overlapping gene get 0.
gene.bin <- gene.win %>% group_by(V1, V2, V3) %>% mutate(gene.count = sum(!is.na(V4)))
gene.win[is.na(gene.win)] <- 0
rnaseq.win <- left_join(window.v, rnaseq, by=c("V1", "V2", "V3"))
rnaseq.win[is.na(rnaseq.win)] <- 0
# One row per window; columns are selected by name so the result does not
# depend on how many columns the hit tables carry.
gatc.count <- unique(gatc.bin[,c("V1", "V2", "V3", "gatc.count")])
gatc.df <- gatc.count$gatc.count
# Column 4 is the first non-coordinate column of the joined table (the per-window average).
ipd.df <- ipd.win[,4]
gene.count <- unique(gene.bin[,c("V1", "V2", "V3", "gene.count")])
gene.df <- gene.count$gene.count
# Positional picks from the pre-computed per-window tables.
# NOTE(review): by the variable names, nuc[,7] is GC content and nuc[,8] temperature — confirm against the file header.
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq.win[,4]
# Run DWT instead of CWT
# Helper: run wavMODWT() (default settings) on one per-window signal and return
# the result in long form with columns scale, window and `value.name`.
# The seven signals below previously each repeated these same four steps.
modwtLong <- function(signal, value.name) {
  coef.mat <- as.matrix(wavMODWT(signal))
  coef.label <- data.frame(label = row.names(coef.mat), coef.mat)
  # Row labels are split at runs of non-alphanumeric characters; the first two
  # pieces become "scale" and "window" (both character columns).
  coef.name <- coef.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
  colnames(coef.name) <- c("scale", "window", value.name)
  coef.name
}
temp.modwt.name <- modwtLong(temp.df, "temp.dwt")
gc.modwt.name <- modwtLong(gc.df, "gc.dwt")
structure.modwt.name <- modwtLong(structure.df, "structure.dwt")
rna.modwt.name <- modwtLong(rna.df, "rna.dwt")
gene.modwt.name <- modwtLong(gene.df, "gene.dwt")
ipd.modwt.name <- modwtLong(ipd.df, "ipd.dwt")
gatc.modwt.name <- modwtLong(gatc.df, "gatc.dwt")
# Diagnostic plots only: a second MODWT (wavelets::modwt, "la8" filter) of the
# IPD and GATC signals, written to PDF. These objects do not feed the feature table.
library(wavelets)
ipd.mat <- as.matrix(ipd.df)
ipd.dwt <- modwt(ipd.mat, filter="la8")
pdf("ecoli.500bp.ipd.pdf")
plot.modwt(ipd.dwt)
dev.off()
gatc.mat <- as.matrix(gatc.df)
gatc.dwt <- modwt(gatc.mat, filter="la8")
pdf("ecoli.500bp.gatc.pdf")
plot.modwt(gatc.dwt)
dev.off()
# Attach sgRNA scores and all DWT coefficients to the windows, keep windows that
# contain an sgRNA, and reshape to one row per sgRNA with one column per feature+scale.
colnames(window) <- c("chr", "start", "end")
# Window ids are 0-based and stored as character so they can be joined to the
# character "window" column of the *.modwt.name tables
# (NOTE(review): assumes the MODWT row labels are 0-based — confirm).
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
library(tidygenomics)
window.score <- genome_intersect(score.df, window, by=c("chr", "start", "end"))
# Columns 2:4 of the intersect result are expected to be sgRNA, cut.score and window (positional — verify if inputs change).
window.score.df <- left_join(window, window.score[,2:4], by=c("window"))
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name, by=c("window", "scale"))
# Keep only windows that overlap an sgRNA. Test for missingness explicitly:
# comparing with the string "NA" only worked because subset() drops rows whose
# condition evaluates to NA.
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, !is.na(cut.score))
# Columns 5:14 are the three id columns named in melt() plus the seven *.dwt columns (positional).
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,5:14], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
#write.table(df, "ecoli.500bp.dwt.features2.txt", quote=F, row.names=F, sep="\t")
# Feature name + scale become the wide column names; duplicates per sgRNA are averaged.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40468
write.table(df.dcast.na, "ecoli.500bp.dwt.features2.dcast.txt", quote=F, row.names=F, sep="\t")
–> combine to generate full feature matrix
# combine regional DWT with other features
# Joins the per-sgRNA DWT table to the sequence-feature table on (sgRNA, ID) and
# rebuilds the sgRNAID key as sgRNA_ID_type.
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast.na <- read.delim("ecoli.500bp.dwt.features2.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# The DWT table's sgRNA field has two '_'-separated parts; the feature table's sgRNAID has three.
df.dcast.sep <- df.dcast.na %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df <- read.delim("Ecoli.allCas9.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
# 126182
# Column 3 of the DWT table (cut.score after the split) is left out of the join input.
# inner_join keeps only sgRNAs present in both tables.
df.sep.region <- inner_join(df.sep, df.dcast.sep[,c(1,2,4:ncol(df.dcast.sep))], by=c("sgRNA", "ID"))
df.sep.region.id <- df.sep.region %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
# 118143
write.table(df.sep.region.id, "Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.txt", quote=F, row.names=F, sep="\t")
# Session setup before writing the iRF input files for the regional feature set.
# Summit
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# -p: this directory is already used earlier in this file, and plain mkdir errors out when it exists.
mkdir -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region
# Start an interactive R session for the statements that follow.
R
# Split the combined table into iRF inputs: features (X) and cut.score (Y),
# written with sample IDs and again without them.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
# Column 1 is used as the ID and column 2 as the score (it is named cut.score below); the rest are features.
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
# NOTE(review): these files are written to all.features/region, but the later steps in this file
# read region.dwt.features.txt / region.dwt.score.txt from all.features/region2 — confirm the files
# were copied or that the directory here should be region2.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region")
# The same tables are written under both the plain and the "_overlap" names.
write.table(df.features, "region.dwt.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "region.dwt.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.features, "region.dwt.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "region.dwt.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[,c(3:ncol(df))])
df.score <- as.matrix(data.frame(df[,2]))
colnames(df.score) <- "cut.score"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region")
write.table(df.features, "region.dwt.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "region.dwt.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
–> non-correlated features
# Compute the feature-by-feature correlation matrix for the regional2 table.
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
# Columns 1-2 (ID and score in the steps that use this table) are excluded; only feature columns are correlated.
df.features <- as.matrix(df[,c(3:ncol(df))])
df.cor <- cor(df.features)
# Written without row names; the reader restores them from the column names.
write.table(df.cor, "Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.correlation.txt", quote=F, row.names=F, sep="\t")
######### find highly correlated variables using caret
# Reload the regional2 correlation matrix and save the names of the features
# caret::findCorrelation() flags at a 0.9 cutoff.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.cor <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.correlation.txt", header=T, sep="\t")
# Missing correlations are set to 0 before calling findCorrelation().
df.cor[is.na(df.cor)] <- 0
df.num <- as.matrix(df.cor)
# The file has no row names; the column names are reused (rows and columns come from the same cor() call).
features <- colnames(df.num)
rownames(df.num) <- features
library(caret)
### remove from iRF run???
df.num.remove <- findCorrelation(df.num, cutoff = .9, exact = TRUE, names = TRUE)
write.table(df.num.remove, "region.dwt.features.highlycorrelated.txt", quote=F, row.names=F, sep="\t")
# Remove the features listed in df.num.remove from the regional2 feature table
# and write the three file variants (with IDs, "_overlap", "_overlap_noSampleIDs").
# NOTE(review): region.dwt.features.txt is written to all.features/region earlier in this file,
# and region2 is only created further down — confirm the file exists here before running.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/")
df <- read.delim("region.dwt.features.txt", header=T, sep="\t")
# Use a logical mask instead of -which(): when no column name matches,
# -which(...) is integer(0) and would select zero columns instead of all of them.
df.subset <- df[ , !(names(df) %in% df.num.remove)]
# Column 1 is skipped here and the IDs are re-attached from df$sgRNAID below.
df.mat <- as.matrix(df.subset[,2:ncol(df.subset)])
# NOTE(review): as.data.frame(df$sgRNAID) presumably names the ID column "df$sgRNAID" in the written header — confirm the iRF scripts do not depend on that name.
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
write.table(df.mat.id, "region.dwt.noncor.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "region.dwt.noncor.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "region.dwt.noncor.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# iRF run on the non-correlated regional2 feature set (run name region.dwt).
# The "# Andes" / "# Summit" labels mark which cluster each group of commands is run on.
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
# -p: the R step before this one already works inside region2, so the directory may exist;
# plain mkdir would fail in that case.
mkdir -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2
# NOTE(review): region.dwt.score.txt is written to all.features/region earlier in this file — confirm a copy exists in region2.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName region.dwt --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/region.dwt.noncor.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/region.dwt.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/Submits/submit_full_region.dwt_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/Submits/submit_train_region.dwt_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/Submits/submit_test_region.dwt_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2
# Post-processing of the finished run; the recorded R2 is on the next line.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt region.dwt
# R2 = -0.0638904038483941
# pearson correlation
# Observed cut.score vs. predicted value for a single test split (fold9 / Set4).
# The two files are compared row by row, so they must be in the same order.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("region.dwt_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.2842348
# spearman correlation
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.326081
library(tidyr)
library(ggplot2)
# Bar plot of the 25 features with the highest MedianImportance.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/cut.score/foldRuns/results/")
feature <- read.delim("importanceScores.txt", header=T, sep="\t")
feature.order <- feature[order(-feature$MedianImportance),]
# NOTE(review): 1:25 yields NA rows if the table has fewer than 25 features.
feature.order.top <- feature.order[1:25,]
# Keep the full feature name in `id`; separate() below replaces Feature with two pieces, and only `id` is plotted.
feature.order.top$id <- feature.order.top$Feature
df <- separate(feature.order.top, Feature, c("feature", "type"))
pdf("ecoli.allfeatures.region2.importanceScores.pdf")
ggplot(df, aes(x = reorder(id, -MedianImportance), y = MedianImportance)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/cut.score/foldRuns/results/ecoli.allfeatures.region2.importanceScores.pdf "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/."
# correlation - by Cas9 type
# Re-attach sample IDs to the Set4 predictions and compute the Pearson correlation
# overall and within each group parsed from the ID suffix.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("region.dwt_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
# cbind() pairs rows by position: IDs, predictions and observed scores must be in the same order and of equal length.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
# The third '_'-separated piece of sgRNAID is used as the group label.
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
# 23117
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.2839935
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.7182784
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6612769
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.4593487
# combine regional DWT with other features
# DWT-only variant: only the first four columns of the sequence-feature table
# (sgRNA, ID, type and the next column) are kept before joining the DWT features on (sgRNA, ID).
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast.na <- read.delim("ecoli.500bp.dwt.features2.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep <- df.dcast.na %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df <- read.delim("Ecoli.allCas9.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
# 126182
# NOTE(review): column 4 of df.sep is presumably cut.score (later steps treat column 2 of the written table as the score) — confirm.
# Column 3 of the DWT table (cut.score after the split) is left out of the join input.
df.sep.region <- inner_join(df.sep[,1:4], df.dcast.sep[,c(1,2,4:ncol(df.dcast.sep))], by=c("sgRNA", "ID"))
df.sep.region.id <- df.sep.region %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
# 118143
write.table(df.sep.region.id, "Ecoli.allCas9.DWT.regional.txt", quote=F, row.names=F, sep="\t")
# Session setup, then split the DWT-only table into iRF inputs: features (X) and
# cut.score (Y), written with sample IDs and again without them.
# Summit
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("Ecoli.allCas9.DWT.regional.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
# Column 1 is used as the ID and column 2 as the score (it is named cut.score below); the rest are features.
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt")
# The same tables are written under both the plain and the "_overlap" names.
write.table(df.features, "region.dwtONLY.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "region.dwtONLY.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.features, "region.dwtONLY.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "region.dwtONLY.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[,c(3:ncol(df))])
df.score <- as.matrix(data.frame(df[,2]))
colnames(df.score) <- "cut.score"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt")
write.table(df.features, "region.dwtONLY.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "region.dwtONLY.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# iRF run on the DWT-only feature set (run name region.dwtONLY).
# The "# Andes" / "# Summit" labels mark which cluster each group of commands is run on.
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
# -p: this directory is already created during input preparation earlier in this file;
# plain mkdir fails when it exists.
mkdir -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName region.dwtONLY --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt/region.dwtONLY.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt/region.dwtONLY.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt/Submits/submit_full_region.dwtONLY_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt/Submits/submit_train_region.dwtONLY_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt/Submits/submit_test_region.dwtONLY_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt
# Post-processing of the finished run; the recorded R2 is on the next line.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt region.dwtONLY
# R2 = 0.0027176115871570625
# pearson correlation
# Observed cut.score vs. predicted value for a single test split (fold9 / Set4).
# The two files are compared row by row, so they must be in the same order.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("region.dwtONLY_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.1857155
# spearman correlation
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.1903883
library(tidyr)
library(ggplot2)
# Bar plot of the 25 features with the highest MedianImportance.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt/cut.score/foldRuns/results/")
feature <- read.delim("importanceScores.txt", header=T, sep="\t")
feature.order <- feature[order(-feature$MedianImportance),]
# NOTE(review): 1:25 yields NA rows if the table has fewer than 25 features.
feature.order.top <- feature.order[1:25,]
# Keep the full feature name in `id`; separate() below replaces Feature with two pieces, and only `id` is plotted.
feature.order.top$id <- feature.order.top$Feature
df <- separate(feature.order.top, Feature, c("feature", "type"))
pdf("ecoli.dwt.regional.importanceScores.pdf")
ggplot(df, aes(x = reorder(id, -MedianImportance), y = MedianImportance)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt/cut.score/foldRuns/results/ecoli.dwt.regional.importanceScores.pdf "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/."
# Build the Cas9-only subset of the regional2 table (shell), then split it into
# iRF inputs: features (X) and cut.score (Y), with and without sample IDs (R).
# Summit
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# Keep every line containing the literal "_Cas9". The other group suffixes used in this file
# (_eSpCas9, _recACas9) do not contain that substring, so they are excluded.
# NOTE(review): the match is not anchored to the ID column — it would also keep a line where "_Cas9" appears elsewhere.
grep '_Cas9' Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.txt > Ecoli.cas9only.DWT.raw.onehot.tensor.dcast.regional2.txt
# The first line of the full table is saved separately and prepended to the filtered rows.
awk 'NR==1{print $0}' Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.txt > Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.header.txt
cat Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.header.txt Ecoli.cas9only.DWT.raw.onehot.tensor.dcast.regional2.txt > Ecoli.cas9only.DWT.raw.onehot.tensor.dcast.regional2.header.txt
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("Ecoli.cas9only.DWT.raw.onehot.tensor.dcast.regional2.header.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
# Column 1 is used as the ID and column 2 as the score (it is named cut.score below); the rest are features.
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only")
# The same tables are written under both the plain and the "_overlap" names.
write.table(df.features, "cas9only.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "cas9only.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.features, "cas9only.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "cas9only.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[,c(3:ncol(df))])
df.score <- as.matrix(data.frame(df[,2]))
colnames(df.score) <- "cut.score"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only")
write.table(df.features, "cas9only.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "cas9only.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName cas9only --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/cas9only.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/cas9only.score.txt
# Summit: submit the generated job scripts for the cas9only run.
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
cd "${cas9only_dir}"
module load python/3.7.0-anaconda3-5.3.0
# full
bsub "${cas9only_dir}/Submits/submit_full_cas9only_0.sh"
# train
bsub "${cas9only_dir}/Submits/submit_train_cas9only_0.sh"
# The test submission must wait until the train submissions have finished.
# test
bsub "${cas9only_dir}/Submits/submit_test_cas9only_0.sh"
# Andes: summarise the cas9only run (prediction accuracy metrics).
module load python/3.7-anaconda3
# A single cd into the run directory is enough (the earlier extra cd into the
# parent directory was immediately overridden). Post-processing is chained with
# && so it is skipped if the directory cannot be entered.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only &&
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt cas9only
# R2 = 0.23775829195102294
# Correlation between the observed cut scores and the predictions
# for the fold9 / Set4 test split of the cas9only run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("cas9only_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
# pearson correlation
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.4798756
# spearman correlation
cor(y$cut.score, pred$Predictions., method = "spearman")
# 0.4685236
library(tidyr)
library(ggplot2)
# Bar plot of the 25 features with the highest MedianImportance,
# written to ecoli.cas9only.importanceScores.pdf in the results directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/cut.score/foldRuns/results/")
feature <- read.delim("importanceScores.txt", header = TRUE, sep = "\t")
feature.order <- feature[order(-feature$MedianImportance), ]
# head() instead of [1:25, ]: indexing past the last row would append all-NA
# rows when the table holds fewer than 25 features.
feature.order.top <- head(feature.order, 25)
# Keep the full feature name for the x axis; separate() below splits the
# Feature column into "feature" and "type".
feature.order.top$id <- feature.order.top$Feature
df <- separate(feature.order.top, Feature, c("feature", "type"))
importance.plot <- ggplot(df, aes(x = reorder(id, -MedianImportance), y = MedianImportance)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
pdf("ecoli.cas9only.importanceScores.pdf")
# Explicit print(): a ggplot object is only drawn implicitly by top-level
# auto-printing, so without it the PDF is empty when this code is source()d
# or run inside a function or loop.
print(importance.plot)
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/cut.score/foldRuns/results/ecoli.cas9only.importanceScores.pdf /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/.
# --> feature-specific again
#/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/cas9only.features_overlap_noSampleIDs.txt
#/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/cas9only.features_overlap.txt
#/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/cas9only.features.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/

# Field lists (cut -f syntax) of each feature encoding.
#   *_noid : positions in cas9only.features_overlap_noSampleIDs.txt
#   *_ids  : positions in cas9only.features_overlap.txt / cas9only.features.txt,
#            which carry one extra leading column (presumably the sample ID),
#            so every group sits exactly one field further right.
# NOTE(review): the ID-bearing raw slice used to be 1581-1582, one field short
# of the no-ID slice 1580-1582 (field 1583 was selected by no encoding at all).
# It is 1581-1583 here so both table flavours hold the same features; results
# recorded from runs on the old raw table should be regenerated.
raw_noid=1580-1582
raw_ids=1581-1583
dwt_noid=1588-1686
dwt_ids=1589-1687
onehot_noid=1-51,114-129,192-207,270-285,348-363,426-441,504-519,582-597,660-675,738-753,878-897,1022-1041,1104-1123,1186-1205,1268-1283,1346-1361,1424-1439,1502-1517,1583-1587
onehot_ids=2-52,115-130,193-208,271-286,349-364,427-442,505-520,583-598,661-676,739-754,879-898,1023-1042,1105-1124,1187-1206,1269-1284,1347-1362,1425-1440,1503-1518,1584-1588
tensor_noid=52-113,130-191,208-269,286-347,364-425,442-503,520-581,598-659,676-737,754-877,898-1021,1042-1103,1124-1185,1206-1267,1284-1345,1362-1423,1440-1501,1518-1579
tensor_ids=53-114,131-192,209-270,287-348,365-426,443-504,521-582,599-660,677-738,755-878,899-1022,1043-1104,1125-1186,1207-1268,1285-1346,1363-1424,1441-1502,1519-1580

#######################################
# Write the three feature tables of one encoding subset.
# Reads the three cas9only.features* tables from the current directory.
# Arguments:
#   $1 - subset label used in the output file names (e.g. raw, dwt)
#   $2 - cut field list for the no-sample-ID table
#   $3 - cut field list for the ID-bearing tables (field 1 is always kept)
# Outputs:
#   cas9only.<label>.features_overlap_noSampleIDs.txt
#   cas9only.<label>.features_overlap.txt
#   cas9only.<label>.features.txt
#######################################
write_feature_subset() {
  local label=$1 noid_fields=$2 id_fields=$3
  cut -f "${noid_fields}" cas9only.features_overlap_noSampleIDs.txt > "cas9only.${label}.features_overlap_noSampleIDs.txt"
  cut -f "1,${id_fields}" cas9only.features_overlap.txt > "cas9only.${label}.features_overlap.txt"
  cut -f "1,${id_fields}" cas9only.features.txt > "cas9only.${label}.features.txt"
}

# raw
write_feature_subset raw "${raw_noid}" "${raw_ids}"
# dwt
write_feature_subset dwt "${dwt_noid}" "${dwt_ids}"
# onehot
write_feature_subset onehot "${onehot_noid}" "${onehot_ids}"
# tensor
write_feature_subset tensor "${tensor_noid}" "${tensor_ids}"
# Andes: one run directory per single-encoding subset, then generate the iRF
# job scripts for that subset inside it.
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
module load python/3.7-anaconda3
for subset in raw dwt onehot tensor; do
  # -p: no error when the directory is already there (e.g. when re-running).
  mkdir -p "${cas9only_dir}/${subset}"
  # Build only after a successful cd, so a failed cd cannot make the builder
  # run inside the previous subset's directory.
  cd "${cas9only_dir}/${subset}" &&
    python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName "cas9only.${subset}" --bypass --Prediction "${cas9only_dir}/cas9only.${subset}.features.txt" "${cas9only_dir}/cas9only.score.txt"
done
# Summit: submit the jobs of the four single-encoding runs.
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
cd "${cas9only_dir}"
module load python/3.7.0-anaconda3-5.3.0
# full, then train (each stage in the order raw, dwt, onehot, tensor)
for stage in full train; do
  for subset in raw dwt onehot tensor; do
    bsub "${cas9only_dir}/${subset}/Submits/submit_${stage}_cas9only.${subset}_0.sh"
  done
done
# once the train submissions are done run the test submissions
# test
for subset in raw dwt onehot tensor; do
  bsub "${cas9only_dir}/${subset}/Submits/submit_test_cas9only.${subset}_0.sh"
done
# Andes: post-process each single-encoding run from inside its run directory.
module load python/3.7-anaconda3
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
for subset in raw dwt onehot tensor; do
  cd "${cas9only_dir}/${subset}"
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt "cas9only.${subset}"
done
# Recorded results:
#   raw:    R2 = 0.04068614923376265
#   dwt:    R2 = -0.12608763158026934
#   onehot: R2 = 0.1912168348378202
#   tensor: R2 = 0.23839846571564366
# pearson correlation between observed cut scores and predictions
# (fold9 / Set4 test split) for each single-encoding run
cas9only.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only"
for (subset in c("raw", "dwt", "onehot", "tensor")) {
  setwd(file.path(cas9only.dir, subset, "cut.score/foldRuns/fold9/Runs/Set4"))
  pred <- read.delim(paste0("cas9only.", subset, "_Set4_test.prediction"), header = TRUE, sep = "\t")
  y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
  # print() is needed because results are not auto-printed inside a loop
  print(cor(y$cut.score, pred$Predictions.))
}
# Recorded results:
#   raw:    0.1983272
#   dwt:    0.1176847
#   onehot: 0.4385451
#   tensor: 0.4780769
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/

# Field lists (cut -f syntax) of the raw and onehot encodings.
#   *_noid : positions in cas9only.features_overlap_noSampleIDs.txt
#   *_ids  : positions in the ID-bearing tables, which carry one extra leading
#            column (presumably the sample ID), i.e. every group is shifted by +1.
# NOTE(review): the ID-bearing raw slice used to be 1581-1582, one field short
# of the no-ID slice 1580-1582. It is 1581-1583 here so both table flavours
# hold the same features; results recorded from the old tables should be regenerated.
raw_noid=1580-1582
raw_ids=1581-1583
onehot_noid=1-51,114-129,192-207,270-285,348-363,426-441,504-519,582-597,660-675,738-753,878-897,1022-1041,1104-1123,1186-1205,1268-1283,1346-1361,1424-1439,1502-1517,1583-1587
onehot_ids=2-52,115-130,193-208,271-286,349-364,427-442,505-520,583-598,661-676,739-754,879-898,1023-1042,1105-1124,1187-1206,1269-1284,1347-1362,1425-1440,1503-1518,1584-1588

#######################################
# Write the three feature tables of one encoding subset.
# Reads the three cas9only.features* tables from the current directory.
# Arguments:
#   $1 - subset label used in the output file names
#   $2 - cut field list for the no-sample-ID table
#   $3 - cut field list for the ID-bearing tables (field 1 is always kept)
#######################################
write_feature_subset() {
  local label=$1 noid_fields=$2 id_fields=$3
  cut -f "${noid_fields}" cas9only.features_overlap_noSampleIDs.txt > "cas9only.${label}.features_overlap_noSampleIDs.txt"
  cut -f "1,${id_fields}" cas9only.features_overlap.txt > "cas9only.${label}.features_overlap.txt"
  cut -f "1,${id_fields}" cas9only.features.txt > "cas9only.${label}.features.txt"
}

# raw + onehot
# (cut emits the selected fields in file order, whatever the order of the list)
write_feature_subset raw.onehot "${onehot_noid},${raw_noid}" "${onehot_ids},${raw_ids}"
# Andes: create the raw.onehot run directory and generate its iRF job scripts.
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
# -p: no error when the directory already exists (e.g. when re-running).
mkdir -p "${cas9only_dir}/raw.onehot"
module load python/3.7-anaconda3
# Build only after a successful cd so the builder never runs from another directory.
cd "${cas9only_dir}/raw.onehot" &&
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName cas9only.raw.onehot --bypass --Prediction "${cas9only_dir}/cas9only.raw.onehot.features.txt" "${cas9only_dir}/cas9only.score.txt"
# Summit
# (the variable is set again because this part runs in a different login session)
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
cd "${cas9only_dir}"
module load python/3.7.0-anaconda3-5.3.0
# full
bsub "${cas9only_dir}/raw.onehot/Submits/submit_full_cas9only.raw.onehot_0.sh"
# train
bsub "${cas9only_dir}/raw.onehot/Submits/submit_train_cas9only.raw.onehot_0.sh"
# once the train submissions are done run the test submissions
# test
bsub "${cas9only_dir}/raw.onehot/Submits/submit_test_cas9only.raw.onehot_0.sh"
# Andes
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
module load python/3.7-anaconda3
cd "${cas9only_dir}/raw.onehot" &&
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt cas9only.raw.onehot
# R2 = 0.1920748158585695
# pearson correlation between observed cut scores and predictions
# (raw.onehot run, fold9 / Set4 test split)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/raw.onehot/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("cas9only.raw.onehot_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.438235
# Reference: column selections used for each single feature encoding.
# (Kept as comments: run as-is, the label-prefixed one-line form would truncate the output files.)
# NOTE(review): the ID-bearing RAW slice 1,1581-1582 is one field narrower than the no-ID slice 1580-1582 - confirm which is intended.
# RAW
#   cut -f 1580-1582 cas9only.features_overlap_noSampleIDs.txt > cas9only.raw.features_overlap_noSampleIDs.txt
#   cut -f 1,1581-1582 cas9only.features_overlap.txt > cas9only.raw.features_overlap.txt
#   cut -f 1,1581-1582 cas9only.features.txt > cas9only.raw.features.txt
# DWT
#   cut -f 1588-1686 cas9only.features_overlap_noSampleIDs.txt > cas9only.dwt.features_overlap_noSampleIDs.txt
#   cut -f 1,1589-1687 cas9only.features_overlap.txt > cas9only.dwt.features_overlap.txt
#   cut -f 1,1589-1687 cas9only.features.txt > cas9only.dwt.features.txt
# ONEHOT
#   cut -f 1-51,114-129,192-207,270-285,348-363,426-441,504-519,582-597,660-675,738-753,878-897,1022-1041,1104-1123,1186-1205,1268-1283,1346-1361,1424-1439,1502-1517,1583-1587 cas9only.features_overlap_noSampleIDs.txt > cas9only.onehot.features_overlap_noSampleIDs.txt
#   cut -f 1-52,115-130,193-208,271-286,349-364,427-442,505-520,583-598,661-676,739-754,879-898,1023-1042,1105-1124,1187-1206,1269-1284,1347-1362,1425-1440,1503-1518,1584-1588 cas9only.features_overlap.txt > cas9only.onehot.features_overlap.txt
#   cut -f 1-52,115-130,193-208,271-286,349-364,427-442,505-520,583-598,661-676,739-754,879-898,1023-1042,1105-1124,1187-1206,1269-1284,1347-1362,1425-1440,1503-1518,1584-1588 cas9only.features.txt > cas9only.onehot.features.txt
# TENSOR
#   cut -f 52-113,130-191,208-269,286-347,364-425,442-503,520-581,598-659,676-737,754-877,898-1021,1042-1103,1124-1185,1206-1267,1284-1345,1362-1423,1440-1501,1518-1579 cas9only.features_overlap_noSampleIDs.txt > cas9only.tensor.features_overlap_noSampleIDs.txt
#   cut -f 1,53-114,131-192,209-270,287-348,365-426,443-504,521-582,599-660,677-738,755-878,899-1022,1043-1104,1125-1186,1207-1268,1285-1346,1363-1424,1441-1502,1519-1580 cas9only.features_overlap.txt > cas9only.tensor.features_overlap.txt
#   cut -f 1,53-114,131-192,209-270,287-348,365-426,443-504,521-582,599-660,677-738,755-878,899-1022,1043-1104,1125-1186,1207-1268,1285-1346,1363-1424,1441-1502,1519-1580 cas9only.features.txt > cas9only.tensor.features.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/

# Field lists (cut -f syntax) of each feature encoding.
#   *_noid : positions in cas9only.features_overlap_noSampleIDs.txt
#   *_ids  : positions in cas9only.features_overlap.txt / cas9only.features.txt,
#            which carry one extra leading column (presumably the sample ID),
#            so every group sits exactly one field further right.
# NOTE(review): the ID-bearing raw slice used to be 1581-1582, one field short
# of the no-ID slice 1580-1582 (field 1583 was selected by no encoding at all).
# It is 1581-1583 here so both table flavours hold the same features; results
# recorded from runs on the old tables should be regenerated.
raw_noid=1580-1582
raw_ids=1581-1583
dwt_noid=1588-1686
dwt_ids=1589-1687
onehot_noid=1-51,114-129,192-207,270-285,348-363,426-441,504-519,582-597,660-675,738-753,878-897,1022-1041,1104-1123,1186-1205,1268-1283,1346-1361,1424-1439,1502-1517,1583-1587
onehot_ids=2-52,115-130,193-208,271-286,349-364,427-442,505-520,583-598,661-676,739-754,879-898,1023-1042,1105-1124,1187-1206,1269-1284,1347-1362,1425-1440,1503-1518,1584-1588
tensor_noid=52-113,130-191,208-269,286-347,364-425,442-503,520-581,598-659,676-737,754-877,898-1021,1042-1103,1124-1185,1206-1267,1284-1345,1362-1423,1440-1501,1518-1579
tensor_ids=53-114,131-192,209-270,287-348,365-426,443-504,521-582,599-660,677-738,755-878,899-1022,1043-1104,1125-1186,1207-1268,1285-1346,1363-1424,1441-1502,1519-1580

#######################################
# Write the three feature tables of one encoding combination.
# Reads the three cas9only.features* tables from the current directory.
# cut emits the selected fields in file order, so the order in which the
# encoding lists are joined does not affect the output.
# Arguments:
#   $1 - subset label used in the output file names (e.g. raw.dwt)
#   $2 - cut field list for the no-sample-ID table
#   $3 - cut field list for the ID-bearing tables (field 1 is always kept)
# Outputs:
#   cas9only.<label>.features_overlap_noSampleIDs.txt
#   cas9only.<label>.features_overlap.txt
#   cas9only.<label>.features.txt
#######################################
write_feature_subset() {
  local label=$1 noid_fields=$2 id_fields=$3
  cut -f "${noid_fields}" cas9only.features_overlap_noSampleIDs.txt > "cas9only.${label}.features_overlap_noSampleIDs.txt"
  cut -f "1,${id_fields}" cas9only.features_overlap.txt > "cas9only.${label}.features_overlap.txt"
  cut -f "1,${id_fields}" cas9only.features.txt > "cas9only.${label}.features.txt"
}

# raw + onehot + tensor
write_feature_subset raw.onehot.tensor "${onehot_noid},${raw_noid},${tensor_noid}" "${onehot_ids},${raw_ids},${tensor_ids}"
# raw + onehot + tensor + dwt
write_feature_subset raw.onehot.tensor.dwt "${onehot_noid},${raw_noid},${tensor_noid},${dwt_noid}" "${onehot_ids},${raw_ids},${tensor_ids},${dwt_ids}"
# raw + dwt
write_feature_subset raw.dwt "${raw_noid},${dwt_noid}" "${raw_ids},${dwt_ids}"
# raw + tensor
write_feature_subset raw.tensor "${raw_noid},${tensor_noid}" "${raw_ids},${tensor_ids}"
# dwt + onehot
write_feature_subset dwt.onehot "${onehot_noid},${dwt_noid}" "${onehot_ids},${dwt_ids}"
# dwt + tensor
write_feature_subset dwt.tensor "${dwt_noid},${tensor_noid}" "${dwt_ids},${tensor_ids}"
# dwt + onehot + tensor (the dwt range is listed once; it used to be repeated)
write_feature_subset dwt.onehot.tensor "${onehot_noid},${dwt_noid},${tensor_noid}" "${onehot_ids},${dwt_ids},${tensor_ids}"
# onehot + tensor
write_feature_subset onehot.tensor "${onehot_noid},${tensor_noid}" "${onehot_ids},${tensor_ids}"
# Andes: one run directory per encoding combination, then generate the iRF
# job scripts for that combination inside it.
# raw.onehot.tensor, raw.onehot.tensor.dwt, raw.dwt, raw.tensor, dwt.onehot, dwt.tensor, dwt.onehot.tensor, onehot.tensor
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
module load python/3.7-anaconda3
for subset in raw.onehot.tensor raw.onehot.tensor.dwt raw.dwt raw.tensor dwt.onehot dwt.tensor dwt.onehot.tensor onehot.tensor; do
  # -p: no error when the directory is already there (e.g. when re-running).
  mkdir -p "${cas9only_dir}/${subset}"
  # Build only after a successful cd, so a failed cd cannot make the builder
  # run inside the previous combination's directory.
  cd "${cas9only_dir}/${subset}" &&
    python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName "cas9only.${subset}" --bypass --Prediction "${cas9only_dir}/cas9only.${subset}.features.txt" "${cas9only_dir}/cas9only.score.txt"
done
# Summit: submit the "full" job of every encoding combination.
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
cd "${cas9only_dir}"
module load python/3.7.0-anaconda3-5.3.0
# full
for subset in raw.onehot.tensor raw.onehot.tensor.dwt raw.dwt raw.tensor dwt.onehot dwt.tensor dwt.onehot.tensor onehot.tensor; do
  bsub "${cas9only_dir}/${subset}/Submits/submit_full_cas9only.${subset}_0.sh"
done
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/raw.onehot.tensor/Submits/submit_train_cas9only.raw.onehot.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/raw.onehot.tensor.dwt/Submits/submit_train_cas9only.raw.onehot.tensor.dwt_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/raw.dwt/Submits/submit_train_cas9only.raw.dwt_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/raw.tensor/Submits/submit_train_cas9only.raw.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/dwt.onehot/Submits/submit_train_cas9only.dwt.onehot_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/dwt.tensor/Submits/submit_train_cas9only.dwt.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/dwt.onehot.tensor/Submits/submit_train_cas9only.dwt.onehot.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/onehot.tensor/Submits/submit_train_cas9only.onehot.tensor_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/raw.onehot.tensor/Submits/submit_test_cas9only.raw.onehot.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/raw.onehot.tensor.dwt/Submits/submit_test_cas9only.raw.onehot.tensor.dwt_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/raw.dwt/Submits/submit_test_cas9only.raw.dwt_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/raw.tensor/Submits/submit_test_cas9only.raw.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/dwt.onehot/Submits/submit_test_cas9only.dwt.onehot_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/dwt.tensor/Submits/submit_test_cas9only.dwt.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/dwt.onehot.tensor/Submits/submit_test_cas9only.dwt.onehot.tensor_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only/onehot.tensor/Submits/submit_test_cas9only.onehot.tensor_0.sh
# Andes
# Run iRF_postProcessing.py inside each cas9only feature-set directory.
module load python/3.7-anaconda3
cas9only_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only
for feature_set in \
  raw.onehot.tensor raw.onehot.tensor.dwt raw.dwt raw.tensor \
  dwt.onehot dwt.tensor dwt.onehot.tensor onehot.tensor; do
  cd "${cas9only_dir}/${feature_set}"
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py \
    --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 \
    /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt \
    "cas9only.${feature_set}"
done
# Recorded R2 per feature set:
# raw.onehot.tensor      R2 = 0.25066138026328
# raw.onehot.tensor.dwt  R2 = 0.2378214213581758
# raw.dwt                R2 = -0.016846851799699098
# raw.tensor             R2 = 0.24283471437403994
# dwt.onehot             R2 = 0.1684180543469489
# dwt.tensor             R2 = 0.21437348151176538
# dwt.onehot.tensor      R2 = 0.2362881314944351
# onehot.tensor          R2 = 0.2500703068116479
# pearson correlation
# For each cas9only feature set, compare the fold9/Set4 test predictions with
# the observed cut.score values and print the Pearson correlation.
cas9only.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9only"
feature.sets <- c("raw.onehot.tensor", "raw.onehot.tensor.dwt", "raw.dwt", "raw.tensor",
                  "dwt.onehot", "dwt.tensor", "dwt.onehot.tensor", "onehot.tensor")
for (feature.set in feature.sets) {
  setwd(paste0(cas9only.dir, "/", feature.set, "/cut.score/foldRuns/fold9/Runs/Set4"))
  pred <- read.delim(paste0("cas9only.", feature.set, "_Set4_test.prediction"), header=T, sep="\t")
  y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
  # print() is required inside a loop; top-level auto-printing does not apply here
  print(cor(y$cut.score, pred$Predictions.))
}
# Recorded correlations:
# raw.onehot.tensor      0.4904731
# raw.onehot.tensor.dwt  0.4782825
# raw.dwt                0.1937668
# raw.tensor             0.4845264
# dwt.onehot             0.3902289
# dwt.tensor             0.4561978
# dwt.onehot.tensor      0.4765636
# onehot.tensor          0.4898595
# Rank the region2 top-variable edges by their numeric third column
# (ascending), then pull the chosen feature columns out of the region2 matrices.
region2_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2
sort -k 3,3n "${region2_dir}/topVarEdges/cut.score_top95.txt"
# gene.dwtd8 cut.score 0.0178377197490399
# structure.dwtd3 cut.score 0.018259924636914858
# ipd.dwtd6 cut.score 0.018474666778161605
# structure.dwtd4 cut.score 0.01850407553242048
# ipd.dwtd4 cut.score 0.018562893040938236
# gc.dwtd1 cut.score 0.018756524937791234
# ipd.dwtd3 cut.score 0.018774577836445195
# structure.dwtd2 cut.score 0.018923368662448026
# ipd.dwtd1 cut.score 0.019233762048982306
# structure.dwtd1 cut.score 0.019423317484848927
# temp.dwtd3 cut.score 0.019431033643144573
# gene.dwtd4 cut.score 0.019828488589316517
# temp.dwtd2 cut.score 0.020052402767782616
# gene.dwtd3 cut.score 0.0211268414134782
# ipd.dwtd2 cut.score 0.02139399519597839
# gene.dwtd2 cut.score 0.02156608008476053
# ipd.dwtd12 cut.score 0.025757118742435164
# p20yz_quadrupoleraw cut.score 0.026921530705615825
# sgRNA.tempsgRNA.raw cut.score 0.029184548904625605
# p20xz_quadrupoleraw cut.score 0.03299312816904248
cd "${region2_dir}/"
# Column 1 plus 20 feature columns for the files that carry a leading ID column.
# cut emits fields in file order, so the order of this list does not matter.
fields_with_ids=1,384,612,385,636,624,637,625,662,626,663,646,633,650,638,618,639,652,641,651,630
# Same 20 features with every index shifted down by one and no column 1,
# for the *_noSampleIDs file.
fields_no_ids=383,611,384,635,623,636,624,661,625,662,645,632,649,637,617,638,651,640,650,629
cut -f "${fields_with_ids}" region.dwt.noncor.features.txt > ../selected/selected.features.txt
cut -f "${fields_with_ids}" region.dwt.noncor.features_overlap.txt > ../selected/selected.features_overlap.txt
cut -f "${fields_no_ids}" region.dwt.noncor.features_overlap_noSampleIDs.txt > ../selected/selected.features_overlap_noSampleIDs.txt
# iRF run on the "selected" feature matrix: set-up, LSF submission, and
# post-processing. selected_dir is re-assigned under every machine heading
# because those steps are presumably pasted into separate login sessions.
# Andes
module load python/3.7-anaconda3
selected_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/selected
cd "${selected_dir}"
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
  --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
  --NumTrees 1000 --NumIterations 5 --RunName selected --bypass --Prediction \
  "${selected_dir}/selected.features.txt" \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/region2/region.dwt.score.txt
# Summit
selected_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/selected
cd "${selected_dir}"
module load python/3.7.0-anaconda3-5.3.0
# full
bsub "${selected_dir}/Submits/submit_full_selected_0.sh"
# train
bsub "${selected_dir}/Submits/submit_train_selected_0.sh"
# once the train submissions are done run the test submissions
# test
bsub "${selected_dir}/Submits/submit_test_selected_0.sh"
# Andes
module load python/3.7-anaconda3
selected_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/selected
cd "${selected_dir}"
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py \
  --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt \
  selected
# R2 = -0.030434700545415215
# pearson correlation
# Overall and per-nuclease correlation between predicted and observed
# cut.score for the "selected" run (fold9 / Set4 test split).
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/selected/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("selected_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.2729446
# Attach the sample IDs so each row can be split by the suffix of its ID.
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
# The ID is split on "_" into three parts; the third one is used as the group.
id.pred.y.group <- separate(id.pred.y, sgRNAID, c("sgRNA", "ID", "group"), "_")
# 23117
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method="pearson")
# 0.2729446
# One subset (pred.<group>) and one printed correlation per group.
for (grp in c("Cas9", "eSpCas9", "recACas9")) {
  # which() drops rows whose group is NA, matching what subset() does
  grp.rows <- id.pred.y.group[which(id.pred.y.group$group == grp), ]
  assign(paste0("pred.", grp), grp.rows)
  print(cor(grp.rows$cut.score, grp.rows$Predictions., method="pearson"))
}
# Cas9      0.6531926
# eSpCas9   0.6164039
# recACas9  0.431828
–> run w/ GBR (gradient boosting regression; fitted below with caret method = "gbm")
# Gradient boosting (caret method "gbm") on the selected feature columns,
# scored by Pearson correlation on a held-out split.
# The first lines are shell (interactive allocation + conda env); the rest runs in R.
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# gene.dwtd8 structure.dwtd3 ipd.dwtd6 structure.dwtd4 ipd.dwtd4 gc.dwtd1 ipd.dwtd3 structure.dwtd2 ipd.dwtd1 structure.dwtd1 temp.dwtd3 gene.dwtd4 temp.dwtd2 gene.dwtd3 ipd.dwtd2 gene.dwtd2 ipd.dwtd12 p20yz_quadrupoleraw sgRNA.tempsgRNA.raw p20xz_quadrupoleraw
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.txt", header=T, sep="\t")
# Drop every row that contains an NA in any column.
df.dcast <- na.omit(df.dcast)
# Printed to look up the positional indices used on the next line.
colnames(df.dcast)
# Column 2 plus 20 further columns picked by position; the indices are only
# valid for this exact file layout (presumably column 2 is cut.score and the
# others are the features named above -- TODO confirm against colnames()).
df.select <- df.dcast[,c(2,953,958,1584,1635,1623,1637,1624,1679,1625,1680,1660,1632,1665,1638,1604,1639,1667,1641,1666,1629)]
# raw (raw + onehot + thermal)
data = df.select
library(caret)
# Seed set immediately before the split so the 75/25 partition is reproducible.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
# Second seed, set immediately before train() so the resampling is reproducible.
set.seed(825)
# cut.score is the response; every other column of `training` is a predictor.
gbmFit1 <- train(cut.score ~ ., data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
gbmFit1
# Stochastic Gradient Boosting
#
# 88609 samples
# 20 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 70886, 70886, 70888, 70889, 70887, 70886, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.406080 0.04957845 7.906327
# 1 100 9.391206 0.05170885 7.886404
# 1 150 9.386278 0.05253544 7.880528
# 2 50 9.392435 0.05165346 7.888972
# 2 100 9.381171 0.05355709 7.876022
# 2 150 9.375879 0.05450824 7.870856
# 3 50 9.386528 0.05271025 7.882818
# 3 100 9.374771 0.05479445 7.870415
# 3 150 9.368917 0.05587183 7.864280
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
# First rows of the relative-influence summary of the fitted model.
head(summary(gbmFit1))
# var rel.inf
# p20xz_quadrupoleraw p20xz_quadrupoleraw 25.398009
# p20yz_quadrupoleraw p20yz_quadrupoleraw 23.753452
# sgRNA.tempsgRNA.raw sgRNA.tempsgRNA.raw 20.898917
# ipd.dwtd12 ipd.dwtd12 8.569271
# gene.dwtd8 gene.dwtd8 2.973065
# structure.dwtd4 structure.dwtd4 2.601908
# Predict the held-out rows and correlate predicted vs. observed cut.score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.2313137
## all features
# Same caret "gbm" workflow, but using every column of df.dcast from column 2
# onward (relies on df.dcast already being loaded and NA-filtered in this R session).
data = df.dcast[,c(2:ncol(df.dcast))]
library(caret)
# Seed set immediately before the split so the 75/25 partition is reproducible.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
# Second seed, set immediately before train() so the resampling is reproducible.
set.seed(825)
# Overwrites gbmFit1 from any earlier fit in the same session.
gbmFit1 <- train(cut.score ~ ., data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
gbmFit1
head(summary(gbmFit1))
# Predict the held-out rows and correlate predicted vs. observed cut.score.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
# (no correlation value was recorded in the notes for this run)
#
** Not enough memory to run the DWT on all sliding-window features in a single R session; the DWT is run in one R script per feature instead.
# Tile the genome described by ecoli.sizes.genome with 20 bp windows that
# advance 1 bp at a time (-w 20 -s 1); output is BED on stdout.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
bedtools makewindows -g ecoli.sizes.genome -w 20 -s 1 > ecoli.20bp.sliding.bed
–> density
# Annotate the 20 bp sliding windows with overlapping gene records and with
# nucleotide composition.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
windows_bed=ecoli.20bp.sliding.bed
genome_prefix=genome/GCF_000005845.2_ASM584v2_genomic
## genes
# -wo keeps only windows that overlap a record and appends the overlap length
bedtools intersect -wo -a "${windows_bed}" -b "${genome_prefix}.gene.gff" \
  > ecoli.gene.20sliding.bed
## GC content
# sed '1d' removes the first line of the bedtools nuc output
bedtools nuc -fi "${genome_prefix}.fna" -bed "${windows_bed}" \
  | sed '1d' > ecoli.GC.20sliding.bed
–> melting temp https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html
# Environment for the melting-temperature step (same Andes conda env as the other steps).
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The next two lines are reference notes (a Biopython function signature and a URL), not commands to run.
Bio.SeqUtils.MeltingTemp.Tm_NN(seq, check=True, strict=True, c_seq=None, shift=0, nn_table=None, tmm_table=None, imm_table=None, de_table=None, dnac1=25, dnac2=25, selfcomp=False, Na=50, K=0, Tris=0, Mg=0, dNTPs=0, saltcorr=5)
https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/fasta_n
# summit: # conda install -c conda-forge biopython
### 20bp sliding windows
# Extract the genomic sequence of every sliding window into a FASTA file,
# then start an interactive Python session for the counting step.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
bedtools getfasta -fi genome/GCF_000005845.2_ASM584v2_genomic.fna -bed ecoli.20bp.sliding.bed -fo ecoli.20sliding.fa
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python3
from Bio import SeqIO
# Count A/C/G/T in every record of ecoli.20sliding.fa and write one TSV row
# per record: record name, the four counts, sequence length, and (C+G)/length.
# Note that the "CG%" column holds a fraction in [0, 1], not a percentage.
# There are no blank lines inside the block so it can be pasted into the
# interactive interpreter; the blank line after it terminates the block.
with open('ecoli.20sliding.fa', 'r') as input_file, \
        open('nucleotide_counts_20sliding.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        # Guard against an empty record so a single bad entry cannot abort the run.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % (
            gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)

exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
# Append a MeltingTemp column, computed with the formula above from the
# per-window nucleotide counts, and save the augmented table.
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("nucleotide_counts_20sliding.tsv", header=T, sep="\t")
df.melt <- df
# df$T is the thymine count column, not the logical constant T
gc.count <- df$G + df$C
base.count <- df$A + df$T + df$G + df$C
df.melt$MeltingTemp <- 64.9 + 41 * (gc.count - 16.4) / base.count
write.table(df.melt, "nucleotide_counts_20sliding_temp.txt", quote=F, row.names=F, sep="\t")
q()
–> structure https://academic.oup.com/bioinformatics/article/34/14/2499/4924718 https://github.com/Superzchen/iFeature/ https://github.com/feliixx/gotranseq
# One-time install of iFeature (kept commented out):
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
#git clone https://github.com/Superzchen/iFeature
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Run iFeature with descriptor type AAC on the sliding-window FASTA.
# NOTE(review): AAC is iFeature's amino-acid-composition descriptor and the input here is
# nucleotide sequence -- confirm this is the intended "structure" feature.
# NOTE(review): the output is named *.fa but is later read as a tab-separated table with a header.
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py --file ecoli.20sliding.fa --type AAC --out 20sliding.protein.structure.fa
–> rnaseq
# Attach genomic coordinates to the per-gene RNA-seq table by joining it to the
# "gene" rows of the GFF annotation on an EcoGene-derived gene_id.
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/genome")
# sed '1d' GCF_000005845.2_ASM584v2_genomic.gff | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' > GCF_000005845.2_ASM584v2_genomic.txt
annotation <- read.delim("GCF_000005845.2_ASM584v2_genomic.txt", header=F, sep="\t")
gene <- subset(annotation, annotation$V3 == "gene")
# Split the attribute column at "EcoGene:"; the 7 characters that follow are used as gene_id.
gene.id <- separate(gene, V9, c("id1", "id2"), sep="EcoGene:")
gene.id$gene_id <- substr(gene.id$id2, 1, 7)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
rna <- read.delim("GSM2267479_Sample-1.genes.results.txt", header=T, sep="\t")
rna.id <- left_join(rna, gene.id, by="gene_id")
# Reorder by position so coordinates come first (presumably chromosome, start, end from the
# GFF, then gene_id and the expression columns -- TODO confirm), and drop unmatched genes.
rna.id.idf <- na.omit(rna.id[,c(8,11,12,1,3:7)])
write.table(rna.id.idf, "GSM2267479.fpkm.coord.txt", quote=F, row.names=F, sep="\t")
# The bedtools intersect step reads GSM2267479.fpkm.coord.bed, which nothing else in these
# notes creates. Write the same table without a header line so it can be used as the -b input.
write.table(rna.id.idf, "GSM2267479.fpkm.coord.bed", quote=F, row.names=F, col.names=F, sep="\t")
# calculate density
# (shell) keep every window/gene overlap; -wo appends the overlap length as the last column
bedtools intersect -wo -a ecoli.20bp.sliding.bed -b GSM2267479.fpkm.coord.bed > ecoli.rnaseq.20sliding.bed
# (R, needs dplyr loaded) average column V12 over all rows that share a window
# (presumably V12 is the FPKM column of the -b file -- TODO confirm)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.rnaseq.20sliding.bed", header=F, sep="\t")
window.df <- window %>% group_by(V1, V2, V3) %>% mutate(avg.fpkm = mean(V12))
# Keep the window coordinates plus column 14 and collapse to one row per window
# (column 14 is avg.fpkm only if the intersect output has 13 columns -- TODO confirm).
window.uniq <- unique(window.df[,c(1:3,14)])
# Written with a header line; later read back with header=T.
write.table(window.uniq, "ecoli.rnaseq.average.20sliding.bed", quote=F, row.names=F, sep="\t")
–> GATC motif
# Locate every GATC match in the genome and record which 20 bp sliding windows overlap one.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
#conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/envs/summit-test
## GATC motif
## try fastaregex
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/
#wget https://github.com/dariober/bioinformatics-cafe/blob/master/fastaRegexFinder/fastaRegexFinder.py?raw=true -O fastaRegexFinder.py
#chmod a+x fastaRegexFinder.py
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/fastaRegexFinder.py -q -f genome/GCF_000005845.2_ASM584v2_genomic.fna -r 'GATC' > ecoli.gatc.bed
# The intersect below reads ecoli.gatc.coord.bed, which no other step in these notes creates.
# The downstream R code indexes the joined table as if the -b file had three columns, so the
# coordinate file is derived here as the first three columns of the regex hits.
# NOTE(review): assumes columns 1-3 of ecoli.gatc.bed are chrom/start/end -- confirm.
# NOTE(review): GATC is its own reverse complement; check whether each site is reported once
# per strand, which would double the per-window counts.
cut -f 1-3 ecoli.gatc.bed > ecoli.gatc.coord.bed
bedtools intersect -wo -a ecoli.20bp.sliding.bed -b ecoli.gatc.coord.bed > ecoli.gatc.20sliding.bed
–> IPD ratios
# (R) Reduce the IPD GFF to a 4-column coordinate table: chr, V4, V5, IPD value.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("GSM3264688_Ecoli.gff", header=F, sep="\t")
# Skip the first four rows of the table (presumably leftover header lines -- TODO confirm).
df2 <- df[5:nrow(df),]
library(dplyr)
library(tidyr)
# Split the attribute column on ";" into three fields, then split the third on "=" to get the value.
df.sep <- df2 %>% separate(V9, c("coverage", "context", "IPD"), sep=";")
df.ipd <- df.sep %>% separate(IPD, c("IPD", "IPD.value"), sep="=")
# Every row is assigned the same sequence name so it matches the windows file.
df.ipd$chr <- "NC_000913.3"
# Positional pick: 13 = chr, 4 and 5 = V4 and V5, 12 = IPD.value.
df.coord <- df.ipd[,c(13,4,5,12)]
write.table(df.coord, "GSM3264688_Ecoli.coord.bed", quote=F, row.names=F, col.names=F, sep="\t")
# (shell) keep every window/site overlap; -wo appends the overlap length as the last column
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
bedtools intersect -wo -a ecoli.20bp.sliding.bed -b GSM3264688_Ecoli.coord.bed > ecoli.ipd.20sliding.bed
# (R) average column V7 (the 4th column of the -b file, i.e. the IPD value) per window.
# The new column is called avg.fpkm although it holds an IPD average; it is later used by position.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.ipd.20sliding.bed", header=F, sep="\t")
window.df <- window %>% group_by(V1, V2, V3) %>% mutate(avg.fpkm = mean(V7))
# Intermediate save of the full (non-collapsed) table.
write.table(window.df, "ecoli.ipd.average.20sliding.bed", quote=F, row.names=F, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Re-read the intermediate file, collapse to one row per window (coordinates + column 9,
# the per-window average), and overwrite the same file with the collapsed table.
window.df <- read.delim("ecoli.ipd.average.20sliding.bed", header=T, sep="\t")
window.uniq <- unique(window.df[,c(1:3,9)])
write.table(window.uniq, "ecoli.ipd.average.20sliding.bed", quote=F, row.names=F, sep="\t")
–> run DWT on genome-wide patterns (using 20bp sliding windows) –> take the 20bp exact match bin for the sgRNA
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J sgRNA.sliding
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 01:00:00
#SBATCH --mem-per-cpu=0
# Batch job: run the per-feature DWT R scripts one after another inside the
# Andes conda environment.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
#R CMD BATCH sgRNA.sliding.window.R <-- not enough memory in R... break into separate files for DWT creation and then merge
for feature in gatc ipd gene structure rnaseq temp; do
  R CMD BATCH "${feature}.dwt.sliding.window.R"
done
#R CMD BATCH merge.dwt.sliding.window.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/sgRNA.sliding.window.sh
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J sgRNA.merge.sliding
#SBATCH -N 1
#SBATCH -t 10:00:00
# Batch job: run the merge script and then the matrix script, in that order,
# inside the Andes conda environment.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
for r_script in merge.dwt.sliding.window.R matrix.sliding.window.R; do
  R CMD BATCH "${r_script}"
done
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/sgRNA.merge.sliding.window.sh
# (shell) Andes conda environment and an interactive allocation for the R session below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# salloc -A SYB105 -N 2 -p gpu -t 4:00:00
# (R) Load every per-window feature table, the window list, and the sgRNA scores.
library(dplyr)
library(reshape2)
library(tidyr)
library(wmtsa)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Tables read with header=F get V1, V2, ... column names; those read with header=T keep their own.
gatc <- read.table("ecoli.gatc.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
ipd <- read.table("ecoli.ipd.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
gene <- read.table("ecoli.gene.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.table("20sliding.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.table("nucleotide_counts_20sliding_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.table("ecoli.rnaseq.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
# Rename all 11 score columns, then keep chr, start, end, sgRNA and cut.score.
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
# Window coordinates named V1-V3 so they can be used as join keys against the feature tables.
window.v <- window[,1:3]
colnames(window.v) <- c("V1", "V2", "V3")
# Align every feature with the full list of sliding windows and derive one value per window.
# left_join keeps every window; windows with no overlap get a single row of NAs.
gatc.win <- left_join(window.v, gatc, by=c("V1", "V2", "V3"))
# Record which rows are real overlaps BEFORE the NAs are overwritten. Column 4 is the
# first column contributed by the overlap table, so it is NA only for unmatched windows.
gatc.hit <- !is.na(gatc.win[[4]])
gatc.win[is.na(gatc.win)] <- 0
ipd.win <- left_join(window.v, ipd, by=c("V1", "V2", "V3"))
ipd.win[is.na(ipd.win)] <- 0
gene.win <- left_join(window.v, gene, by=c("V1", "V2", "V3"))
gene.hit <- !is.na(gene.win[[4]])
gene.win[is.na(gene.win)] <- 0
rnaseq.win <- left_join(window.v, rnaseq, by=c("V1", "V2", "V3"))
rnaseq.win[is.na(rnaseq.win)] <- 0
# Count real overlaps per window. Counting rows with n() would give 1 both to windows
# with a single overlap and to windows with none (their NA placeholder row), so the
# count is taken over the overlap flag instead and unmatched windows get 0.
gatc.bin <- gatc.win %>% mutate(has.hit = gatc.hit) %>% group_by(V1, V2, V3) %>% mutate(gatc.count = sum(has.hit))
# Selected by name so the extra flag column cannot shift a positional index.
gatc.count <- unique(gatc.bin[,c("V1", "V2", "V3", "gatc.count")])
gatc.df <- gatc.count$gatc.count
# Column 4 of the joined table is the per-window average.
ipd.df <- ipd.win[,4]
gene.bin <- gene.win %>% mutate(has.hit = gene.hit) %>% group_by(V1, V2, V3) %>% mutate(gene.count = sum(has.hit))
gene.count <- unique(gene.bin[,c("V1", "V2", "V3", "gene.count")])
gene.df <- gene.count$gene.count
# Second column of the iFeature table is used as the "structure" signal.
structure.df <- structure[,2]
# Columns 7 and 8 of the nucleotide table: the CG fraction and MeltingTemp.
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq.win[,4]
# Run DWT instead of CWT
# Each stanza below applies wavMODWT with its default arguments (see the signature in the
# next comment) to one per-window signal, flattens the result with as.matrix, and splits the
# row labels into two extra columns. The final rename assigns exactly four names
# (label, <feature>.dwt, scale, window), so each table is expected to have four columns.
#wavMODWT(x, wavelet="s8", n.levels=ilogb(length(x), base=2),position=list(from=1,by=1,units=character()), units=character(),title.data=character(), documentation=character(), keep.series=FALSE)
# --- melting temperature ---
temp.modwt <- wavMODWT(temp.df)
temp.modwt.df <- as.matrix(temp.modwt)
temp.modwt.label <- data.frame(label = row.names(temp.modwt.df), temp.modwt.df)
temp.modwt.dt <- as.data.table(temp.modwt.label)
#temp.modwt.name <- temp.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
# data.table version of the separate() call above: `:=` adds the two split columns in place,
# splitting each label on runs of non-alphanumeric characters.
temp.modwt.name <- temp.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(temp.modwt.name) <- c("label", "temp.dwt", "scale", "window")
# --- GC fraction ---
gc.modwt <- wavMODWT(gc.df)
gc.modwt.df <- as.matrix(gc.modwt)
gc.modwt.label <- data.frame(label = row.names(gc.modwt.df), gc.modwt.df)
gc.modwt.dt <- as.data.table(gc.modwt.label)
gc.modwt.name <- gc.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gc.modwt.name) <- c("label", "gc.dwt", "scale", "window")
# --- structure (iFeature column) ---
structure.modwt <- wavMODWT(structure.df)
structure.modwt.df <- as.matrix(structure.modwt)
structure.modwt.label <- data.frame(label = row.names(structure.modwt.df), structure.modwt.df)
structure.modwt.dt <- as.data.table(structure.modwt.label)
structure.modwt.name <- structure.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(structure.modwt.name) <- c("label", "structure.dwt", "scale", "window")
# --- RNA-seq average ---
rna.modwt <- wavMODWT(rna.df)
rna.modwt.df <- as.matrix(rna.modwt)
rna.modwt.label <- data.frame(label = row.names(rna.modwt.df), rna.modwt.df)
rna.modwt.dt <- as.data.table(rna.modwt.label)
rna.modwt.name <- rna.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(rna.modwt.name) <- c("label", "rna.dwt", "scale", "window")
# --- gene count ---
gene.modwt <- wavMODWT(gene.df)
gene.modwt.df <- as.matrix(gene.modwt)
gene.modwt.label <- data.frame(label = row.names(gene.modwt.df), gene.modwt.df)
gene.modwt.dt <- as.data.table(gene.modwt.label)
gene.modwt.name <- gene.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gene.modwt.name) <- c("label", "gene.dwt", "scale", "window")
ipd.modwt <- wavMODWT(ipd.df)
ipd.modwt.df <- as.matrix(ipd.modwt)
ipd.modwt.label <- data.frame(label = row.names(ipd.modwt.df), ipd.modwt.df)
ipd.modwt.dt <- as.data.table(ipd.modwt.label)
ipd.modwt.name <- ipd.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(ipd.modwt.name) <- c("label", "ipd.dwt", "scale", "window")
gatc.modwt <- wavMODWT(gatc.df)
gatc.modwt.df <- as.matrix(gatc.modwt)
gatc.modwt.label <- data.frame(label = row.names(gatc.modwt.df), gatc.modwt.df)
gatc.modwt.dt <- as.data.table(gatc.modwt.label)
gatc.modwt.name <- gatc.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gatc.modwt.name) <- c("label", "gatc.dwt", "scale", "window")
# Diagnostic plots: for each feature vector, coerce it to a one-column matrix, run modwt() with
# the "la8" filter, and write the plot of the transform to its own PDF in the working directory.
# NOTE(review): modwt()/plot.modwt() are presumably the functions from the `wavelets` package
# loaded on the next line -- confirm no other attached package masks them.
library(wavelets)
# IPD
ipd.mat <- as.matrix(ipd.df)
ipd.dwt <- modwt(ipd.mat, filter="la8")
pdf("ecoli.20bpsliding.ipd.pdf")
plot.modwt(ipd.dwt)
dev.off()
# GATC count
gatc.mat <- as.matrix(gatc.df)
gatc.dwt <- modwt(gatc.mat, filter="la8")
pdf("ecoli.20bpsliding.gatc.pdf")
plot.modwt(gatc.dwt)
dev.off()
# gene count
gene.mat <- as.matrix(gene.df)
gene.dwt <- modwt(gene.mat, filter="la8")
pdf("ecoli.20bpsliding.gene.pdf")
plot.modwt(gene.dwt)
dev.off()
# RNA-seq
rna.mat <- as.matrix(rna.df)
rna.dwt <- modwt(rna.mat, filter="la8")
pdf("ecoli.20bpsliding.rna.pdf")
plot.modwt(rna.dwt)
dev.off()
# structure
structure.mat <- as.matrix(structure.df)
structure.dwt <- modwt(structure.mat, filter="la8")
pdf("ecoli.20bpsliding.structure.pdf")
plot.modwt(structure.dwt)
dev.off()
# GC
gc.mat <- as.matrix(gc.df)
gc.dwt <- modwt(gc.mat, filter="la8")
pdf("ecoli.20bpsliding.gc.pdf")
plot.modwt(gc.dwt)
dev.off()
# temperature
# NOTE: temp.dwt here is the modwt object; it is unrelated to the "temp.dwt" column of temp.modwt.name.
temp.mat <- as.matrix(temp.df)
temp.dwt <- modwt(temp.mat, filter="la8")
pdf("ecoli.20bpsliding.temp.pdf")
plot.modwt(temp.dwt)
dev.off()
# Attach the wavelet coefficients of the 20 bp window whose coordinates exactly match each
# sgRNA, then write one row per sgRNA with one column per feature/scale combination.
#
# Inputs (from earlier steps): window, score.df and the seven *.modwt.name tables.
# Output: ecoli.20sliding.exact.dwt.dcast.txt in the working directory.
#
# Number the windows 0..n-1 (stored as character) so they can be matched to the "window"
# column of the *.modwt.name tables.
# NOTE(review): assumes the coefficient labels carry 0-based window positions -- confirm.
colnames(window) <- c("chr", "start", "end")
window$window <- as.character(seq.int(nrow(window)) - 1)
window$start <- as.numeric(window$start)
# NOTE(review): the end coordinate is shifted by -1, presumably to match the end convention
# used in sgRNA.coord.txt -- confirm.
window$end <- as.numeric(window$end - 1)
# sgRNA -> window index (exact coordinate match), then one join per feature on window + scale.
# Only the scale, window and coefficient columns (3, 4, 2) of each table are joined, so the
# per-table `label` columns never enter the result.
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
window.score.temp <- left_join(window.score.df, temp.modwt.name[,c(3,4,2)], by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name[,c(3,4,2)], by=c("window", "scale"))
# 1293751
# Keep rows that have a cut score. The score column is named "cut.score" (see colnames(score));
# the previous `$score != "NA"` referred to a column that does not exist, which yields zero rows.
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, !is.na(cut.score))
# 931340
# Long format: one row per sgRNA x scale x feature. Columns are selected by name rather than by
# position so a change in join order cannot silently pick the wrong columns.
dwt.cols <- c("temp.dwt", "gc.dwt", "structure.dwt", "rna.dwt", "gene.dwt", "gatc.dwt", "ipd.dwt")
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,c("sgRNA", "cut.score", "scale", dwt.cols)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# Feature name + scale (e.g. "gc.dwt" + "d1") becomes the wide-format column name.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40467
write.table(df.dcast.na, "ecoli.20sliding.exact.dwt.dcast.txt", quote=F, row.names=F, sep="\t")
–> take the 20 bp bin immediately upstream and the 20 bp bin immediately downstream of each sgRNA
# Build the same sgRNA x feature/scale table as for the exact window, but for the 20 bp bin
# immediately upstream of each sgRNA.
#
# Inputs: sgRNA.coord.txt plus `window` (already renamed/indexed) and the seven *.modwt.name
# tables from the earlier steps. Output: ecoli.20sliding.up.dwt.dcast.txt.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
## upstream bin
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
# Shift the coordinates to the upstream bin: it ends at the sgRNA start and begins 19 bp before it.
# `end` must be assigned first because it uses the unshifted start.
score.df$end <- score.df$start
score.df$start <- score.df$start - 19
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
# Join only scale, window and coefficient (columns 3, 4, 2) of each table, as in the exact-window
# section. Joining the full tables also brings in their `label` columns, which shifts the column
# positions and puts character columns into the melt below.
window.score.temp <- left_join(window.score.df, temp.modwt.name[,c(3,4,2)], by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name[,c(3,4,2)], by=c("window", "scale"))
# Keep rows with a cut score; the column is named "cut.score", not "score".
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, !is.na(cut.score))
# Long format, selecting columns by name so positions cannot drift.
dwt.cols <- c("temp.dwt", "gc.dwt", "structure.dwt", "rna.dwt", "gene.dwt", "gatc.dwt", "ipd.dwt")
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,c("sgRNA", "cut.score", "scale", dwt.cols)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
#
write.table(df.dcast.na, "ecoli.20sliding.up.dwt.dcast.txt", quote=F, row.names=F, sep="\t")
## downstream bin
# Build the sgRNA x feature/scale table for the 20 bp bin immediately downstream of each sgRNA.
#
# Inputs: sgRNA.coord.txt plus `window` (already renamed/indexed) and the seven *.modwt.name
# tables from the earlier steps. Output: ecoli.20sliding.down.dwt.dcast.txt.
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
# Shift the coordinates to the downstream bin: it starts at the sgRNA end and spans 19 bp more.
# The shift has to be applied to score.df (the table that is joined below). Shifting `score`
# after score.df was extracted leaves the join on the unshifted sgRNA coordinates.
score.df$start <- score.df$end
score.df$end <- score.df$start + 19
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
# Join only scale, window and coefficient (columns 3, 4, 2) of each table, as in the exact-window
# section. Joining the full tables also brings in their `label` columns, which shifts the column
# positions and puts character columns into the melt below.
window.score.temp <- left_join(window.score.df, temp.modwt.name[,c(3,4,2)], by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name[,c(3,4,2)], by=c("window", "scale"))
# Keep rows with a cut score; the column is named "cut.score", not "score".
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, !is.na(cut.score))
# Long format, selecting columns by name so positions cannot drift.
dwt.cols <- c("temp.dwt", "gc.dwt", "structure.dwt", "rna.dwt", "gene.dwt", "gatc.dwt", "ipd.dwt")
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,c("sgRNA", "cut.score", "scale", dwt.cols)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
#
write.table(df.dcast.na, "ecoli.20sliding.down.dwt.dcast.txt", quote=F, row.names=F, sep="\t")
–> combine to generate full feature matrix
# combine regional DWT with other features
# Merge the three DWT tables (exact, upstream, downstream window) with the existing
# one-hot/tensor feature table and write the combined matrix.
library(tidyr)
library(dplyr)
# --- exact-window table: split the sgRNA identifier on "_" into sgRNA + ID, then prefix every
# --- column from the 4th onward with "sgRNA_"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast.na <- read.delim("ecoli.20sliding.exact.dwt.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep <- df.dcast.na %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt <- df.dcast.sep[,c(4:ncol(df.dcast.sep))]
colnames(df.dcast.dwt) <- paste0('sgRNA_', colnames(df.dcast.dwt))
df.dcast <- cbind(df.dcast.sep[,1:3], df.dcast.dwt)
# --- upstream-window table, processed the same way
# NOTE(review): the same 'sgRNA_' prefix is used for all three tables, so the three feature sets
# are told apart only by the suffixes added when the joins below hit duplicate column names.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast.na2 <- read.delim("ecoli.20sliding.up.dwt.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep2 <- df.dcast.na2 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt2 <- df.dcast.sep2[,c(4:ncol(df.dcast.sep2))]
colnames(df.dcast.dwt2) <- paste0('sgRNA_', colnames(df.dcast.dwt2))
df.dcast2 <- cbind(df.dcast.sep2[,1:3], df.dcast.dwt2)
# --- downstream-window table, processed the same way
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast.na3 <- read.delim("ecoli.20sliding.down.dwt.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep3 <- df.dcast.na3 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt3 <- df.dcast.sep3[,c(4:ncol(df.dcast.sep3))]
colnames(df.dcast.dwt3) <- paste0('sgRNA_', colnames(df.dcast.dwt3))
df.dcast3 <- cbind(df.dcast.sep3[,1:3], df.dcast.dwt3)
# --- existing feature table; its sgRNAID is split on "_" into sgRNA, ID and type
df <- read.delim("Ecoli.allCas9.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep)
# 126182
# Inner-join the three DWT tables by sgRNA + ID. Column 3 of each DWT table (cut.score after the
# split) is left out, so only the key columns and the DWT columns are joined.
df.sep.region <- inner_join(df.sep, df.dcast[,c(1,2,4:ncol(df.dcast.sep))], by=c("sgRNA", "ID"))
df.sep.region2 <- inner_join(df.sep.region, df.dcast2[,c(1,2,4:ncol(df.dcast.sep2))], by=c("sgRNA", "ID"))
df.sep.region3 <- inner_join(df.sep.region2, df.dcast3[,c(1,2,4:ncol(df.dcast.sep3))], by=c("sgRNA", "ID"))
## note that dwt.x = sgRNA dwt, dwt.y = upstream dwt, dwt = downstream dwt
# Re-assemble the original identifier from its three parts.
df.sep.region.id <- df.sep.region3 %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep.region.id)
# 118140
write.table(df.sep.region.id, "ecoli.20sliding.all.features.dcast.txt", quote=F, row.names=F, sep="\t")
# Summit
# Set up the environment for the dwt20bp feature export and start an interactive R session.
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# -p creates any missing parent directories and does not fail when the run directory already
# exists, so this step can be re-run safely.
mkdir -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp
R
# Export the combined feature matrix as iRF inputs: a feature table and a score table, each
# written with and without the sample-ID column.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- na.omit(read.delim("ecoli.20sliding.all.features.dcast.txt", header = TRUE, sep = "\t"))

## sample ID
# Column 1 is the sample ID and column 2 the score: the feature table is everything except
# column 2, the score table is the first two columns.
df.features <- df[, -2]
df.score <- df[, c(1, 2)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
# Each table is written under two names (with and without the "_overlap" tag).
for (out in c("dwt20bp.features_overlap.txt", "dwt20bp.features.txt")) {
  write.table(df.features, out, quote = FALSE, row.names = FALSE, sep = "\t")
}
for (out in c("dwt20bp.score_overlap.txt", "dwt20bp.score.txt")) {
  write.table(df.score, out, quote = FALSE, row.names = FALSE, sep = "\t")
}

## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[, -(1:2)])
df.score <- as.matrix(data.frame(cut.score = df[, 2]))
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
write.table(df.features, "dwt20bp.features_overlap_noSampleIDs.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.score, "dwt20bp.score_overlap_noSampleIDs.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# "dwtonly" subsets: keep the sample-ID column (1) plus columns 1589-2071 of the feature tables.
# The noSampleIDs table was written without the ID column, so the same range is shifted down
# by one there (1588-2070).
# NOTE(review): presumably 1589-2071 are exactly the DWT columns -- confirm against the header.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp
cut -f 1,1589-2071 dwt20bp.features.txt > dwt20bp.dwtonly.features.txt
cut -f 1,1589-2071 dwt20bp.features_overlap.txt > dwt20bp.dwtonly.features_overlap.txt
cut -f 1588-2070 dwt20bp.features_overlap_noSampleIDs.txt > dwt20bp.dwtonly.features_overlap_noSampleIDs.txt
–> only include the DWT features of the 20 bp window overlapping the sgRNA (not those of the flanking windows)
# Keep only the first 1750 columns of the combined feature table.
# NOTE(review): presumably column 1750 is the last DWT column of the sgRNA-overlapping window,
# so the flanking-window columns are dropped -- confirm against the header.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
cut -f 1-1750 ecoli.20sliding.all.features.dcast.txt > ecoli.20overlap.all.features.dcast.txt
# python
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
# Correlation filter: drop every feature whose Pearson correlation with an EARLIER feature
# column is >= 0.9, then write the surviving feature columns to CSV.
data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.txt')
# Columns 0 and 1 of this table are the sample ID (sgRNAID) and the response (cut.score);
# every remaining column is a feature. The previous `2:-1` slice also dropped the last feature
# column, and the first feature was label-encoded -- both look like leftovers from a template
# whose first column was a class label and whose last column was empty.
# NOTE(review): confirm the column layout against the file header before re-running.
data = data.iloc[:, 2:]
corr = data.corr()
# Column j is dropped when any column i < j has corr[i, j] >= 0.9 (strict upper triangle).
# This is the same rule as the nested i/j loop, evaluated in one vectorized step; NaN
# correlations compare as False and therefore never cause a drop.
columns = ~np.triu(corr.values >= 0.9, k=1).any(axis=0)
selected_columns = data.columns[columns]
data = data[selected_columns]
data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.pythoncorrelation.csv")
# R
# Load the Python module and activate the conda environment used for the next step.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.pythoncorrelation.csv > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.pythoncorrelation.header.txt
# Restrict the dwt20bp feature table to the columns that survived the Python correlation
# filter and write the three iRF input variants (two with sample IDs, one without).
#
# Input: the one-line header file extracted from the filter's CSV output (see the commented
# `head -n 1` command above) and dwt20bp.features.txt.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
df.noncor <- read.delim("ecoli.20overlap.all.features.dcast.pythoncorrelation.header.txt", header=F, sep=",")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
df <- read.delim("dwt20bp.features.txt", header=T, sep="\t")
# Flatten the single header row into a plain character vector of column names.
keep.names <- as.character(unlist(df.noncor[1,]))
df.subset <- df[ , names(df) %in% keep.names, drop=FALSE]
# The header list holds feature names only (the filter dropped the ID column), so every column
# of df.subset is a feature. The previous `df.subset[,2:ncol(df.subset)]` therefore discarded
# the first selected feature rather than an ID column.
df.mat <- as.matrix(df.subset)
# Name the ID column explicitly; cbind(as.data.frame(df$sgRNAID), ...) wrote the header "df$sgRNAID".
df.mat.id <- cbind(data.frame(sgRNAID = df$sgRNAID), df.mat)
write.table(df.mat.id, "dwt20bp.sgRNA.noncor2.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "dwt20bp.sgRNA.noncor2.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "dwt20bp.sgRNA.noncor2.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
–> add in pvalue analysis to removal of features pre-processing
# Activate the conda environment used for the p-value based feature filtering below.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# python
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
# Feature pre-filtering in two stages: (1) drop features that are >= 0.9 correlated with an
# earlier feature, (2) backward elimination on OLS p-values against cut.score.
# statsmodels.api is used because current statsmodels releases do not expose OLS through
# statsmodels.formula.api.
import statsmodels.api as sm

data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.txt')
# Columns 0 and 1 of this table are the sample ID (sgRNAID) and the response (cut.score);
# every remaining column is a feature. Previously the table was sliced `2:-1` and its first
# remaining column was label-encoded and used as Y, i.e. the regression target was a feature
# rather than cut.score, and the last feature was dropped.
# NOTE(review): confirm the column layout against the file header before re-running.
target = data.iloc[:, 1].values
features = data.iloc[:, 2:]

# Stage 1: column j is dropped when any column i < j has corr[i, j] >= 0.9.
corr = features.corr()
columns = ~np.triu(corr.values >= 0.9, k=1).any(axis=0)
selected_columns = features.columns[columns].values
features = features[selected_columns]


def backwardElimination(x, Y, sl, columns):
    """Backward elimination of predictors by OLS p-value.

    Repeatedly fits ``sm.OLS(Y, x)`` and removes the single predictor with the largest
    p-value, until every remaining p-value is <= ``sl`` or no predictors are left.

    Args:
        x: 2-D numpy array of predictors (rows = samples, columns = predictors).
        Y: 1-D array of responses, one per row of ``x``.
        sl: significance level; predictors with p-value > ``sl`` are eliminated.
        columns: array of predictor names aligned with the columns of ``x``.

    Returns:
        Tuple ``(x, columns)`` holding only the retained predictors and their names.

    Notes:
        No intercept column is added; ``x`` is passed to OLS as given.
        NOTE(review): confirm that a no-intercept model is intended.
    """
    while x.shape[1] > 0:
        pvalues = np.asarray(sm.OLS(Y, x).fit().pvalues, dtype=float)
        # np.argmax returns the position of a NaN when one is present, so predictors with an
        # undefined p-value are removed first; otherwise this is the least significant one.
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= sl:
            break  # every remaining predictor is significant at level sl
        # Delete exactly one predictor per fit so the indices always refer to the current x.
        x = np.delete(x, worst, 1)
        columns = np.delete(columns, worst)
    return x, columns


SL = 0.05
# Stage 2: eliminate non-significant predictors, with cut.score as the response.
data_modeled, selected_columns = backwardElimination(features.values, target, SL, selected_columns)
data = pd.DataFrame(data = data_modeled, columns = selected_columns)
data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.pythoncorrelationpval.csv")
# R
# Load the Python module and activate the conda environment used for the next step.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.pythoncorrelationpval.csv > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.pythoncorrelationpval.header.txt
# Restrict the dwt20bp feature table to the columns that survived the Python correlation +
# p-value filter and write the three iRF input variants (two with sample IDs, one without).
#
# Input: the one-line header file extracted from the filter's CSV output (see the commented
# `head -n 1` command above) and dwt20bp.features.txt.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
df.noncor <- read.delim("ecoli.20overlap.all.features.dcast.pythoncorrelationpval.header.txt", header=F, sep=",")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
df <- read.delim("dwt20bp.features.txt", header=T, sep="\t")
# Flatten the single header row into a plain character vector of column names.
keep.names <- as.character(unlist(df.noncor[1,]))
df.subset <- df[ , names(df) %in% keep.names, drop=FALSE]
# The header list holds feature names only (the filter dropped the ID column), so every column
# of df.subset is a feature. The previous `df.subset[,2:ncol(df.subset)]` therefore discarded
# the first selected feature rather than an ID column.
df.mat <- as.matrix(df.subset)
# Name the ID column explicitly; cbind(as.data.frame(df$sgRNAID), ...) wrote the header "df$sgRNAID".
df.mat.id <- cbind(data.frame(sgRNAID = df$sgRNAID), df.mat)
write.table(df.mat.id, "dwt20bp.sgRNA.noncorpval.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "dwt20bp.sgRNA.noncorpval.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "dwt20bp.sgRNA.noncorpval.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
# Create the four iRF run directories and generate their submission scripts with the
# iRF_LOOP_SetUp_CrossLayer.py builder.
irf_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features
irf_builder=/gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# -p creates parents and tolerates directories that already exist (dwt20bp is also created
# before the feature export), so this step can be re-run without errors.
mkdir -p "$irf_root/dwt20bp/dwtonly" "$irf_root/dwt20bp.noncor/dwtonly"
module load python/3.7-anaconda3

#######################################
# Run the iRF set-up builder inside one run directory.
# Globals:   irf_root, irf_builder (read)
# Arguments: $1 - run directory relative to irf_root (e.g. dwt20bp.noncor/dwtonly).
#                 The run name is $1 with "/" replaced by "."; the feature file is
#                 <run name>.features.txt in $irf_root/dwt20bp, and all runs share
#                 dwt20bp.score.txt as the Y file.
# Returns:   non-zero without running the builder if the directory cannot be entered.
#######################################
irf_setup() {
  local run_dir=$1
  local run_name=${run_dir//\//.}
  cd "$irf_root/$run_dir" || return
  python "$irf_builder" --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 \
    --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName "$run_name" \
    --bypass --Prediction \
    "$irf_root/dwt20bp/$run_name.features.txt" \
    "$irf_root/dwt20bp/dwt20bp.score.txt"
}

irf_setup dwt20bp
irf_setup dwt20bp.noncor
irf_setup dwt20bp/dwtonly
irf_setup dwt20bp.noncor/dwtonly
# Summit
# Submit the generated iRF job scripts for the four runs, one stage at a time.
irf_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features
cd "$irf_root/dwt20bp"
module load python/3.7.0-anaconda3-5.3.0
# Run directories relative to irf_root; the run name inside each submit-script file name is
# the directory with "/" replaced by ".".
run_dirs=(dwt20bp dwt20bp/dwtonly dwt20bp.noncor dwt20bp.noncor/dwtonly)
# full
for run_dir in "${run_dirs[@]}"; do
  bsub "$irf_root/$run_dir/Submits/submit_full_${run_dir//\//.}_0.sh"
done
# train
for run_dir in "${run_dirs[@]}"; do
  bsub "$irf_root/$run_dir/Submits/submit_train_${run_dir//\//.}_0.sh"
done
# once the train submissions are done run the test submissions
# test
for run_dir in "${run_dirs[@]}"; do
  bsub "$irf_root/$run_dir/Submits/submit_test_${run_dir//\//.}_0.sh"
done
# Andes
# Post-process the dwt20bp run: the script is invoked from the run directory with the YNames
# file and the run name (the --RunName used at set-up) as positional arguments.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp
# R2 = -0.09732972147525264
# sort -k 3,3n cut.score_top95.txt | tail
# gene.dwtd22.y cut.score 0.005088947000780752
# p20homo_energyraw cut.score 0.0053181865441389595
# CCsgRNA.raw cut.score 0.005339999896960374
# p18xz_quadrupoleraw cut.score 0.005767609956968676
# p20homo1_energyraw cut.score 0.008388629530967358
# p20lumo1_energyraw cut.score 0.008781497697448083
# sgRNA.tempsgRNA.raw cut.score 0.010472801419079218
# sgRNA.gcsgRNA.raw cut.score 0.01124422807868197
# p20xz_quadrupoleraw cut.score 0.012476568605244256
# p20yz_quadrupoleraw cut.score 0.015935850268443418
# sort -k 3,3n importanceScores.txt | tail
# p17rot_temp_yraw 1.0049606976282138e-05 9.65787133716791e-06
# p13x_dipoleraw 9.819701433467067e-06 9.688540149124728e-06
# p1relativenum_Oatomsraw 9.817232827622205e-06 9.691609695980317e-06
# p12lumo1_energyraw 9.87789110556229e-07 9.718214016923348e-07
# p11num_Hatomsraw 2.591322097373495e-08 9.80395820959332e-10
# p7relativenum_Hatomsraw 9.66627762124357e-07 9.85548931808876e-07
# p1relativenum_aromaticbondsraw 1.014031472881299e-05 9.899151622784594e-06
# p5homo_lumo_energygapraw 1.0649382174876468e-06 9.936602694948545e-07
# p7homo1_energyraw 9.646178646683486e-07 9.951558502852053e-07
# p11relativenum_Hatomsraw 1.0564708275856993e-06 9.997236209398945e-07
########### which file should I be looking at????
#***** NOTE ***** sort is not accounting for scientific notation in the file --> sort -k#rg usually works, where # is the column you want to sort on
# The two files will differ slightly: the topVarEdges file does not contain all of the edges and comes from a single full iRF run, whereas the importanceScores file accounts for all n iRF runs and contains every edge > 0. The difference should not matter when looking at the top features by importance.
# Post-process the dwt20bp.dwtonly run (same options as the other runs; only the run
# directory and run name differ).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwtonly
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.dwtonly
# R2 = -0.1015313462431008
# sort -k 3,3n cut.score_top95.txt | tail
# ipd.dwtd2 cut.score 0.005893454386658152
# structure.dwtd2 cut.score 0.006590363506278453
# structure.dwtd2.x cut.score 0.006662768167757329
# structure.dwtd2.y cut.score 0.0066847033594927595
# structure.dwtd3.y cut.score 0.0071422352567631145
# structure.dwtd3.x cut.score 0.0071579846959233815
# structure.dwtd3 cut.score 0.0072963517143558474
# structure.dwtd1.y cut.score 0.007616927628919407
# structure.dwtd1.x cut.score 0.007630795687112113
# structure.dwtd1 cut.score 0.00767058974515326
# Post-process the dwt20bp.noncor run (same options as the other runs; only the run
# directory and run name differ).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor
# R2 = -0.09723670364309422
# sort -k 3,3n cut.score_top95.txt | tail
# ipd.dwtd2.x cut.score 0.011356284955400845
# p19xy_polarizabilityraw cut.score 0.011524537394544619
# structure.dwtd2 cut.score 0.011864545158752052
# gc.dwtd2 cut.score 0.012866982824906632
# gc.dwtd1 cut.score 0.013695956441158584
# gene.dwtd22.x cut.score 0.01477263820156635
# p18yz_quadrupoleraw cut.score 0.015289669859090929
# sgRNA.tempsgRNA.raw cut.score 0.02229302100534027
# p20xz_quadrupoleraw cut.score 0.026460316100111493
# p20yz_quadrupoleraw cut.score 0.03246534475919344
# Post-process the dwt20bp.noncor.dwtonly run (same options as the other runs; only the run
# directory and run name differ).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor/dwtonly
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor.dwtonly
# R2 = -0.10006915839387324
# sort -k 3,3n cut.score_top95.txt | tail
# ipd.dwtd6.x cut.score 0.017480706716070438
# ipd.dwtd1.x cut.score 0.017755258595007914
# ipd.dwtd2.x cut.score 0.017933488998746427
# structure.dwtd4.x cut.score 0.018382989216341565
# gc.dwtd3.x cut.score 0.020268036160605615
# structure.dwtd2 cut.score 0.02076919322860375
# gc.dwtd2 cut.score 0.02196086824991708
# structure.dwtd3 cut.score 0.02256020187337726
# structure.dwtd1 cut.score 0.023827078994586668
# gc.dwtd1 cut.score 0.02962434751522795
# pearson correlation
# Pearson correlation between observed cut.score and the iRF test-set predictions of one
# fold/set (fold9, Set4) for each of the four runs. The numbers in the comments are the values
# recorded when this was run.
# --- dwt20bp, all test samples
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.2651749
# correlation - by Cas9 type
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
# The three tables are combined by row position.
# NOTE(review): assumes IDs, predictions and Y values are listed in the same row order -- confirm.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
# Split the sample ID on "_"; the third piece ("group") is used below to select each Cas9 type.
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
# 23628
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.2651749
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.7208258
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6570744
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.4569226
# pearson correlation
# --- dwt20bp.dwtonly
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwtonly/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.dwtonly_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.2462201
# pearson correlation
# --- dwt20bp.noncor
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.2636917
# pearson correlation
# --- dwt20bp.noncor.dwtonly
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor/dwtonly/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor.dwtonly_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.2465025
#### different method for removing highly correlated features...
# Full iRF cycle for the dwt20bp.noncor2 feature set: create the run directory, generate the
# submission scripts, submit the full/train/test jobs, then post-process.
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2
module load python/3.7-anaconda3
# Andes
# NOTE(review): the feature file passed below is dwt20bp.noncor2.features.txt, whereas the R
# step that builds this feature set writes dwt20bp.sgRNA.noncor2.features.txt -- confirm the
# file was renamed or copied before running.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.noncor2 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2/Submits/submit_full_dwt20bp.noncor2_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2/Submits/submit_train_dwt20bp.noncor2_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2/Submits/submit_test_dwt20bp.noncor2_0.sh
# Andes
# Post-processing is run from the run directory with the YNames file and run name as arguments.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor2
# -0.09706323437274336
# sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20xz_quadrupoleraw cut.score 0.03430781915575253
# sgRNA.gcsgRNA.raw cut.score 0.022643938491274534
# p18xz_quadrupoleraw cut.score 0.017385625795686453
# p20homo_energyraw cut.score 0.016573443712848534
# gene.dwtd22.x cut.score 0.014550040428767322
# p19xy_polarizabilityraw cut.score 0.013047439467126136
# gc.dwtd2.x cut.score 0.012500423935908321
# gc.dwtd1.x cut.score 0.0119529382764896
# structure.dwtd2.x cut.score 0.011650588858070564
# ipd.dwtd2.x cut.score 0.011237346169474187
# Pearson correlation for the dwt20bp.noncor2 run on the fold9 / Set4 test
# split: first over all test rows, then separately for each Cas9 variant.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor2_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.2642945

# Attach the test-set sample IDs so rows can be grouped by Cas9 variant.
id <- read.delim("set4_test_SampleIDs.txt", header = FALSE, sep = "\t")
names(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
# sgRNAID is split on "_" into three fields; the third is the variant label.
id.pred.y.group <- separate(id.pred.y, sgRNAID, into = c("sgRNA", "ID", "group"), sep = "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method = "pearson")
# 0.2642945

# -- Cas9 --
pred.Cas9 <- subset(id.pred.y.group, group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method = "pearson")
# 0.7196688

# -- eSpCas9 --
pred.eSpCas9 <- subset(id.pred.y.group, group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method = "pearson")
# 0.65628

# -- recACas9 --
pred.recACas9 <- subset(id.pred.y.group, group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method = "pearson")
# 0.456601
##### run above with just cas9
# iRF run "dwt20bp.noncor2.cas9": same feature set as dwt20bp.noncor2 but
# restricted to rows whose line contains "_Cas9".
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
# Keep the header line (NR==1) plus every line containing the substring "_Cas9".
# The pattern is matched against the whole line, not only the ID column.
awk 'NR==1 || /_Cas9/' dwt20bp.score.txt > dwt20bp.cas9.score.txt
# The *_overlap copy is identical; the *_overlap_noSampleIDs file is the same
# table with its first column removed.
cp dwt20bp.cas9.score.txt dwt20bp.cas9.score_overlap.txt
cut --complement -f 1 dwt20bp.cas9.score.txt > dwt20bp.cas9.score_overlap_noSampleIDs.txt
# Same three files for the features table.
awk 'NR==1 || /_Cas9/' dwt20bp.noncor2.features.txt > dwt20bp.noncor2.cas9.features.txt
cp dwt20bp.noncor2.cas9.features.txt dwt20bp.noncor2.cas9.features_overlap.txt
cut --complement -f 1 dwt20bp.noncor2.cas9.features.txt > dwt20bp.noncor2.cas9.features_overlap_noSampleIDs.txt
# Create the run directory for the Cas9-only run.
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
module load python/3.7-anaconda3
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
# Set up the run; the last two positional arguments are the Cas9-only features
# file and the Cas9-only score file created above.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.noncor2.cas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.cas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.cas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
module load python/3.7.0-anaconda3-5.3.0
# Submit order: full, train, then test once train has finished.
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/Submits/submit_full_dwt20bp.noncor2.cas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/Submits/submit_train_dwt20bp.noncor2.cas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/Submits/submit_test_dwt20bp.noncor2.cas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
# Post-process the finished run, requesting MAE, MAEA, MSE and R2 accuracy.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor2.cas9
# Recorded result (presumably the R2 value -- TODO confirm which metric):
# 0.23064617681729335
# Ten largest entries by column 3 (general-numeric, descending); recorded output follows.
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20xz_quadrupoleraw cut.score 0.029686817358311596
# gene.dwtd22.x cut.score 0.029489538370503877
# p20homo_lumo_energygapraw cut.score 0.02729263038651026
# sgRNA.gcsgRNA.raw cut.score 0.02561598838431653
# p18xz_quadrupoleraw cut.score 0.023110545239158516
# p20homo_energyraw cut.score 0.013095906817271597
# p19xy_polarizabilityraw cut.score 0.012551967356934784
# p15.CCsgRNA.raw cut.score 0.012249350565404618
# ipd.dwtd2.x cut.score 0.011976463258548734
# rna.dwtd21.x cut.score 0.011755281398032221
# Pearson correlation of observed vs predicted cut.score for the
# dwt20bp.noncor2.cas9 run (fold9 / Set4 test split).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor2.cas9_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.481121
#### Noncor + pval filter
# iRF run on the Cas9-only rows of dwt20bp.sgRNA.noncorpval.features.txt.
# NOTE(review): this section does not create its own run directory. It works in
# all.features/dwt20bp.noncor2.cas9 and passes --RunName dwt20bp.noncor2.cas9,
# i.e. the same directory and run name as the "run above with just cas9"
# section; only the features file differs. Confirm whether reusing (and
# presumably overwriting) that run was intended.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
# Keep the header line plus every line containing "_Cas9"; then make the
# *_overlap copy and the first-column-stripped *_overlap_noSampleIDs file.
awk 'NR==1 || /_Cas9/' dwt20bp.sgRNA.noncorpval.features.txt > dwt20bp.sgRNA.noncorpval.cas9.features.txt
cp dwt20bp.sgRNA.noncorpval.cas9.features.txt dwt20bp.sgRNA.noncorpval.cas9.features_overlap.txt
cut --complement -f 1 dwt20bp.sgRNA.noncorpval.cas9.features.txt > dwt20bp.sgRNA.noncorpval.cas9.features_overlap_noSampleIDs.txt
module load python/3.7-anaconda3
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
# Set up the run with the noncorpval Cas9 features and the Cas9-only score file.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.noncor2.cas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.sgRNA.noncorpval.cas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.cas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
module load python/3.7.0-anaconda3-5.3.0
# Submit order: full, train, then test once train has finished.
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/Submits/submit_full_dwt20bp.noncor2.cas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/Submits/submit_train_dwt20bp.noncor2.cas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/Submits/submit_test_dwt20bp.noncor2.cas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
# Post-process the finished run, requesting MAE, MAEA, MSE and R2 accuracy.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor2.cas9
# Recorded result (presumably the R2 value -- TODO confirm which metric):
# 0.2305409701687254
# Ten largest entries by column 3 (general-numeric, descending); recorded output follows.
sort -k3rg topVarEdges/cut.score_top95.txt | head
# gene.dwtd22.x cut.score 0.02935066938083802
# p20xz_quadrupoleraw cut.score 0.027829350521036314
# p20homo_lumo_energygapraw cut.score 0.0271765372244849
# sgRNA.gcsgRNA.raw cut.score 0.025397305484077294
# p18xz_quadrupoleraw cut.score 0.022912737085578515
# p20homo_energyraw cut.score 0.015014568144767555
# p15.CCsgRNA.raw cut.score 0.012289824112700985
# p19xy_polarizabilityraw cut.score 0.012272476947421796
# rna.dwtd21.x cut.score 0.011863350020268486
# p19rot_constants_zraw cut.score 0.011810069441196698
# Pearson correlation of observed vs predicted cut.score, read from the
# dwt20bp.noncor2.cas9 run directory (fold9 / Set4 test split).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor2.cas9_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.4809575
##### run with just the DWT that overlaps the sgRNA (all cas types & just cas9)
# Two iRF runs are prepared and processed side by side:
#   dwt20bp.sgRNA.noncor2       -- dwt20bp.sgRNA.noncor2.features.txt vs dwt20bp.score.txt
#   dwt20bp.sgRNA.noncor2.cas9  -- the "_Cas9" rows of those features vs dwt20bp.cas9.score.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
# Cas9-only features: header line plus every line containing "_Cas9", then the
# *_overlap copy and the first-column-stripped *_overlap_noSampleIDs file.
awk 'NR==1 || /_Cas9/' dwt20bp.sgRNA.noncor2.features.txt > dwt20bp.sgRNA.noncor2.cas9.features.txt
cp dwt20bp.sgRNA.noncor2.cas9.features.txt dwt20bp.sgRNA.noncor2.cas9.features_overlap.txt
cut --complement -f 1 dwt20bp.sgRNA.noncor2.cas9.features.txt > dwt20bp.sgRNA.noncor2.cas9.features_overlap_noSampleIDs.txt
# One run directory per run.
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2.cas9
module load python/3.7-anaconda3
# Andes
# Set up each run from inside its own directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.sgRNA.noncor2 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.sgRNA.noncor2.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.score.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2.cas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.sgRNA.noncor2.cas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.sgRNA.noncor2.cas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.cas9.score.txt
# Summit
# Submissions below use absolute script paths, so they are issued from the
# shared all.features/ directory for both runs.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2/Submits/submit_full_dwt20bp.sgRNA.noncor2_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2.cas9/Submits/submit_full_dwt20bp.sgRNA.noncor2.cas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2/Submits/submit_train_dwt20bp.sgRNA.noncor2_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2.cas9/Submits/submit_train_dwt20bp.sgRNA.noncor2.cas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2/Submits/submit_test_dwt20bp.sgRNA.noncor2_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2.cas9/Submits/submit_test_dwt20bp.sgRNA.noncor2.cas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
# Post-process run 1 (all Cas types).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.sgRNA.noncor2
# Recorded result (presumably the R2 value -- TODO confirm which metric):
# -0.09701249294636369
# Ten largest entries by column 3 (general-numeric, descending); recorded output follows.
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20xz_quadrupoleraw cut.score 0.02970763111253204
# sgRNA.gcsgRNA.raw cut.score 0.022546575545529038
# p20homo_energyraw cut.score 0.019438982603852864
# p18xz_quadrupoleraw cut.score 0.01760964584559493
# gene.dwtd22.x cut.score 0.014608906686527283
# gc.dwtd2.x cut.score 0.012553723773347972
# gc.dwtd1.x cut.score 0.012068678836160345
# p19xy_polarizabilityraw cut.score 0.0120464817437198
# p19rot_constants_yraw cut.score 0.011487778932387364
# structure.dwtd2.x cut.score 0.011485157452920702
# Post-process run 2 (Cas9 only).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2.cas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.sgRNA.noncor2.cas9
# Recorded result (presumably the R2 value -- TODO confirm which metric):
# 0.23060436117200434
sort -k3rg topVarEdges/cut.score_top95.txt | head
# gene.dwtd22.x cut.score 0.02950429703086881
# p20xz_quadrupoleraw cut.score 0.02647631501114429
# p20homo_lumo_energygapraw cut.score 0.02586002648307124
# sgRNA.gcsgRNA.raw cut.score 0.02544610820137214
# p18xz_quadrupoleraw cut.score 0.024142632725578022
# p20homo_energyraw cut.score 0.01793906689532132
# p19rot_constants_zraw cut.score 0.012390152750953683
# p15.CCsgRNA.raw cut.score 0.012146528343244091
# rna.dwtd21.x cut.score 0.01193224991570596
# ipd.dwtd2.x cut.score 0.01163594499941648
# Pearson correlation of observed vs predicted cut.score on the fold9 / Set4
# test split, for the all-Cas-types run and then the Cas9-only run.

# -- dwt20bp.sgRNA.noncor2 --
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.sgRNA.noncor2_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.2643666

# -- dwt20bp.sgRNA.noncor2.cas9 --
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.sgRNA.noncor2.cas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.sgRNA.noncor2.cas9_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.480452
# --> TODO: try the last feature set (dwt20bp.sgRNA.noncor2.cas9) with gradient boosting regression (GBR)
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J gbr
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 10:00:00
#SBATCH --mem-per-cpu=0
# SLURM batch script: activate the conda environment and run
# dwt.sliding.gbr.R non-interactively with R CMD BATCH from the dwt20bp
# feature directory.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R CMD BATCH is given a relative script name, so stop the job if the working
# directory cannot be entered instead of running from wherever the job started.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/ || exit 1
R CMD BATCH dwt.sliding.gbr.R
# Submit with:
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt.sliding.gbr.sh
# Interactive allocation used for this step:
# salloc -A SYB105 -N 2 -t 4:00:00 -p gpu
cd "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli"
# Keep the lines containing "_Cas9" and write their first two tab-separated
# columns to the Cas9 ID/score table.
grep '_Cas9' "Ecoli.allCas9.DWT.raw.onehot.tensor.dcast.regional2.txt" \
  | cut -f 1-2 \
  > "Ecoli.Cas9.sgRNAID.score.txt"
# Environment for the R session that follows.
module load python
source "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh"
conda activate "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test"
# Gradient boosting (caret method "gbm") on the Cas9-only
# dwt20bp.sgRNA.noncor2 feature table, scored against the Cas9 cut.score
# table. 75% of rows train the model; the remainder is held out.

# -- load features and scores --
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
df.feature <- read.delim("dwt20bp.sgRNA.noncor2.cas9.features.txt", header = TRUE, sep = "\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("Ecoli.Cas9.sgRNAID.score.txt", header = FALSE, sep = "\t")
names(df.dcast) <- c("df.sgRNAID", "cut.score")

# -- join on the ID column, then drop it so only cut.score + predictors remain --
library(dplyr)
df <- left_join(df.dcast, df.feature, by = "df.sgRNAID")
data <- df[, 2:ncol(df)]

# -- train / test split --
library(caret)
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]

# -- 5-fold cross-validation, repeated 10 times --
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)

# -- fit --
set.seed(825)
gbmFit1 <- train(
  cut.score ~ .,
  data = training,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE
)
gbmFit1
# Stochastic Gradient Boosting
#
# 30353 samples
# 866 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 24283, 24282, 24281, 24283, 24283, 24283, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.752900 0.1741312 8.020422
# 1 100 9.491935 0.2100688 7.755110
# 1 150 9.345047 0.2269003 7.598931
# 2 50 9.489438 0.2125873 7.753194
# 2 100 9.236948 0.2420648 7.484005
# 2 150 9.114203 0.2587267 7.354661
# 3 50 9.346438 0.2309019 7.604840
# 3 100 9.113106 0.2598522 7.355787
# 3 150 9.003248 0.2749639 7.242314
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.

# -- relative influence of the top predictors --
head(summary(gbmFit1))
# var rel.inf
# p20homo_energyraw p20homo_energyraw 7.791843
# p19.GGsgRNA.raw p19.GGsgRNA.raw 6.054626
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 5.987518
# gene.dwtd22.x gene.dwtd22.x 5.116246
# p20xz_quadrupoleraw p20xz_quadrupoleraw 4.283596
# p19rot_constants_yraw p19rot_constants_yraw 3.972413

# -- held-out evaluation: rows with any NA are removed before predicting --
test.df <- na.omit(testing)
pred <- predict(gbmFit1, newdata = test.df)
pred.df <- data.frame(pred.score = pred, exp.score = test.df$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.5255129
# --> next: use the log2-normalized score instead of the raw cut.score (Cas9 only)
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm
R
### add the normalized score values
# Builds the iRF input tables for the "dwt20bp.norm" run: the sliding-window
# feature table joined to the normalized score from DataS6.txt.
#
# After the join, the columns of `df` are:
#   1            sgRNAID
#   2            cut.score (the raw score carried by the feature table)
#   3..(ncol-1)  features
#   ncol         normalized score (the column joined in from DataS6.txt)
#
# Fixes relative to the earlier version of this block:
#   * `3:ncol(df)-1` parses as `(3:ncol(df)) - 1`, i.e. 2:(ncol(df)-1), so the
#     raw cut.score column was written into the feature table.
#   * the noSampleIDs feature matrix used 3:ncol(df) and therefore contained
#     the normalized score itself.
#   * the noSampleIDs score used column 2 (raw cut.score) rather than the
#     normalized score.
#   * dplyr is loaded here because left_join() is used right after R is started.
# NOTE(review): results recorded for the dwt20bp.norm run with the earlier
# version would need to be regenerated from these corrected files.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.sep.region.id <- read.table("ecoli.20sliding.all.features.dcast.txt", header=T, sep="\t")
norm <- read.table("DataS6.txt", header=T, sep="\t", stringsAsFactors = F)
# IDs in the feature table carry a "_Cas9" suffix; paste0() takes no `sep`.
norm$sgRNAID <- paste0(norm$sgRNA, "_Cas9")
# norm[,c(5,3)]: column 5 is expected to be the sgRNAID just added and column 3
# the normalized score -- assumes DataS6.txt has four columns; TODO confirm.
df.sep.region.id.norm <- left_join(df.sep.region.id, norm[,c(5,3)], by="sgRNAID")
df <- na.omit(df.sep.region.id.norm)
# 40467
feature.cols <- 3:(ncol(df) - 1)
score.col <- ncol(df)
## sample ID
df.features <- df[, c(1, feature.cols)]
df.score <- df[, c(1, score.col)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm")
write.table(df.features, "dwt20bp.norm.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt20bp.norm.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.features, "dwt20bp.norm.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt20bp.norm.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[, feature.cols])
df.score <- as.matrix(data.frame(df[, score.col]))
# The header stays "cut.score" as before, although the values are now the
# normalized score. NOTE(review): the with-ID score files above keep the
# DataS6.txt column name instead -- confirm which header downstream tools expect.
colnames(df.score) <- "cut.score"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm")
write.table(df.features, "dwt20bp.norm.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt20bp.norm.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
### remove correlated features
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# python
# Correlation filter: for every pair of columns (i, j) with i < j whose
# correlation (DataFrame.corr() default) is >= 0.9, column j is dropped.
# The surviving columns are written out as CSV.
# The nested loop needs its indentation to be valid Python.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.all.features.dcast.txt')
# Drop the first two columns and the last column.
# NOTE(review): the slice end of -1 removes the final column as well -- confirm
# that the last column of this table is not a feature that should be kept.
data = data.iloc[:,2:-1]
# The first remaining column is label-encoded so it can enter the correlation.
label_encoder = LabelEncoder()
data.iloc[:,0] = label_encoder.fit_transform(data.iloc[:,0]).astype('float64')
corr = data.corr()
# Plain array for the pairwise scan: same values, without per-cell .iloc lookups.
corr_values = corr.values
n_cols = corr.shape[0]
columns = np.full((n_cols,), True, dtype=bool)
for i in range(n_cols):
    for j in range(i+1, n_cols):
        # Only positive correlations at or above 0.9 trigger removal.
        if corr_values[i,j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = data.columns[columns]
data = data[selected_columns]
# to_csv() default: comma-separated, with the row index as an unnamed first column.
data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.norm.features.dcast.pythoncorrelation.txt")
# R
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.norm.features.dcast.pythoncorrelation.txt > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.norm.features.dcast.pythoncorrelation.header.txt
# Restrict the dwt20bp.norm feature table to the column names listed in the
# one-line, comma-separated header file of the Python correlation filter, then
# write the three iRF input variants (with IDs, *_overlap, *_overlap_noSampleIDs).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
# Read every field as character so names are neither converted to factors
# (default on R < 4.0) nor type-converted; either would make the %in% match
# below compare against something other than the literal column names.
df.noncor <- read.delim("ecoli.20sliding.norm.features.dcast.pythoncorrelation.header.txt", header=F, sep=",", colClasses="character")
keep.names <- unlist(df.noncor[1,], use.names=FALSE)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm/")
df <- read.delim("dwt20bp.norm.features.txt", header=T, sep="\t")
df.subset <- df[ , which(names(df) %in% keep.names)]
# NOTE(review): this drops the first matched column. sgRNAID is re-attached
# from `df` below, so confirm that the first matched column is not a feature
# that should be kept.
df.mat <- as.matrix(df.subset[,2:ncol(df.subset)])
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
write.table(df.mat.id, "dwt20bp.norm.sgRNA.noncorpval.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "dwt20bp.norm.sgRNA.noncorpval.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "dwt20bp.norm.sgRNA.noncorpval.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
#### run iRF
# iRF run "dwt20bp.norm": dwt20bp.norm.sgRNA.noncorpval.features.txt against
# dwt20bp.norm.score.txt, both in the dwt20bp.norm directory.
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm
# Set up the run; the last two positional arguments are the features file and
# the score (Y) file.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.norm --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm/dwt20bp.norm.sgRNA.noncorpval.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm/dwt20bp.norm.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm
module load python/3.7.0-anaconda3-5.3.0
# Submit order: full, train, then test once train has finished.
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm/Submits/submit_full_dwt20bp.norm_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm/Submits/submit_train_dwt20bp.norm_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm/Submits/submit_test_dwt20bp.norm_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm
# Post-process the finished run, requesting MAE, MAEA, MSE and R2 accuracy.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.norm
# R2 = 0.23072261495996757
# Ten largest entries by column 3 (general-numeric, descending); recorded output follows.
sort -k3rg topVarEdges/cut.score_top95.txt | head
# gene.dwtd22.x cut.score 0.029307027566664888
# p20xz_quadrupoleraw cut.score 0.028231805557055244
# p20homo_lumo_energygapraw cut.score 0.02773366066101117
# sgRNA.gcsgRNA.raw cut.score 0.02566260776755478
# p18xz_quadrupoleraw cut.score 0.02384074204918078
# p20homo_energyraw cut.score 0.013933004898916046
# p15.CCsgRNA.raw cut.score 0.012081401926499065
# rna.dwtd21.x cut.score 0.011928837881644295
# ipd.dwtd2.x cut.score 0.01179853088649762
# p19rot_constants_yraw cut.score 0.011657646056386582
# Pearson correlation of the cut.score column in the Y test file against the
# predictions of the dwt20bp.norm run (fold9 / Set4 test split).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.norm/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.norm_Set4_test.prediction", header = TRUE, sep = "\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header = TRUE, sep = "\t")
cor(y$cut.score, pred$Predictions., method = "pearson")
# 0.4808251
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Adds a gene-distance ("location") feature to the 20 bp sliding-window feature
# table and writes ecoli.20sliding.location.all.features.dcast.txt.
# The sliding-window BED file (ecoli.20bp.sliding.bed) was previously read into
# `window` here but never used in this block, so that read has been removed.
library(dplyr)
library(tidyr)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Keep chr, start, end, sgRNA and cut.score.
score.df <- score[,c(1:4,8)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
sgRNA.genes <- read.table("sgRNA.gene.closest.bed", header=F, sep="\t", stringsAsFactors = F)
# Column 4 is used as the sgRNA name and column 14 as the gene distance.
sgRNA.genes.df <- sgRNA.genes[,c(4,14)]
colnames(sgRNA.genes.df) <- c("sgRNA", "gene.distance")
score.location <- left_join(score.df, sgRNA.genes.df, by=c("sgRNA"))
# A constant scale of 0 is appended so the feature name becomes "gene.distance0".
score.location$scale <- 0
# Long format over sgRNA, cut.score, gene.distance and scale (columns 4:7).
df.melt <- melt(score.location[,4:7], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# Back to wide format: one row per sgRNA + cut.score, duplicates averaged.
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40468
write.table(df.dcast.na, "ecoli.sgRNA.location.dcast.txt", quote=F, row.names=F, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.sgRNA.location.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# Split the sgRNA name on "_" into sgRNA + ID, then keep sgRNA, ID and the
# distance column (cut.score, column 3, is dropped here).
df.dcast.sep <- df.dcast %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.sep <- df.dcast.sep[,c(1,2,4)]
df <- read.delim("ecoli.20sliding.all.features.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# sgRNAID has three "_"-separated fields; the third is kept as `type`.
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
# Only rows present in both tables survive the join.
df.location <- inner_join(df.sep, df.dcast.sep, by=c("sgRNA", "ID"))
df.location.id <- df.location %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.location.id)
# 118140
write.table(df.location.id, "ecoli.20sliding.location.all.features.dcast.txt", quote=F, row.names=F, sep="\t")
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# Write the iRF input tables for the "location" feature set: features and
# score, each with sample IDs (plain and *_overlap copies) and without
# (*_overlap_noSampleIDs).

# Tab-separated output, no quoting, no row names -- shared by every file below.
write.tsv <- function(tbl, path) {
  write.table(tbl, path, quote = FALSE, row.names = FALSE, sep = "\t")
}

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.location.all.features.dcast.txt", header = TRUE, sep = "\t")
df <- na.omit(df)

## sample ID
# Column 1 is the ID, column 2 the score, columns 3.. the features.
df.features <- df[, c(1, 3:ncol(df))]
df.score <- df[, 1:2]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
write.tsv(df.features, "dwt20bp.location.features_overlap.txt")
write.tsv(df.score, "dwt20bp.location.score_overlap.txt")
write.tsv(df.features, "dwt20bp.location.features.txt")
write.tsv(df.score, "dwt20bp.location.score.txt")

## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[, c(3:ncol(df))])
df.score <- as.matrix(data.frame(df[, 2]))
colnames(df.score) <- "cut.score"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
write.tsv(df.features, "dwt20bp.location.features_overlap_noSampleIDs.txt")
write.tsv(df.score, "dwt20bp.location.score_overlap_noSampleIDs.txt")
# python
# Correlation filter: for every pair of columns (i < j) whose correlation is
# >= 0.9, column j is flagged for removal; the surviving columns are written
# to a CSV. The nested loop below had lost its indentation in these notes and
# could not be run as written; the nesting is restored here.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.location.all.features.dcast.txt')
# Keep columns 3..second-to-last (drops the first two columns and the final one).
# NOTE(review): the "-1" also excludes the last column of the table from the
# filter and from the output -- confirm that this is intended.
data = data.iloc[:,2:-1]
# Label-encode the first remaining column and store it as float64.
label_encoder = LabelEncoder()
data.iloc[:,0] = label_encoder.fit_transform(data.iloc[:,0]).astype('float64')
corr = data.corr()
# One keep/drop flag per column of the correlation matrix; all start as "keep".
columns = np.full((corr.shape[0],), True, dtype=bool)
# Walk the upper triangle only, so each pair is visited once and the
# lower-indexed column of a correlated pair is the one that is kept.
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = data.columns[columns]
data = data[selected_columns]
# The CSV is written with the default index column, so its header starts with an empty field.
data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.location.all.features.dcast.pythoncorrelation.csv")
# R
# Restrict the location feature table to the columns that survived the Python
# correlation filter, using the header line of the filtered CSV as the keep-list.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The header file read below is produced by this (commented-out) shell command:
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.location.all.features.dcast.pythoncorrelation.csv > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.location.all.features.dcast.pythoncorrelation.header.txt
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
# Read the comma-separated header as a single data row (header=F).
df.noncor <- read.delim("ecoli.20sliding.location.all.features.dcast.pythoncorrelation.header.txt", header=F, sep=",")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
df <- read.delim("dwt20bp.location.features.txt", header=T, sep="\t")
# Keep only the columns of df whose names appear in the header row.
df.subset <- df[ , which(names(df) %in% df.noncor[1,])]
# NOTE(review): this drops the first matched column before building the matrix.
# If sgRNAID is not among the names in the header file, the dropped column is a
# feature rather than the ID -- confirm against the header contents.
df.mat <- as.matrix(df.subset[,2:ncol(df.subset)])
# Re-attach the sample IDs taken directly from df.
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
# With-ID table is written under two names; the ID-free matrix under the noSampleIDs name.
write.table(df.mat.id, "dwt20bp.location.noncor2.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "dwt20bp.location.noncor2.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "dwt20bp.location.noncor2.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
# One run directory per feature set: the full location set and the non-correlated subset.
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location.noncor
module load python/3.7-anaconda3
# The builder is run from inside each run directory; the two trailing positional
# arguments are the features table followed by the score table.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.location --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.location.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.location.score.txt
# Same builder call for the non-correlated subset; it reuses the same score table.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location.noncor
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.location.noncor --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.location.noncor2.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.location.score.txt
# Summit
# Submit the generated LSF scripts (from each run's Submits/ directory) for both runs.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location/Submits/submit_full_dwt20bp.location_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location.noncor/Submits/submit_full_dwt20bp.location.noncor_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location/Submits/submit_train_dwt20bp.location_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location.noncor/Submits/submit_train_dwt20bp.location.noncor_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location/Submits/submit_test_dwt20bp.location_0.sh
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location.noncor/Submits/submit_test_dwt20bp.location.noncor_0.sh
# Andes
# Post-process the dwt20bp.location run and list its highest-weighted edges.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.location
# Recorded output of the command above (which metric it is is not labelled in these notes):
# -0.09734688114787748
# Sort on field 3 (general-numeric, descending) and show the first ten lines.
sort -k3rg topVarEdges/cut.score_top95.txt | head
# Recorded output:
# p20yz_quadrupoleraw cut.score 0.01635757302123597
# sgRNA.gcsgRNA.raw cut.score 0.010910872718992813
# sgRNA.tempsgRNA.raw cut.score 0.010760259389624197
# p20xz_quadrupoleraw cut.score 0.009545968790975343
# p20lumo_energyraw cut.score 0.008316071478798268
# p20lumo1_energyraw cut.score 0.007800138591430982
# p20homo1_energyraw cut.score 0.007495878876141272
# p18xz_quadrupoleraw cut.score 0.007078755281814437
# p20homo_energyraw cut.score 0.0057132960054141185
# CCsgRNA.raw cut.score 0.00514295321622992
# pearson correlation
# Pearson correlation between observed cut.score and iRF predictions for one
# test split (fold9 / Set4) of the dwt20bp.location run, overall and per group.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.location_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
# Column-bind IDs, predictions and observed values; this relies on the three
# files listing the samples in the same row order.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
# Split the ID on "_" into three parts; the third part is used as the grouping label.
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.2648636
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.720397
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6564114
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.4568599
# Post-process the dwt20bp.location.noncor run and list its highest-weighted edges.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location.noncor
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.location.noncor
# Recorded output of the command above (which metric it is is not labelled in these notes):
# -0.09705019827561431
# Sort on field 3 (general-numeric, descending) and show the first ten lines.
sort -k3rg topVarEdges/cut.score_top95.txt | head
# Recorded output:
# p20xz_quadrupoleraw cut.score 0.033614417088088384
# sgRNA.gcsgRNA.raw cut.score 0.022363404734848217
# p18xz_quadrupoleraw cut.score 0.018634199824835752
# p20homo_energyraw cut.score 0.016612599210797056
# p19rot_constants_zraw cut.score 0.01625828485370655
# p19rot_constants_yraw cut.score 0.015745405205324643
# sgRNA_gene.dwtd22.x cut.score 0.014520050245398862
# sgRNA_gc.dwtd2.x cut.score 0.012545050446078423
# sgRNA_gc.dwtd1.x cut.score 0.011937886422335639
# sgRNA_structure.dwtd2.x cut.score 0.011324966749205457
# pearson correlation
# Pearson correlation between observed cut.score and iRF predictions for one
# test split (fold9 / Set4) of the dwt20bp.location.noncor run, overall and per group.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.location.noncor/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.location.noncor_Set4_test.prediction", header=T, sep="\t")
# Read the observed values from THIS run's Set4 directory. Previously `y` was
# not re-read here, so the predictions were paired with whatever `y` was left in
# the session from the dwt20bp.location run.
# NOTE(review): assumes the noncor run writes set4_Y_test_noSampleIDs.txt
# alongside set4_test_SampleIDs.txt, as the dwt20bp.location run does -- confirm.
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
# Column-bind IDs, predictions and observed values; this relies on the three
# files listing the samples in the same row order.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
# Split the ID on "_" into three parts; the third part is used as the grouping label.
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.2638886
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.7199896
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6556416
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.4566776
# Load the per-window genomic tracks and reduce each one to a single numeric
# vector (one value per window) that is fed to the wavelet transform below.
salloc -A SYB105 -N 2 -p gpu -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
library(dplyr)
library(reshape2)
library(tidyr)
library(wmtsa)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gatc <- read.table("ecoli.gatc.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
ipd <- read.table("ecoli.ipd.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
gene <- read.table("ecoli.gene.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.table("20sliding.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.table("nucleotide_counts_20sliding_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.table("ecoli.rnaseq.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Keep chr, start, end, sgRNA and cut.score.
score.df <- score[,c(1:4,8)]
# The first three window columns are the join key for every track.
window.v <- window[,1:3]
colnames(window.v) <- c("V1", "V2", "V3")
# Left-join each track onto the full window list so that every window is
# present, then replace the NAs left by unmatched windows with 0.
# NOTE(review): the ipd and rnaseq tables are read with header=T; the joins on
# V1/V2/V3 only work if those files' header names are V1, V2, V3 -- confirm.
gatc.win <- left_join(window.v, gatc, by=c("V1", "V2", "V3"))
gatc.win[is.na(gatc.win)] <- 0
ipd.win <- left_join(window.v, ipd, by=c("V1", "V2", "V3"))
ipd.win[is.na(ipd.win)] <- 0
gene.win <- left_join(window.v, gene, by=c("V1", "V2", "V3"))
gene.win[is.na(gene.win)] <- 0
rnaseq.win <- left_join(window.v, rnaseq, by=c("V1", "V2", "V3"))
rnaseq.win[is.na(rnaseq.win)] <- 0
# Per-window row counts for the GATC and gene tracks.
# NOTE(review): n() counts rows after the left join, so a window with no match
# in the track file still has one (zero-filled) row and gets a count of 1, the
# same as a window with exactly one match -- confirm whether the track files
# already contain a row for every window.
gatc.bin <- gatc.win %>% group_by(V1, V2, V3) %>% mutate(gatc.count = n())
gatc.count <- unique(gatc.bin[,c(1:3,8)])
gatc.df <- gatc.count$gatc.count
ipd.df <- ipd.win[,4]
gene.bin <- gene.win %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
gene.df <- gene.count$gene.count
# The remaining tracks are taken by column position from their tables.
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq.win[,4]
# Run DWT instead of CWT
# The same six steps are repeated for each of the seven tracks (temp, gc,
# structure, rna, gene, ipd, gatc):
#   1. wavMODWT() on the per-window vector, with default arguments;
#   2. as.matrix() of the result;
#   3. copy the matrix row names into a "label" column;
#   4. convert to a data.table;
#   5. split "label" on runs of non-alphanumeric characters into two new columns;
#   6. rename the four columns to label / <track>.dwt / scale / window.
# Step 5 uses `:=`, which adds the columns to the *.dt table by reference.
# For reference, the wavMODWT signature:
#wavMODWT(x, wavelet="s8", n.levels=ilogb(length(x), base=2),position=list(from=1,by=1,units=character()), units=character(),title.data=character(), documentation=character(), keep.series=FALSE)
temp.modwt <- wavMODWT(temp.df)
temp.modwt.df <- as.matrix(temp.modwt)
temp.modwt.label <- data.frame(label = row.names(temp.modwt.df), temp.modwt.df)
temp.modwt.dt <- as.data.table(temp.modwt.label)
# Earlier tidyr-based variant of the split, kept for reference:
#temp.modwt.name <- temp.modwt.label %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
temp.modwt.name <- temp.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(temp.modwt.name) <- c("label", "temp.dwt", "scale", "window")
# GC content
gc.modwt <- wavMODWT(gc.df)
gc.modwt.df <- as.matrix(gc.modwt)
gc.modwt.label <- data.frame(label = row.names(gc.modwt.df), gc.modwt.df)
gc.modwt.dt <- as.data.table(gc.modwt.label)
gc.modwt.name <- gc.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gc.modwt.name) <- c("label", "gc.dwt", "scale", "window")
# structure
structure.modwt <- wavMODWT(structure.df)
structure.modwt.df <- as.matrix(structure.modwt)
structure.modwt.label <- data.frame(label = row.names(structure.modwt.df), structure.modwt.df)
structure.modwt.dt <- as.data.table(structure.modwt.label)
structure.modwt.name <- structure.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(structure.modwt.name) <- c("label", "structure.dwt", "scale", "window")
# RNA-seq
rna.modwt <- wavMODWT(rna.df)
rna.modwt.df <- as.matrix(rna.modwt)
rna.modwt.label <- data.frame(label = row.names(rna.modwt.df), rna.modwt.df)
rna.modwt.dt <- as.data.table(rna.modwt.label)
rna.modwt.name <- rna.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(rna.modwt.name) <- c("label", "rna.dwt", "scale", "window")
# gene count
gene.modwt <- wavMODWT(gene.df)
gene.modwt.df <- as.matrix(gene.modwt)
gene.modwt.label <- data.frame(label = row.names(gene.modwt.df), gene.modwt.df)
gene.modwt.dt <- as.data.table(gene.modwt.label)
gene.modwt.name <- gene.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gene.modwt.name) <- c("label", "gene.dwt", "scale", "window")
# IPD
ipd.modwt <- wavMODWT(ipd.df)
ipd.modwt.df <- as.matrix(ipd.modwt)
ipd.modwt.label <- data.frame(label = row.names(ipd.modwt.df), ipd.modwt.df)
ipd.modwt.dt <- as.data.table(ipd.modwt.label)
ipd.modwt.name <- ipd.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(ipd.modwt.name) <- c("label", "ipd.dwt", "scale", "window")
# GATC count
gatc.modwt <- wavMODWT(gatc.df)
gatc.modwt.df <- as.matrix(gatc.modwt)
gatc.modwt.label <- data.frame(label = row.names(gatc.modwt.df), gatc.modwt.df)
gatc.modwt.dt <- as.data.table(gatc.modwt.label)
gatc.modwt.name <- gatc.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gatc.modwt.name) <- c("label", "gatc.dwt", "scale", "window")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
## upstream bin
# Build the upstream-region DWT table: move each guide's interval upstream,
# match it to a window by exact coordinates, attach every track's coefficients
# for that window, and reshape to one row per guide.
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
# (no-op self-assignment)
score.df$chr <- score.df$chr
# New interval is [start - 119, start]; `end` must be set first because it
# needs the original start.
score.df$end <- score.df$start
score.df$start <- score.df$start - 119
# Prepare the window table: name the columns, add a zero-based row index stored
# as a character "window" id, and subtract 1 from every window end.
# These changes to `window` persist for the rest of the session.
colnames(window) <- c("chr", "start", "end")
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
window$start <- as.numeric(window$start)
window$end <- as.numeric(window$end - 1)
# A guide only gets a window id when chr/start/end match a window exactly.
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
# Attach the tracks one after another; the first join introduces "scale", the
# rest match on both window and scale.
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name, by=c("window", "scale"))
# Drop rows without a cut.score (rows where the comparison yields NA are removed by subset()).
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, window.temp.gc.structure.rna.gene.gatc.ipd$cut.score != "NA")
# NOTE(review): the columns are picked by position (4, 5, 7:14); confirm these
# positions are sgRNA, cut.score, scale and the seven *.dwt columns in the
# joined table, given that each *.modwt.name table also carries a "label" column.
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,c(4,5,7:14)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# Feature name = track name immediately followed by the scale.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# One row per (sgRNA, cut.score), one column per feature; duplicates are averaged.
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
#
write.table(df.dcast.na, "ecoli.20sliding.up100.dwt.dcast.txt", quote=F, row.names=F, sep="\t")
## downstream bin
# Build the downstream-region DWT table: move each guide's interval downstream,
# match it to a window by exact coordinates, attach every track's coefficients
# for that window, and reshape to one row per guide.
# Relies on `window` already having the chr/start/end/window columns set up in
# the upstream-bin section, and on the *.modwt.name tables built earlier.
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
# New interval is [end, end + 119]. The shift is applied to score.df, the table
# that is joined below. It was previously applied to `score` after score.df had
# already been copied from it, so the join used the unshifted coordinates.
score.df$start <- score.df$end
score.df$end <- score.df$start + 119
# A guide only gets a window id when chr/start/end match a window exactly.
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
# Attach the tracks one after another; the first join introduces "scale", the
# rest match on both window and scale.
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name, by=c("window", "scale"))
# Drop rows without a cut.score. The column is named "cut.score" (see the
# colnames() call above); the previous reference to "$score" matched no column.
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, window.temp.gc.structure.rna.gene.gatc.ipd$cut.score != "NA")
# NOTE(review): the columns are picked by position (4, 5, 7:14); confirm these
# positions are sgRNA, cut.score, scale and the seven *.dwt columns in the
# joined table, given that each *.modwt.name table also carries a "label" column.
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,c(4,5,7:14)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# Feature name = track name immediately followed by the scale.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# One row per (sgRNA, cut.score), one column per feature; duplicates are averaged.
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
#
write.table(df.dcast.na, "ecoli.20sliding.down100.dwt.dcast.txt", quote=F, row.names=F, sep="\t")
# --> next: combine the regional DWT tables to generate the full feature matrix
# combine regional DWT with other features
# Each regional DWT table is read, its sgRNA id is split on "_" into sgRNA + ID,
# and its feature columns (column 4 onward) are prefixed with "sgRNA_". The
# exact / up / down tables are then inner-joined onto the raw one-hot/tensor
# feature table.
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# 1) exact
df.dcast.na <- read.delim("ecoli.20sliding.exact.dwt.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep <- df.dcast.na %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt <- df.dcast.sep[,c(4:ncol(df.dcast.sep))]
colnames(df.dcast.dwt) <- paste0('sgRNA_', colnames(df.dcast.dwt))
df.dcast <- cbind(df.dcast.sep[,1:3], df.dcast.dwt)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# 2) up
df.dcast.na2 <- read.delim("ecoli.20sliding.up.dwt.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep2 <- df.dcast.na2 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt2 <- df.dcast.sep2[,c(4:ncol(df.dcast.sep2))]
colnames(df.dcast.dwt2) <- paste0('sgRNA_', colnames(df.dcast.dwt2))
df.dcast2 <- cbind(df.dcast.sep2[,1:3], df.dcast.dwt2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# 3) down
df.dcast.na3 <- read.delim("ecoli.20sliding.down.dwt.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep3 <- df.dcast.na3 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt3 <- df.dcast.sep3[,c(4:ncol(df.dcast.sep3))]
colnames(df.dcast.dwt3) <- paste0('sgRNA_', colnames(df.dcast.dwt3))
df.dcast3 <- cbind(df.dcast.sep3[,1:3], df.dcast.dwt3)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# 4) up100
df.dcast.na4 <- read.delim("ecoli.20sliding.up100.dwt.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep4 <- df.dcast.na4 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt4 <- df.dcast.sep4[,c(4:ncol(df.dcast.sep4))]
colnames(df.dcast.dwt4) <- paste0('sgRNA_', colnames(df.dcast.dwt4))
df.dcast4 <- cbind(df.dcast.sep4[,1:3], df.dcast.dwt4)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# 5) down100
df.dcast.na5 <- read.delim("ecoli.20sliding.down100.dwt.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep5 <- df.dcast.na5 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt5 <- df.dcast.sep5[,c(4:ncol(df.dcast.sep5))]
colnames(df.dcast.dwt5) <- paste0('sgRNA_', colnames(df.dcast.dwt5))
df.dcast5 <- cbind(df.dcast.sep5[,1:3], df.dcast.dwt5)
# Raw feature table; its sgRNAID is split on "_" into sgRNA / ID / type.
df <- read.delim("Ecoli.allCas9.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep)
# 126182
# Inner-join the exact, up and down tables on (sgRNA, ID); column 3 of each
# regional table (presumably its own copy of cut.score) is left out of the join.
# All three tables use the same "sgRNA_" prefix, so dplyr's default .x/.y
# suffixes are what tell the colliding columns apart (see the note below).
# NOTE(review): df.dcast4 (up100) and df.dcast5 (down100) are built above but
# are not joined in this section -- confirm whether that is intended.
df.sep.region <- inner_join(df.sep, df.dcast[,c(1,2,4:ncol(df.dcast.sep))], by=c("sgRNA", "ID"))
df.sep.region2 <- inner_join(df.sep.region, df.dcast2[,c(1,2,4:ncol(df.dcast.sep2))], by=c("sgRNA", "ID"))
df.sep.region3 <- inner_join(df.sep.region2, df.dcast3[,c(1,2,4:ncol(df.dcast.sep3))], by=c("sgRNA", "ID"))
## note that dwt.x = sgRNA dwt, dwt.y = upstream dwt, dwt = downstream dwt
# Re-assemble the original three-part sample ID.
df.sep.region.id <- df.sep.region3 %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep.region.id)
# 118140
write.table(df.sep.region.id, "ecoli.20sliding.all.features.dcast.txt", quote=F, row.names=F, sep="\t")
# Summit
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
# Split the combined feature matrix into the feature/score tables (with and
# without the sample-ID column) for the dwt20bp iRF run.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.all.features.dcast.txt", header=T, sep="\t")
# Drop every row that has an NA in any column.
df <- na.omit(df)
## sample ID
# Features = column 1 plus columns 3..end; score = columns 1-2.
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
# The same two tables are written twice: once with the "_overlap" suffix and once without.
write.table(df.features, "dwt20bp.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt20bp.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.features, "dwt20bp.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt20bp.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
# Same data as matrices with column 1 removed; the single score column is renamed "cut.score".
df.features <- as.matrix(df[,c(3:ncol(df))])
df.score <- as.matrix(data.frame(df[,2]))
colnames(df.score) <- "cut.score"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
write.table(df.features, "dwt20bp.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt20bp.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# Extract a fixed range of feature columns into "dwtonly" tables.
# In the files that carry the sample ID, field 1 is kept and the range is
# 1589-2071; the noSampleIDs file has no ID column, so the same range sits one
# field lower (1588-2070).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp
cut -f 1,1589-2071 dwt20bp.features.txt > dwt20bp.dwtonly.features.txt
cut -f 1,1589-2071 dwt20bp.features_overlap.txt > dwt20bp.dwtonly.features_overlap.txt
cut -f 1588-2070 dwt20bp.features_overlap_noSampleIDs.txt > dwt20bp.dwtonly.features_overlap_noSampleIDs.txt
# PAM reference: https://www.synthego.com/guide/how-to-use-crispr/pam-sequence
# Steps: (1) find all of the NGG sequences in the genome; (2) run bedtools closest for each guide (looking specifically downstream, considering strand orientation); (3) identify the distance and the "N" base; (4) incorporate both into the model as a raw distance value and a one-hot encoded PAM sequence.
# Locate every AGG/TGG/CGG/GGG match in the reference genome, intersect the
# matches with the sliding windows, and find the closest match for each guide.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# generate fasta file of NGG sequences and blast to reference
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# --- Abandoned attempts (kept commented out for the record) ---
# vim NGG.PAM.fasta
# ## blast
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/ncbi-blast-2.11.0+/bin/makeblastdb -in genome/GCF_000005845.2_ASM584v2_genomic.fna -dbtype nucl
#
# #/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/ncbi-blast-2.11.0+/bin/blastn -query NGG.PAM.fasta -db genome/GCF_000005845.2_ASM584v2_genomic.fna -out NGG.PAM.blast.tab -outfmt 6 -evalue 0.0001 -task blastn -num_threads 10
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/ncbi-blast-2.11.0+/bin/blastn -query NGG.PAM.fasta -db genome/GCF_000005845.2_ASM584v2_genomic.fna -out NGG.PAM.blast.tab -num_threads 10 -task blastn-short -evalue 0.0001
# #### too small a sequence to find using blast...
#
# ## bowtie
# source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
#
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/
# conda create --name bowtie python=3.8
# conda activate bowtie
# conda install -c bioconda bowtie
#
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# bowtie-build -f genome/GCF_000005845.2_ASM584v2_genomic.fna genome/GCF_000005845.2_ASM584v2_genomic
# bowtie genome/GCF_000005845.2_ASM584v2_genomic -v 0 -c AGG > AGG.PAM.bowtie.txt
# bowtie genome/GCF_000005845.2_ASM584v2_genomic -n 0 -c TGG > TGG.PAM.bowtie.txt
# bowtie genome/GCF_000005845.2_ASM584v2_genomic -n 0 -c CGG > CGG.PAM.bowtie.txt
# bowtie genome/GCF_000005845.2_ASM584v2_genomic -n 0 -c GGG > GGG.PAM.bowtie.txt
# ##### sequences too short (need to be at least 4bp) AND align to too many places to properly be used for bowtie
# --- Approach actually used ---
## fastaRegexFinder
# One regex search per PAM variant; each writes its matches to its own file.
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/fastaRegexFinder.py -q -f genome/GCF_000005845.2_ASM584v2_genomic.fna -r 'AGG' > AGG.PAM.txt
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/fastaRegexFinder.py -q -f genome/GCF_000005845.2_ASM584v2_genomic.fna -r 'TGG' > TGG.PAM.txt
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/fastaRegexFinder.py -q -f genome/GCF_000005845.2_ASM584v2_genomic.fna -r 'CGG' > CGG.PAM.txt
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/fastaRegexFinder.py -q -f genome/GCF_000005845.2_ASM584v2_genomic.fna -r 'GGG' > GGG.PAM.txt
# Concatenate the four result files and sort by field 1, then numerically by field 2.
cat AGG.PAM.txt TGG.PAM.txt CGG.PAM.txt GGG.PAM.txt > NGG.PAM.txt
sort -k 1,1 -k 2,2n NGG.PAM.txt > NGG.PAM.sorted.bed
# intersect with sliding windows in the genome to get density for DWT
# (-wo reports only overlapping window/PAM pairs, plus the overlap length)
bedtools intersect -wo -a ecoli.20bp.sliding.bed -b NGG.PAM.sorted.bed > NGG.PAM.20bp.sliding.windows.bed
# closest with gRNAs to identify distance (downstream, strand)
# NOTE(review): this appends a "+" column and writes sgRNA.coord.strand.txt,
# but the closest call below reads sgRNA.coord.bed, not the strand file -- confirm.
awk '{print $0"\t""+"}' sgRNA.coord.bed > sgRNA.coord.strand.txt
# -io: ignore B features that overlap A; -iu: ignore B features upstream of A;
# -D a: report the distance signed relative to A's orientation.
bedtools closest -a sgRNA.coord.bed -b NGG.PAM.sorted.bed -io -iu -D a > ecoli.sgRNA.closestPAM.bed
# determine if N = A,C,T, or G
## feature: PAM.A.raw, PAM.C.raw, PAM.T.raw, PAM.G.raw <-- binary
# Turn the closest-PAM table into per-guide features: the distance plus four
# binary indicator columns for the matched PAM sequence.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
library(dplyr)
library(tidyr)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
sgRNA.pam <- read.table("ecoli.sgRNA.closestPAM.bed", header=F, sep="\t", stringsAsFactors = F)
# Columns 4, 11 and 12 are taken as guide name, matched PAM sequence and distance.
sgRNA.pam.sub <- sgRNA.pam[,c(4,11,12)]
colnames(sgRNA.pam.sub) <- c("sgRNA", "pam.code", "pam.distance")
# Each indicator is 1 for an NGG sequence or for its reverse complement
# (AGG/CCT, CGG/CCG, TGG/CCA, GGG/CCC), otherwise 0.
sgRNA.pam.onehot <- sgRNA.pam.sub %>% mutate(PAM.A = ifelse(pam.code == "AGG" | pam.code == "CCT", 1, 0), PAM.C = ifelse(pam.code == "CGG" | pam.code == "CCG", 1, 0), PAM.T = ifelse(pam.code == "TGG" | pam.code == "CCA", 1, 0), PAM.G = ifelse(pam.code == "GGG" | pam.code == "CCC", 1, 0))
# Drop the pam.code text column; keep sgRNA, pam.distance and the four indicators.
sgRNA.pam.df <- sgRNA.pam.onehot[,c(1,3:7)]
score.location <- left_join(score.df, sgRNA.pam.df, by=c("sgRNA"))
# A constant "scale" of 0 lets this table go through the same melt/unite/dcast
# steps as the DWT tables; the resulting feature names end in "0".
score.location$scale <- 0
df.melt <- melt(score.location[,4:11], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# One row per (sgRNA, cut.score); duplicates are averaged.
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40468
write.table(df.dcast.na, "ecoli.sgRNA.pam.dcast.txt", quote=F, row.names=F, sep="\t")
# Append the PAM features to the location-augmented feature matrix.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.sgRNA.pam.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# Split the guide name on "_" into sgRNA + ID, then drop column 3 (cut.score),
# leaving the two key columns plus columns 4-8.
df.dcast.sep <- df.dcast %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.sep <- df.dcast.sep[,c(1,2,4:8)]
df <- read.delim("ecoli.20sliding.location.all.features.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
# Inner join on (sgRNA, ID), then rebuild the three-part sample ID.
df.location <- inner_join(df.sep, df.dcast.sep, by=c("sgRNA", "ID"))
df.location.id <- df.location %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.location.id)
# 118140
write.table(df.location.id, "ecoli.20sliding.pam.all.features.dcast.txt", quote=F, row.names=F, sep="\t")
### figure for relationship between cut.score and distance to closest PAM
# Five PDFs are written: scatter, density, violin by distance category, and the
# scatter/density again split by PAM sequence.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
sgRNA.pam <- read.table("ecoli.sgRNA.closestPAM.bed", header=F, sep="\t", stringsAsFactors = F)
# Columns 4 and 12 are taken as guide name and distance.
sgRNA.pam.dist <- sgRNA.pam[,c(4,12)]
colnames(sgRNA.pam.dist) <- c("sgRNA", "pam.distance")
score.dist <- left_join(score.df, sgRNA.pam.dist, by=c("sgRNA"))
# Scatter: cut.score against distance.
pdf("ecoli.sgRNA.PAMdist.score.pdf")
library(ggplot2)
ggplot(score.dist, aes(x=pam.distance, y=cut.score)) + geom_point() + theme_classic()
dev.off()
# Density of the distance.
pdf("ecoli.sgRNA.PAMdist.density.pdf")
ggplot(score.dist, aes(x=pam.distance)) + geom_density() + theme_classic()
dev.off()
# Violin of cut.score per distance category: exactly 1, up to 5, up to 10, everything else.
pdf("ecoli.sgRNA.PAMdist.score.violin.pdf")
score.dist.category <- score.dist %>% mutate(dist.category = ifelse(pam.distance == 1, "PAM.1bp", ifelse(pam.distance <= 5, "PAM.5bp", ifelse(pam.distance <= 10, "PAM.10bp", "PAM.gr10bp"))))
ggplot(score.dist.category, aes(x=dist.category, y=cut.score)) + geom_violin() + theme_classic()
dev.off()
## incorporate nucleotide sequence
# Uses sgRNA.pam.onehot from the PAM one-hot section (columns 1-2: sgRNA, pam.code);
# that object must still be present in the session.
score.dist.nuc <- left_join(score.dist, sgRNA.pam.onehot[,1:2], by=c("sgRNA"))
pdf("ecoli.sgRNA.PAMdist.score.nuc.pdf")
ggplot(score.dist.nuc, aes(x=pam.distance, y=cut.score, color=pam.code)) + geom_point() + theme_classic() + facet_grid(. ~ pam.code)
dev.off()
pdf("ecoli.sgRNA.PAMdist.density.nuc.pdf")
ggplot(score.dist.nuc, aes(x=pam.distance, color=pam.code)) + geom_density() + theme_classic() + facet_grid(pam.code ~ .)
dev.off()
# shell (Andes): load conda and activate the analysis environment, then start R
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# salloc -A SYB105 -N 1 -p gpu -t 1:00:00

## PAM density per 20bp sliding window -> MODWT coefficients -> joined to the
## window that exactly matches each sgRNA.
library(dplyr)
library(reshape2)
library(tidyr)
library(wmtsa)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
pam <- read.table("NGG.PAM.20bp.sliding.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
window <- read.table("ecoli.20bp.sliding.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
score <- read.table("sgRNA.coord.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[, c(1:4, 8)]
window.v <- window[, 1:3]
colnames(window.v) <- c("V1", "V2", "V3")

## count PAMs per window
# Every row of the PAM table is flagged with pam.hit = 1 before the join.
# Windows without any PAM row come out of the left join as a single all-NA row;
# the NA -> 0 replacement turns their flag into 0, so summing the flag gives 0
# for them. Counting rows with n() instead reports 1 for those windows, which
# makes them indistinguishable from windows holding exactly one PAM.
pam.flagged <- cbind(pam, pam.hit = 1L)
pam.win <- left_join(window.v, pam.flagged, by = c("V1", "V2", "V3"))
pam.win[is.na(pam.win)] <- 0
pam.bin <- pam.win %>%
  group_by(V1, V2, V3) %>%
  mutate(pam.count = sum(pam.hit))
# selected by name so the result does not depend on how many columns pam has
pam.count <- unique(pam.bin[, c("V1", "V2", "V3", "pam.count")])
pam.df <- pam.count$pam.count
write.table(pam.count, "NGG.PAM.20bp.sliding.windows.count.bed", quote = FALSE, row.names = FALSE, sep = "\t")

## MODWT of the per-window PAM counts
pam.modwt <- wavMODWT(pam.df)
pam.modwt.df <- as.matrix(pam.modwt)
pam.modwt.label <- data.frame(label = row.names(pam.modwt.df), pam.modwt.df)
pam.modwt.dt <- as.data.table(pam.modwt.label)
# split each coefficient label on non-alphanumeric characters into two pieces,
# which are renamed below to the scale and the window index
pam.modwt.name <- pam.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(pam.modwt.name) <- c("label", "pam.dwt", "scale", "window")

## diagnostic plot of the decomposition (wavelets package, la8 filter)
library(wavelets)
pam.mat <- as.matrix(pam.df)
pam.dwt <- modwt(pam.mat, filter = "la8")
pdf("ecoli.20bpsliding.pam.pdf")
plot.modwt(pam.dwt)
dev.off()

## attach the coefficients to the window that exactly matches each sgRNA
colnames(window) <- c("chr", "start", "end")
# zero-based window index stored as character, the type the join key below needs
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window - 1)
window$start <- as.numeric(window$start)
window$end <- as.numeric(window$end - 1)
window.score.df <- left_join(score.df, window, by = c("chr", "start", "end"))
window.score.pam <- left_join(window.score.df, pam.modwt.name[, c(3, 4, 2)], by = "window")
nrow(window.score.pam)
# 1293751
# keep only rows that have a cut score
window.score.pam.na <- subset(window.score.pam, !is.na(cut.score))
nrow(window.score.pam.na)
# 931340
# columns 4, 5, 7, 8 = sgRNA, cut.score, scale, pam.dwt
df.melt <- melt(window.score.pam.na[, c(4, 5, 7, 8)], id = c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# one wide column per feature/scale combination
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate = mean, na.rm = TRUE)
df.dcast.na <- na.omit(df.dcast)
nrow(df.dcast.na)
# 40467
write.table(df.dcast.na, "ecoli.20sliding.exact.pam.dcast.txt", quote = FALSE, row.names = FALSE, sep = "\t")
--> take the 20bp bin immediately upstream and the 20bp bin immediately downstream of the sgRNA
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
## upstream bin
## Uses `window` and `pam.modwt.name` as they exist in the session at this point.
score <- read.table("sgRNA.coord.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[, c("chr", "start", "end", "sgRNA", "cut.score")]
# move each interval to the bin ending at the sgRNA start: [start - 19, start]
# (both new values are computed from the original start)
score.df <- transform(score.df, end = start, start = start - 19)
window.score.df <- left_join(score.df, window, by = c("chr", "start", "end"))
window.score.pam <- left_join(window.score.df, pam.modwt.name, by = "window")
# keep only rows that have a cut score
window.score.pam.na <- window.score.pam[!is.na(window.score.pam$cut.score), ]
# columns 4, 5, 8, 9 = sgRNA, cut.score, pam.dwt, scale
df.melt <- melt(window.score.pam.na[, c(4, 5, 8, 9)], id = c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# one wide column per feature/scale combination
df.id <- unite(df, feature.scale, c(variable, scale), sep = "")
df.dcast <- dcast(df.id, sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate = mean, na.rm = TRUE)
df.dcast.na <- na.omit(df.dcast)
#
write.table(df.dcast.na, "ecoli.20sliding.up.pam.dcast.txt", quote = FALSE, row.names = FALSE, sep = "\t")
## downstream bin
## Uses `window` and `pam.modwt.name` as they exist in the session at this point.
score <- read.table("sgRNA.coord.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[, c(1:4, 8)]
# Shift the interval to the bin starting at the sgRNA end: [end, end + 19].
# The shift has to be applied to score.df, the table that is joined below.
# Shifting `score` after score.df has been copied from it leaves score.df at
# the sgRNA's own coordinates, so the "downstream" features would simply repeat
# the exact-bin features.
score.df$start <- score.df$end
score.df$end <- score.df$start + 19
window.score.df <- left_join(score.df, window, by = c("chr", "start", "end"))
window.score.pam <- left_join(window.score.df, pam.modwt.name, by = "window")
# keep only rows that have a cut score
window.score.pam.na <- subset(window.score.pam, !is.na(cut.score))
# columns 4, 5, 8, 9 = sgRNA, cut.score, pam.dwt, scale
df.melt <- melt(window.score.pam.na[, c(4, 5, 8, 9)], id = c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# one wide column per feature/scale combination
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate = mean, na.rm = TRUE)
df.dcast.na <- na.omit(df.dcast)
#
write.table(df.dcast.na, "ecoli.20sliding.down.pam.dcast.txt", quote = FALSE, row.names = FALSE, sep = "\t")
--> combine to generate the full feature matrix
# combine regional DWT with other features
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")

## PAM DWT for the bin matching the sgRNA
# after separate() the columns are sgRNA, ID, cut.score, then the DWT features,
# which get a PAM_ prefix
df.dcast.na <- read.delim("ecoli.20sliding.exact.pam.dcast.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.dcast.sep <- separate(df.dcast.na, sgRNA, c("sgRNA", "ID"), sep = "_")
df.dcast.dwt <- df.dcast.sep[, 4:ncol(df.dcast.sep)]
colnames(df.dcast.dwt) <- paste0("PAM_", colnames(df.dcast.dwt))
df.dcast <- cbind(df.dcast.sep[, 1:3], df.dcast.dwt)

## PAM DWT for the upstream bin
df.dcast.na2 <- read.delim("ecoli.20sliding.up.pam.dcast.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.dcast.sep2 <- separate(df.dcast.na2, sgRNA, c("sgRNA", "ID"), sep = "_")
df.dcast.dwt2 <- df.dcast.sep2[, 4:ncol(df.dcast.sep2)]
colnames(df.dcast.dwt2) <- paste0("PAM_", colnames(df.dcast.dwt2))
df.dcast2 <- cbind(df.dcast.sep2[, 1:3], df.dcast.dwt2)

## PAM DWT for the downstream bin
df.dcast.na3 <- read.delim("ecoli.20sliding.down.pam.dcast.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.dcast.sep3 <- separate(df.dcast.na3, sgRNA, c("sgRNA", "ID"), sep = "_")
df.dcast.dwt3 <- df.dcast.sep3[, 4:ncol(df.dcast.sep3)]
colnames(df.dcast.dwt3) <- paste0("PAM_", colnames(df.dcast.dwt3))
df.dcast3 <- cbind(df.dcast.sep3[, 1:3], df.dcast.dwt3)

## existing feature matrix, keyed by sgRNA_ID_type
df <- read.delim("ecoli.20sliding.pam.all.features.dcast.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.sep <- separate(df, sgRNAID, c("sgRNA", "ID", "type"), sep = "_")
nrow(df.sep)
# 118140

# join the three PAM DWT tables on sgRNA + ID; column 3 (cut.score) of each
# table is left out of the join
df.sep.region <- inner_join(df.sep, df.dcast[, -3], by = c("sgRNA", "ID"))
df.sep.region2 <- inner_join(df.sep.region, df.dcast2[, -3], by = c("sgRNA", "ID"))
df.sep.region3 <- inner_join(df.sep.region2, df.dcast3[, -3], by = c("sgRNA", "ID"))
## note that dwt.x = sgRNA dwt, dwt.y = upstream dwt, dwt = downstream dwt
df.sep.region.id <- unite(df.sep.region3, sgRNAID, c("sgRNA", "ID", "type"), sep = "_")
nrow(df.sep.region.id)
# 118140
write.table(df.sep.region.id, "ecoli.20sliding.all.pam.features.dcast.txt", quote = FALSE, row.names = FALSE, sep = "\t")
#ecoli.sgRNA.pam.dcast.txt
# combine raw and dwt features for just PAM sequence
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")

## PAM DWT for the bin matching the sgRNA
# after separate() the columns are sgRNA, ID, cut.score, then the DWT features,
# which get a PAM_ prefix
df.dcast.na <- read.delim("ecoli.20sliding.exact.pam.dcast.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.dcast.sep <- separate(df.dcast.na, sgRNA, c("sgRNA", "ID"), sep = "_")
df.dcast.dwt <- df.dcast.sep[, 4:ncol(df.dcast.sep)]
colnames(df.dcast.dwt) <- paste0("PAM_", colnames(df.dcast.dwt))
df.dcast <- cbind(df.dcast.sep[, 1:3], df.dcast.dwt)

## PAM DWT for the upstream bin
df.dcast.na2 <- read.delim("ecoli.20sliding.up.pam.dcast.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.dcast.sep2 <- separate(df.dcast.na2, sgRNA, c("sgRNA", "ID"), sep = "_")
df.dcast.dwt2 <- df.dcast.sep2[, 4:ncol(df.dcast.sep2)]
colnames(df.dcast.dwt2) <- paste0("PAM_", colnames(df.dcast.dwt2))
df.dcast2 <- cbind(df.dcast.sep2[, 1:3], df.dcast.dwt2)

## PAM DWT for the downstream bin
df.dcast.na3 <- read.delim("ecoli.20sliding.down.pam.dcast.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.dcast.sep3 <- separate(df.dcast.na3, sgRNA, c("sgRNA", "ID"), sep = "_")
df.dcast.dwt3 <- df.dcast.sep3[, 4:ncol(df.dcast.sep3)]
colnames(df.dcast.dwt3) <- paste0("PAM_", colnames(df.dcast.dwt3))
df.dcast3 <- cbind(df.dcast.sep3[, 1:3], df.dcast.dwt3)

## raw PAM feature table, keyed by sgRNA_ID
df <- read.delim("ecoli.sgRNA.pam.dcast.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.sep <- separate(df, sgRNA, c("sgRNA", "ID"), sep = "_")
nrow(df.sep)
# 40468

# join the three PAM DWT tables on sgRNA + ID; column 3 (cut.score) of each
# table is left out of the join
df.sep.region <- inner_join(df.sep, df.dcast[, -3], by = c("sgRNA", "ID"))
df.sep.region2 <- inner_join(df.sep.region, df.dcast2[, -3], by = c("sgRNA", "ID"))
df.sep.region3 <- inner_join(df.sep.region2, df.dcast3[, -3], by = c("sgRNA", "ID"))
## note that dwt.x = sgRNA dwt, dwt.y = upstream dwt, dwt = downstream dwt
df.sep.region.id <- unite(df.sep.region3, sgRNA, c("sgRNA", "ID"), sep = "_")
nrow(df.sep.region.id)
# 40467
write.table(df.sep.region.id, "ecoli.20sliding.PAMonly.features.dcast.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
## Write the iRF input files: a feature matrix and a score vector, each with and
## without the sample-ID column. In both input tables column 1 is the sample ID
## and column 2 the cut score; everything after that is a feature.

## All features
# awk 'NR==1 || /_Cas9/' ecoli.20sliding.all.pam.features.dcast.txt > ecoli.cas9.20sliding.all.pam.features.dcast.txt
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- na.omit(read.delim("ecoli.cas9.20sliding.all.pam.features.dcast.txt", header = TRUE, sep = "\t"))
## sample ID
df.features <- df[, -2]
df.score <- df[, c(1, 2)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
# the same tables are written under both the *_overlap and the plain names
write.table(df.features, "dwt20bp.pam.features_overlap.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.score, "dwt20bp.pam.score_overlap.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.features, "dwt20bp.pam.features.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.score, "dwt20bp.pam.score.txt", quote = FALSE, row.names = FALSE, sep = "\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[, -(1:2)])
df.score <- as.matrix(data.frame(df[, 2]))
colnames(df.score) <- "cut.score"
write.table(df.features, "dwt20bp.pam.features_overlap_noSampleIDs.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.score, "dwt20bp.pam.score_overlap_noSampleIDs.txt", quote = FALSE, row.names = FALSE, sep = "\t")

## PAM only features
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- na.omit(read.delim("ecoli.20sliding.PAMonly.features.dcast.txt", header = TRUE, sep = "\t"))
## sample ID
df.features <- df[, -2]
df.score <- df[, c(1, 2)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
write.table(df.features, "dwt20bp.PAMonly.features_overlap.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.score, "dwt20bp.PAMonly.score_overlap.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.features, "dwt20bp.PAMonly.features.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.score, "dwt20bp.PAMonly.score.txt", quote = FALSE, row.names = FALSE, sep = "\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[, -(1:2)])
df.score <- as.matrix(data.frame(df[, 2]))
colnames(df.score) <- "cut.score"
write.table(df.features, "dwt20bp.PAMonly.features_overlap_noSampleIDs.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.score, "dwt20bp.PAMonly.score_overlap_noSampleIDs.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# python
# Correlation filter: for every pair of features with Pearson correlation
# >= 0.9, the later column of the pair is dropped. Run once on the full Cas9
# feature table and once on the PAM-only table.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)

# ---- all features ----
data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.cas9.20sliding.all.pam.features.dcast.txt')
# Drop the two leading columns (sample ID and cut.score) and keep every feature
# column. A slice of 2:-1 would also discard the last feature, which then could
# never appear among the selected columns.
data = data.iloc[:, 2:]
label_encoder = LabelEncoder()
# NOTE(review): this label-encodes the first remaining feature column. If that
# column is numeric, its values are replaced by integer codes before the
# correlation is computed - confirm this is intended.
data.iloc[:, 0] = label_encoder.fit_transform(data.iloc[:, 0]).astype('float64')
corr = data.corr()
# draw on a fresh figure and close it afterwards so the next heatmap does not
# land on top of this one
plt.figure()
sns.heatmap(corr)
plt.savefig('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.cas9.20sliding.all.pam.features.correlation.plot.pdf')
plt.close()
# keep-mask over the columns of corr; column j is dropped as soon as any earlier
# column i has corr >= 0.9 with it (negative correlations are not filtered)
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i + 1, corr.shape[0]):
        if corr.iloc[i, j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = data.columns[columns]
data = data[selected_columns]
data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.cas9.20sliding.all.pam.features.dcast.pythoncorrelation.csv")

# ---- PAM-only features ----
data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.PAMonly.features.dcast.txt')
# same column handling as above: skip sample ID and cut.score, keep all features
data = data.iloc[:, 2:]
label_encoder = LabelEncoder()
# NOTE(review): same label-encoding of the first feature column as above - confirm.
data.iloc[:, 0] = label_encoder.fit_transform(data.iloc[:, 0]).astype('float64')
corr = data.corr()
corr.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.PAMonly.features.pythoncorrelation.table.csv")
plt.figure()
sns.heatmap(corr)
plt.savefig('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.PAMonly.features.correlation.plot.pdf')
plt.close()
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i + 1, corr.shape[0]):
        if corr.iloc[i, j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = data.columns[columns]
data = data[selected_columns]
data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.PAMonly.features.dcast.pythoncorrelation.csv")
# R
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.cas9.20sliding.all.pam.features.dcast.pythoncorrelation.csv > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.cas9.20sliding.all.pam.features.dcast.pythoncorrelation.header.txt
## Subset the iRF feature table to the columns that survived the python
## correlation filter (their names are the header line of the filtered CSV).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
df.noncor <- read.delim("ecoli.cas9.20sliding.all.pam.features.dcast.pythoncorrelation.header.txt", header = FALSE, sep = ",", stringsAsFactors = FALSE)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
df <- read.delim("dwt20bp.pam.features.txt", header = TRUE, sep = "\t")
# plain character vector of the selected names
noncor.names <- as.character(unlist(df.noncor[1, ]))
df.subset <- df[, which(names(df) %in% noncor.names)]
# The sample-ID column is not part of the filtered CSV header (the python step
# removed it before filtering), so df.subset holds features only. All of its
# columns are kept; skipping its first column would drop a real feature.
df.mat <- as.matrix(df.subset)
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
write.table(df.mat.id, "dwt20bp.pam.noncor2.features.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.mat.id, "dwt20bp.pam.noncor2.features_overlap.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.mat, "dwt20bp.pam.noncor2.features_overlap_noSampleIDs.txt", quote = FALSE, row.names = FALSE, sep = "\t")
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.PAMonly.features.dcast.pythoncorrelation.csv > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.PAMonly.features.dcast.pythoncorrelation.header.txt
# create matrix heatmap
library(reshape2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
# correlation table written by the python step; the first CSV column holds the row names
df.correlation <- read.table("ecoli.20sliding.PAMonly.features.pythoncorrelation.table.csv", header = TRUE, row.names = 1, sep = ",")
cormat <- as.matrix(df.correlation)
# long format (one row per matrix cell), previewed on the console
melted_cormat <- melt(cormat)
head(melted_cormat)
# Get lower triangle of the correlation matrix
# Returns a copy of `cormat` in which every cell strictly above the diagonal is
# NA; the diagonal and the cells below it are left as they are.
get_lower_tri <- function(cormat) {
  replace(cormat, upper.tri(cormat), NA)
}
# Get upper triangle of the correlation matrix
# Returns a copy of `cormat` in which every cell strictly below the diagonal is
# NA; the diagonal and the cells above it are left as they are.
get_upper_tri <- function(cormat) {
  replace(cormat, lower.tri(cormat), NA)
}
library(reshape2)
library(ggplot2)
library(pheatmap)

# upper triangle of the correlation matrix, echoed to the console
upper_tri <- get_upper_tri(cormat)
upper_tri
# Melt the correlation matrix
# (cells blanked by get_upper_tri are dropped by na.rm)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Heatmap
pdf("pam.correlation.heatmap.pdf")
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limit = c(-1, 1), space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 1, size = 6, hjust = 1),
    axis.text.y = element_text(size = 6)
  ) +
  coord_fixed()
dev.off()
#### clustered heatmap**
# drawn from the full correlation matrix, not just the upper triangle
pdf("pam.correlation.cluster2.heatmap.pdf")
pheatmap(cormat, fontsize = 6)
dev.off()
## Subset the PAM-only iRF feature table to the columns that survived the python
## correlation filter (their names are the header line of the filtered CSV).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
df.noncor <- read.delim("ecoli.20sliding.PAMonly.features.dcast.pythoncorrelation.header.txt", header = FALSE, sep = ",", stringsAsFactors = FALSE)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
df <- read.delim("dwt20bp.PAMonly.features.txt", header = TRUE, sep = "\t")
# plain character vector of the selected names
noncor.names <- as.character(unlist(df.noncor[1, ]))
df.subset <- df[, which(names(df) %in% noncor.names)]
# The sample-ID column is not part of the filtered CSV header (the python step
# removed it before filtering), so df.subset holds features only. All of its
# columns are kept; skipping its first column would drop a real feature.
df.mat <- as.matrix(df.subset)
df.mat.id <- cbind(as.data.frame(df$sgRNA), df.mat)
write.table(df.mat.id, "dwt20bp.PAMonly.noncor2.features.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.mat.id, "dwt20bp.PAMonly.noncor2.features_overlap.txt", quote = FALSE, row.names = FALSE, sep = "\t")
write.table(df.mat, "dwt20bp.PAMonly.noncor2.features_overlap_noSampleIDs.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
irf_features=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features
irf_inputs="${irf_features}/dwt20bp"
irf_builder=/gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py

# one run directory per feature set; -p keeps a re-run from failing on
# directories that already exist
for run_dir in cas9.pam.noncor cas9.pam cas9.PAMonly cas9.PAMonly.noncor; do
  mkdir -p "${irf_features}/${run_dir}"
done
module load python/3.7-anaconda3

#######################################
# Run the iRF builder inside one run directory.
# Globals:   irf_features, irf_inputs, irf_builder (read)
# Arguments: $1 - run directory name under irf_features
#            $2 - value passed as --RunName
#            $3 - feature file name under irf_inputs
#            $4 - score file name under irf_inputs
# Returns:   1 without calling the builder when the run directory cannot be
#            entered (so nothing is generated in the wrong directory);
#            otherwise the builder's exit status
#######################################
setup_irf_run() {
  cd "${irf_features}/$1" || return 1
  python "${irf_builder}" --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName "$2" --bypass --Prediction "${irf_inputs}/$3" "${irf_inputs}/$4"
}

setup_irf_run cas9.pam dwt20bp.pam dwt20bp.pam.features.txt dwt20bp.pam.score.txt
setup_irf_run cas9.pam.noncor dwt20bp.pam.noncor dwt20bp.pam.noncor2.features.txt dwt20bp.pam.score.txt
setup_irf_run cas9.PAMonly cas9.PAMonly dwt20bp.PAMonly.features.txt dwt20bp.PAMonly.score.txt
setup_irf_run cas9.PAMonly.noncor cas9.PAMonly.noncor dwt20bp.PAMonly.noncor2.features.txt dwt20bp.PAMonly.score.txt
# Summit
irf_features=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features
cd "${irf_features}/dwt20bp"
module load python/3.7.0-anaconda3-5.3.0

#######################################
# Submit one phase of the four iRF runs with bsub.
# Globals:   irf_features (read)
# Arguments: $1 - phase name used in the submit script names (full, train, test)
#######################################
submit_irf_phase() {
  bsub "${irf_features}/cas9.pam/Submits/submit_${1}_dwt20bp.pam_0.sh"
  bsub "${irf_features}/cas9.pam.noncor/Submits/submit_${1}_dwt20bp.pam.noncor_0.sh"
  bsub "${irf_features}/cas9.PAMonly/Submits/submit_${1}_cas9.PAMonly_0.sh"
  bsub "${irf_features}/cas9.PAMonly.noncor/Submits/submit_${1}_cas9.PAMonly.noncor_0.sh"
}

# full
submit_irf_phase full
# train
submit_irf_phase train
# once the train submissions are done run the test submissions
# test
submit_irf_phase test
# Andes
# Post-processing of the four iRF runs. Each run repeats the same steps:
#   shell: cd into the run directory and run iRF_postProcessing.py; the bare
#          number recorded right after it is the value noted from that run
#          (presumably one of the requested accuracy metrics - confirm which)
#   shell: list the ten rows of topVarEdges/cut.score_top95.txt with the
#          largest third column (recorded below each command)
#   R:     Pearson correlation between observed and predicted cut.score for
#          one test set (fold9 / Set4), with the recorded result
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
# ---- run 1: all features (cas9.pam, run name dwt20bp.pam) ----
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.pam
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.pam
# 0.2350489224029133
# sort on field 3, general-numeric, descending
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20homo_lumo_energygapraw cut.score 0.01919548963782504
# p15.CCsgRNA.raw cut.score 0.013907416229868005
# p20xz_quadrupoleraw cut.score 0.012652986663673108
# p20yz_quadrupoleraw cut.score 0.012569569979479526
# sgRNA.tempsgRNA.raw cut.score 0.012072317795665853
# sgRNA.gcsgRNA.raw cut.score 0.011219166624725191
# p18xz_quadrupoleraw cut.score 0.009706390075265553
# sgRNA_gene.dwtd22.y cut.score 0.00959143195770127
# sgRNA_gene.dwtd22.x cut.score 0.009494634752462105
# sgRNA_gene.dwtd22 cut.score 0.009164101573607853
# pearson correlation
# (R) observed vs. predicted cut.score for fold9 / Set4
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.pam/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.pam_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.4868159
# ---- run 2: all features after the correlation filter (cas9.pam.noncor) ----
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.pam.noncor
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.pam.noncor
# 0.2295961790075612
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20homo_lumo_energygapraw cut.score 0.030798847742975047
# p20xz_quadrupoleraw cut.score 0.030675071981209088
# sgRNA_gene.dwtd22.x cut.score 0.029240648430965816
# sgRNA.gcsgRNA.raw cut.score 0.024714352194683632
# p18xz_quadrupoleraw cut.score 0.02397215605283516
# p19rot_constants_yraw cut.score 0.01729264291200995
# p19rot_constants_zraw cut.score 0.015932186840140517
# p15.CCsgRNA.raw cut.score 0.012437057306555586
# sgRNA_rna.dwtd21.x cut.score 0.011483571355085154
# PAM_pam.dwtd2.x cut.score 0.010540331286116459
# pearson correlation
# (R) observed vs. predicted cut.score for fold9 / Set4
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.pam.noncor/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.pam.noncor_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.4805851
# ---- run 3: PAM-only features (cas9.PAMonly) ----
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.PAMonly
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt cas9.PAMonly
# 0.04678625196344603
sort -k3rg topVarEdges/cut.score_top95.txt | head
# PAM_pam.dwtd1 cut.score 0.027885296477834776
# PAM_pam.dwtd6.y cut.score 0.027526178259270923
# PAM_pam.dwtd1.x cut.score 0.027261736661964815
# PAM_pam.dwtd2.y cut.score 0.02632383180802727
# PAM_pam.dwtd4.y cut.score 0.02560209746617329
# PAM_pam.dwtd3.y cut.score 0.02439228881818026
# PAM_pam.dwtd5.y cut.score 0.024155364071387485
# PAM_pam.dwtd7.y cut.score 0.023758235288131483
# PAM_pam.dwtd3 cut.score 0.022610619386171555
# PAM_pam.dwtd3.x cut.score 0.022301171414720752
# pearson correlation
# (R) observed vs. predicted cut.score for fold9 / Set4
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.PAMonly/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("cas9.PAMonly_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.221513
# ---- run 4: PAM-only features after the correlation filter (cas9.PAMonly.noncor) ----
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.PAMonly.noncor
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt cas9.PAMonly.noncor
# (no value was recorded for this run)
sort -k3rg topVarEdges/cut.score_top95.txt | head
# PAM_pam.dwtd1.x cut.score 0.054715467791729575
# PAM_pam.dwtd21.x cut.score 0.04998942075851293
# PAM_pam.dwtd3.x cut.score 0.04359227618820572
# PAM_pam.dwtd2.x cut.score 0.03996988162537398
# PAM_pam.dwtd10.x cut.score 0.036363458198525676
# PAM_pam.dwtd8.x cut.score 0.03557711462098819
# PAM_pam.dwtd12.x cut.score 0.03479499869709338
# PAM_pam.dwtd11.x cut.score 0.03441286578172495
# PAM_pam.dwtd6.y cut.score 0.03437317281141317
# PAM_pam.dwtd14.x cut.score 0.033985637894167996
# pearson correlation
# (R) observed vs. predicted cut.score for fold9 / Set4
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.PAMonly.noncor/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("cas9.PAMonly.noncor_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.2229726
----> remove any features with zero variance... <---- (and only use the exact-match DWT for PAM density)
#mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var
## Build matrix.txt: cut score plus all features that have non-zero variance,
## after removing two of the three PAM DWT column groups.
library(dplyr)  # left_join
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
features <- read.delim("dwt20bp.pam.features.txt", header = TRUE, sep = "\t")
score <- read.delim("dwt20bp.pam.score.txt", header = TRUE, sep = "\t")
# 40467
### Cas cuts 3-4bp upstream so the PAM (where Cas binds) should be downstream of the guide
### remove PAM DWT features for the matching bin and the upstream bin
names(features)
# NOTE(review): the positions below were chosen by reading names(features);
# re-check them whenever the feature table changes.
features.pam <- features[, c(1:2137, 2184:2206)]
ncol(features.pam)
# 2160
### calculate variance for all features and remove features with zero variance..
# Variance is evaluated column by column on numeric columns only, so the ID
# column is never coerced. Columns whose variance is NA are kept.
zero.var <- vapply(
  features.pam,
  function(column) is.numeric(column) && isTRUE(var(column) == 0),
  logical(1)
)
which(zero.var)
# A logical mask keeps every column when no feature has zero variance; negative
# indexing with an empty index vector would select no columns at all.
features.novar <- features.pam[!zero.var]
ncol(features.novar)
# 2037
df <- left_join(score, features.novar, by = "sgRNAID")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var")
write.table(df, "matrix.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# python
# Correlation filter on matrix.txt: for every pair of features with Pearson
# correlation >= 0.9, the later column of the pair is dropped.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.txt')
# Drop the two leading columns (sample ID and cut.score) and keep every feature
# column. A slice of 2:-1 would also discard the last feature, which then could
# never appear among the selected columns.
data = data.iloc[:, 2:]
label_encoder = LabelEncoder()
# NOTE(review): this label-encodes the first remaining feature column. If that
# column is numeric, its values are replaced by integer codes before the
# correlation is computed - confirm this is intended.
data.iloc[:, 0] = label_encoder.fit_transform(data.iloc[:, 0]).astype('float64')
corr = data.corr()
# keep-mask over the columns of corr; column j is dropped as soon as any earlier
# column i has corr >= 0.9 with it (negative correlations are not filtered)
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i + 1, corr.shape[0]):
        if corr.iloc[i, j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = data.columns[columns]
data = data[selected_columns]
data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.corr.csv")
# R
# Shell: source the conda loader and activate the shared "test" environment
# before starting R (presumably this environment provides R; confirm).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The commented-out shell command below copies the first line (the column
# header) of matrix.corr.csv into matrix.corr.header.csv.
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.corr.csv > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.corr.header.csv
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var")
# Names of the features that survived the correlation filter: the header line
# of matrix.corr.csv. That file is written with a row index, so the first
# field is an empty label and the remaining fields are feature names.
df.noncor <- read.delim("matrix.corr.header.csv", header=F, sep=",")
ncol(df.noncor)
# 786
df <- read.delim("matrix.txt", header=T, sep="\t")
df.subset <- df[ , which(names(df) %in% df.noncor[1,])]
# Every column of df.subset is a feature: the header never lists sgRNAID or
# the score, because the filter step starts at the third column. Keep them
# all; the former [,2:ncol(df.subset)] silently dropped the first retained
# feature.
df.mat <- as.matrix(df.subset)
# Put the sample IDs back in front of the retained features.
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
ncol(df.mat.id)
# 785 (recorded before the fix above; one more column is expected now)
# The same table is written under both the .features and .features_overlap
# names; the noSampleIDs variant omits the ID column.
write.table(df.mat.id, "matrix.noncor.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "matrix.noncor.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "matrix.noncor.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# Write three more variants of the feature matrix, each as *.features.txt,
# *.features_overlap.txt (same content) and *_noSampleIDs.txt (no ID column).
names(df)
# "noncor.noDWT": columns 1, 3-1529 and 2010-2015 of df (read from matrix.txt).
# These are cut from the full matrix, not from the correlation-filtered
# df.subset, despite "noncor" in the output file names.
df.nodwt.id <- df[,c(1,3:1529,2010:2015)]
df.nodwt.noid <- df[,c(3:1529,2010:2015)]
write.table(df.nodwt.id, "matrix.noncor.noDWT.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.nodwt.id, "matrix.noncor.noDWT.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.nodwt.noid, "matrix.noncor.noDWT.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# "var": every column of df except column 2 (with ID), or from column 3 on
# (without ID).
df.id <- df[,c(1,3:ncol(df))]
df.noid <- df[,3:ncol(df)]
write.table(df.id, "matrix.var.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.id, "matrix.var.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.noid, "matrix.var.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
names(df.mat.id)
# NOTE(review): names(df.mat.id) is inspected above, but the column indices
# below are applied to df.id, which is a different table with a different
# column layout. This looks like a mix-up between the two tables; confirm
# before reusing the "var.noDWT" files.
df.nodwt.var.id <- df.id[,c(1:626,761:766)]
df.nodwt.var.noid <- df.id[,c(2:626,761:766)]
write.table(df.nodwt.var.id, "matrix.var.noDWT.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.nodwt.var.id, "matrix.var.noDWT.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.nodwt.var.noid, "matrix.var.noDWT.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
## zero variance features removed
# Print the number of whitespace-separated fields on the header line of each
# matrix file; the recorded count follows each command.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var
## full matrix (with PAM and DWTs)
awk 'NR == 1 { print NF; exit }' matrix.txt
# 2038
## matrix based on features with < 0.9 correlation
awk 'NR == 1 { print NF; exit }' matrix.noncor.features.txt
# 785
## accidentally the noDWT matrix with zero variance removed, but correlation not considered
awk 'NR == 1 { print NF; exit }' matrix.noncor.noDWT.features.txt
# 1534
## noDWT matrix, accidentally with correlation considered??
awk 'NR == 1 { print NF; exit }' matrix.var.noDWT.features.txt
# 632
## full matrix with zero variance features removed (same as matrix.txt)
awk 'NR == 1 { print NF; exit }' matrix.var.features.txt
# 2037
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
module load python/3.7-anaconda3
irf_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
irf_setup=/gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
matrix_dir=$irf_root/Cas9.allFeatures.corr.var
y_file=$irf_root/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.pam.score.txt

#######################################
# Run the iRF builder script for one run from inside that run's directory.
# Globals:   irf_root, irf_setup, matrix_dir, y_file (read)
# Arguments: $1 - run name, also the directory name under $irf_root
#            $2 - feature matrix file name inside $matrix_dir
# Returns:   non-zero without starting the builder when the run directory
#            cannot be entered, so the builder never runs in whatever
#            directory the shell happened to be in.
#######################################
setup_irf_run() {
  cd "$irf_root/$1" || return
  python "$irf_setup" --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName "$1" --bypass --Prediction "$matrix_dir/$2" "$y_file"
}

## variance > 0 & correlation < 0.9
setup_irf_run Cas9.allFeatures.corr.var matrix.noncor.features.txt
## variance > 0
# mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.var
setup_irf_run Cas9.allFeatures.var matrix.var.features.txt
## remove DWT (this is actually only after removing zero variance)
# mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noncor.noDWT
setup_irf_run Cas9.allFeatures.noncor.noDWT matrix.noncor.noDWT.features.txt
## no DWT (this is actually after removing zero variance and highly correlated)
# mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.var.noDWT
setup_irf_run Cas9.allFeatures.var.noDWT matrix.var.noDWT.features.txt
# Summit
# Submit the full, train and test job scripts of four runs, stage by stage.
irf_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
irf_runs=(
  Cas9.allFeatures.corr.var
  Cas9.allFeatures.var
  Cas9.allFeatures.noncor.noDWT
  Cas9.allFeatures.var.noDWT
)
cd "$irf_root/Cas9.allFeatures.corr.var"
module load python/3.7.0-anaconda3-5.3.0
# full
for run_name in "${irf_runs[@]}"; do
  bsub "$irf_root/$run_name/Submits/submit_full_${run_name}_0.sh"
done
# train
for run_name in "${irf_runs[@]}"; do
  bsub "$irf_root/$run_name/Submits/submit_train_${run_name}_0.sh"
done
# once the train submissions are done run the test submissions
# test
for run_name in "${irf_runs[@]}"; do
  bsub "$irf_root/$run_name/Submits/submit_test_${run_name}_0.sh"
done
# Andes
# Post-process the Cas9.allFeatures.corr.var run (shell), list the first ten
# lines of its top-95 edge file sorted by descending third column, then
# correlate predictions with observed scores for one test split (R).
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.corr.var
# Recorded value (presumably one of the requested accuracy metrics; which one
# is not stated here):
# 0.2302965490735392
sort -k3rg topVarEdges/cut.score_top95.txt | head
# sgRNA_gene.dwtd22.x cut.score 0.029568161532544573
# p20homo_lumo_energygapraw cut.score 0.02914057828444596
# p20xz_quadrupoleraw cut.score 0.029099517446779837
# sgRNA.gcsgRNA.raw cut.score 0.0252028668912803
# p18xz_quadrupoleraw cut.score 0.02369462662505263
# p19rot_constants_yraw cut.score 0.016235684603339257
# p19rot_constants_zraw cut.score 0.01620565743210744
# p15.CCsgRNA.raw cut.score 0.012404891961213707
# p20homo_energyraw cut.score 0.012115126317974347
# sgRNA_rna.dwtd21.x cut.score 0.011255142628531375
# pearson correlation
# R: observed vs. predicted cut.score for fold9 / Set4 of this run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.corr.var_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.4817219
# Same post-processing for the Cas9.allFeatures.var run: shell commands first,
# then the R correlation for fold9 / Set4.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.var
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.var
# Recorded value (metric not stated here):
# 0.2354958488742556
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20homo_lumo_energygapraw cut.score 0.022261760235400847
# p15.CCsgRNA.raw cut.score 0.013984430525219813
# p20xz_quadrupoleraw cut.score 0.012645915942767013
# p20yz_quadrupoleraw cut.score 0.012225371295176698
# p18xz_quadrupoleraw cut.score 0.012124403980819144
# sgRNA.tempsgRNA.raw cut.score 0.01182623702507589
# sgRNA.gcsgRNA.raw cut.score 0.011697866257478562
# sgRNA_gene.dwtd22.x cut.score 0.009634235257913282
# sgRNA_gene.dwtd22 cut.score 0.009459864202782418
# sgRNA_gene.dwtd22.y cut.score 0.009448427035320076
# pearson correlation
# R: observed vs. predicted cut.score.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.var/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.var_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.4864614
# Same post-processing for the Cas9.allFeatures.noncor.noDWT run: shell
# commands first, then the R correlation for fold9 / Set4.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noncor.noDWT
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.noncor.noDWT
# Recorded value (metric not stated here):
# 0.2507490685483556
sort -k3rg topVarEdges/cut.score_top95.txt | head
# pam.distance0 cut.score 0.03482762136643839
# TsgRNA.raw cut.score 0.026791266630562957
# CCsgRNA.raw cut.score 0.025696575727691236
# GGsgRNA.raw cut.score 0.024382387415176014
# sgRNA.gcsgRNA.raw cut.score 0.023520009588115936
# sgRNA.tempsgRNA.raw cut.score 0.02338928979320086
# p20homo_lumo_energygapraw cut.score 0.02256901725480594
# GsgRNA.raw cut.score 0.02145358827395304
# p20xz_quadrupoleraw cut.score 0.021094330199443116
# CsgRNA.raw cut.score 0.019083924997040622
# pearson correlation
# R: observed vs. predicted cut.score.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noncor.noDWT/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.noncor.noDWT_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.5041279
#### this one doesn't make sense...
# Same post-processing for the Cas9.allFeatures.var.noDWT run: shell commands
# first, then the R correlation for fold9 / Set4.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.var.noDWT
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.var.noDWT
# Recorded value (metric not stated here):
# 0.13435891654091953
sort -k3rg topVarEdges/cut.score_top95.txt | head
# AsgRNA.raw cut.score 0.05715759086730521
# GsgRNA.raw cut.score 0.05485955704133587
# CsgRNA.raw cut.score 0.04983667772947569
# CCsgRNA.raw cut.score 0.04150173043079704
# AAsgRNA.raw cut.score 0.03711265217517887
# GCsgRNA.raw cut.score 0.03653018258805882
# CAsgRNA.raw cut.score 0.03531438723607752
# CGsgRNA.raw cut.score 0.03475354340678201
# GGsgRNA.raw cut.score 0.03406199444777779
# ATsgRNA.raw cut.score 0.03344958150366993
# pearson correlation
# R: observed vs. predicted cut.score.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.var.noDWT/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.var.noDWT_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.3700368
# --> take the full matrix, remove all DWT features, remove features with zero variance, then remove highly correlated features...
# Write matrix.noDWT.txt: matrix.txt restricted to columns 1-1529 and
# 2010-2015 (the column names are printed first for inspection).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var")
df <- read.delim(file = "matrix.txt", header = TRUE, sep = "\t")
names(df)
df.noDWT <- df[, c(1:1529, 2010:2015)]
write.table(
  df.noDWT,
  file = "matrix.noDWT.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)
# python
import numpy as np
import pandas as pd

def drop_correlated_features(features, threshold=0.9):
    """Return `features` without the columns that duplicate an earlier column.

    A column is removed when its Pearson correlation with any column to its
    left is >= `threshold`. Only positive correlations count, and a column
    that is itself removed can still cause later columns to be removed.
    """
    corr = features.corr()
    # Keep only the pairs (i, j) with i < j, then flag every column j that has
    # at least one earlier partner at or above the threshold (NaN never does).
    redundant = np.triu(corr.values >= threshold, k=1).any(axis=0)
    return features[features.columns[~redundant]]

if __name__ == "__main__":
    data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.noDWT.txt')
    # The first two columns are the sgRNA ID and the score; every later column
    # is a feature. The earlier `2:-1` slice also discarded the last feature,
    # and the first feature was label-encoded before correlating; both steps
    # are removed so that all features are compared on their real values.
    data = data.iloc[:, 2:]
    data = drop_correlated_features(data)
    # The row index is still written, so the header starts with an empty field.
    data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.noDWT.corr.csv")
# R
# Shell: source the conda loader and activate the shared "test" environment
# before starting R (presumably this environment provides R; confirm).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The commented-out shell command below copies the first line (the column
# header) of matrix.noDWT.corr.csv into matrix.noDWT.corr.header.csv.
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.noDWT.corr.csv > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.noDWT.corr.header.csv
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var")
# Names of the features that survived the correlation filter: the header line
# of matrix.noDWT.corr.csv. That file is written with a row index, so the
# first field is an empty label and the remaining fields are feature names.
df.noncor <- read.delim("matrix.noDWT.corr.header.csv", header=F, sep=",")
ncol(df.noncor)
# 632
df <- read.delim("matrix.noDWT.txt", header=T, sep="\t")
df.subset <- df[ , which(names(df) %in% df.noncor[1,])]
# Every column of df.subset is a feature: the header never lists sgRNAID or
# the score, because the filter step starts at the third column. Keep them
# all; the former [,2:ncol(df.subset)] silently dropped the first retained
# feature.
df.mat <- as.matrix(df.subset)
# Put the sample IDs back in front of the retained features.
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
ncol(df.mat.id)
# 631 (recorded before the fix above; one more column is expected now)
# Correlation-filtered noDWT matrix: with IDs (two file names, same content)
# and without IDs.
write.table(df.mat.id, "matrix.noDWT.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "matrix.noDWT.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "matrix.noDWT.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# "noDWT.2": the unfiltered noDWT matrix without column 2, again with and
# without the ID column.
write.table(df[,c(1,3:ncol(df))], "matrix.noDWT.2.features.txt", quote=F, row.names=F, sep="\t")
write.table(df[,c(1,3:ncol(df))], "matrix.noDWT.2.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df[,c(3:ncol(df))], "matrix.noDWT.2.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
module load python/3.7-anaconda3
irf_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
irf_setup=/gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
matrix_dir=$irf_root/Cas9.allFeatures.corr.var
y_file=$irf_root/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.pam.score.txt

#######################################
# Run the iRF builder script for one run from inside that run's directory.
# Globals:   irf_root, irf_setup, matrix_dir, y_file (read)
# Arguments: $1 - run name, also the directory name under $irf_root
#            $2 - feature matrix file name inside $matrix_dir
# Returns:   non-zero without starting the builder when the run directory
#            cannot be entered, so the builder never runs in whatever
#            directory the shell happened to be in.
#######################################
setup_irf_run() {
  cd "$irf_root/$1" || return
  python "$irf_setup" --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName "$1" --bypass --Prediction "$matrix_dir/$2" "$y_file"
}

# mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noDWT
setup_irf_run Cas9.allFeatures.noDWT matrix.noDWT.features.txt
### this matrix but before removing highly correlated... for a more direct comparison
# mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noDWT.2
setup_irf_run Cas9.allFeatures.noDWT.2 matrix.noDWT.2.features.txt
# Summit
# Submit the full, train and test job scripts of the two noDWT runs, stage by
# stage.
irf_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
irf_runs=(
  Cas9.allFeatures.noDWT
  Cas9.allFeatures.noDWT.2
)
cd "$irf_root/Cas9.allFeatures.noDWT"
module load python/3.7.0-anaconda3-5.3.0
# full
for run_name in "${irf_runs[@]}"; do
  bsub "$irf_root/$run_name/Submits/submit_full_${run_name}_0.sh"
done
# train
for run_name in "${irf_runs[@]}"; do
  bsub "$irf_root/$run_name/Submits/submit_train_${run_name}_0.sh"
done
# once the train submissions are done run the test submissions
# test
for run_name in "${irf_runs[@]}"; do
  bsub "$irf_root/$run_name/Submits/submit_test_${run_name}_0.sh"
done
# Andes
# Post-process the Cas9.allFeatures.noDWT run (shell), list the first ten
# lines of its top-95 edge file sorted by descending third column, then
# correlate predictions with observed scores for one test split (R).
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noDWT
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.noDWT
# Recorded value (metric not stated here):
# 0.24909940255855545
sort -k3rg topVarEdges/cut.score_top95.txt | head
# sgRNA.gcsgRNA.raw cut.score 0.051214107327122296
# pam.distance0 cut.score 0.0390891150243277
# p20xz_quadrupoleraw cut.score 0.034558136459621046
# p20homo_lumo_energygapraw cut.score 0.030664538962259687
# TsgRNA.raw cut.score 0.030149154785467987
# CCsgRNA.raw cut.score 0.028186199058959673
# GGsgRNA.raw cut.score 0.025499457557593048
# p18xz_quadrupoleraw cut.score 0.025008660086191233
# GsgRNA.raw cut.score 0.024787848055880204
# CsgRNA.raw cut.score 0.02153685337377076
# pearson correlation
# R: observed vs. predicted cut.score for fold9 / Set4 of this run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noDWT/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.noDWT_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.5007501
# Same post-processing for the Cas9.allFeatures.noDWT.2 run: shell commands
# first, then the R correlation. No results are recorded for this run; the
# bare "#" lines are empty placeholders.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noDWT.2
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.noDWT.2
#
sort -k3rg topVarEdges/cut.score_top95.txt | head
# pearson correlation
# R: observed vs. predicted cut.score for fold9 / Set4 of this run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noDWT.2/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.noDWT.2_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
#
# -- compare the models without DWT: with all features (Pearson r = 0.504) vs. after removing highly correlated features (Pearson r = 0.501)
# Compare the normalized edge files of two runs: count the lines, then sort by
# descending third column. Only the first 20 sorted lines are recorded.
# Legend (inferred by comparing the two recorded lists; confirm):
#   <----------  the feature also appears in the other recorded list
#   xxx          the feature does not appear in the other recorded list
## with all features
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noncor.noDWT
wc -l normalizedEdgeFiles/cut.score_Normalize.txt
# 1513
sort -k3rg normalizedEdgeFiles/cut.score_Normalize.txt
# pam.distance0 cut.score 0.03482762136643839 <----------
# TsgRNA.raw cut.score 0.026791266630562957 <----------
# CCsgRNA.raw cut.score 0.025696575727691236 <----------
# GGsgRNA.raw cut.score 0.024382387415176014 <----------
# sgRNA.gcsgRNA.raw cut.score 0.023520009588115936 <----------
# sgRNA.tempsgRNA.raw cut.score 0.02338928979320086 xxx
# p20homo_lumo_energygapraw cut.score 0.02256901725480594 <----------
# GsgRNA.raw cut.score 0.02145358827395304 <----------
# p20xz_quadrupoleraw cut.score 0.021094330199443116 <----------
# CsgRNA.raw cut.score 0.019083924997040622 <----------
# sgRNA.structuresgRNA.raw cut.score 0.01448776108491957 <----------
# AsgRNA.raw cut.score 0.013470150588746437 <----------
# p15.CCsgRNA.raw cut.score 0.012938114033078705 <----------
# p20yz_quadrupoleraw cut.score 0.012527103968460691 xxx
# TTsgRNA.raw cut.score 0.011690706991905126 <----------
# p18xz_quadrupoleraw cut.score 0.010924982588964079 <----------
# AAsgRNA.raw cut.score 0.010458888461034255 xxx
# GCsgRNA.raw cut.score 0.009470451081302062 <----------
# CAsgRNA.raw cut.score 0.009294200719673247 <----------
# CGsgRNA.raw cut.score 0.008839430980393824 xxx
## with all features minus those with correlation > 0.9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.noDWT
wc -l normalizedEdgeFiles/cut.score_Normalize.txt
# 624
sort -k3rg normalizedEdgeFiles/cut.score_Normalize.txt
# sgRNA.gcsgRNA.raw cut.score 0.051214107327122296 <----------
# pam.distance0 cut.score 0.0390891150243277 <----------
# p20xz_quadrupoleraw cut.score 0.034558136459621046 <----------
# p20homo_lumo_energygapraw cut.score 0.030664538962259687 <----------
# TsgRNA.raw cut.score 0.030149154785467987 <----------
# CCsgRNA.raw cut.score 0.028186199058959673 <----------
# GGsgRNA.raw cut.score 0.025499457557593048 <----------
# NOTE(review): the next line carries both marks; p18xz_quadrupoleraw does
# appear in the first list, so the "xxx" looks stale.
# p18xz_quadrupoleraw cut.score 0.025008660086191233 xxx <----------
# GsgRNA.raw cut.score 0.024787848055880204 <----------
# CsgRNA.raw cut.score 0.02153685337377076 <----------
# p19rot_constants_yraw cut.score 0.021093964854014366 xxx
# p19rot_constants_zraw cut.score 0.018975059167604516 xxx
# sgRNA.structuresgRNA.raw cut.score 0.017594738111106702 <----------
# AsgRNA.raw cut.score 0.015769866005249936 <----------
# TTsgRNA.raw cut.score 0.013307097682030181 <----------
# p20homo_energyraw cut.score 0.012304030349852717 xxx
# CAsgRNA.raw cut.score 0.01160497177584366 <----------
# p17molecular_volumeraw cut.score 0.01112169081151818 xxx
# p15.CCsgRNA.raw cut.score 0.011046455493025143 <----------
# GCsgRNA.raw cut.score 0.011012338512307948 <----------
# Build two matrices in which the columns selected as DWT features are
# replaced by their absolute values: one with only those columns
# (matrix.absDWTonly.txt) and one that also keeps the remaining feature
# columns (matrix.absDWT.txt). Both start with columns 1-2 of matrix.txt.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var")
df <- read.delim(file = "matrix.txt", header = TRUE, sep = "\t")
names(df)
df.score <- df[, 1:2]
df.noDWT <- df[, c(3:1529, 2010:2015)]
df.DWT <- df[, c(1530:2009, 2016:2038)]
df.absDWT <- abs(df.DWT)
# 503
df.absDWT.score <- cbind(df.score, df.absDWT)
df.absDWT.all.score <- cbind(df.score, df.noDWT, df.absDWT)
# 2038
write.table(
  df.absDWT.score,
  file = "matrix.absDWTonly.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)
write.table(
  df.absDWT.all.score,
  file = "matrix.absDWT.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)
# python
import numpy as np
import pandas as pd

def drop_correlated_features(features, threshold=0.9):
    """Return `features` without the columns that duplicate an earlier column.

    A column is removed when its Pearson correlation with any column to its
    left is >= `threshold`. Only positive correlations count, and a column
    that is itself removed can still cause later columns to be removed.
    """
    corr = features.corr()
    # Keep only the pairs (i, j) with i < j, then flag every column j that has
    # at least one earlier partner at or above the threshold (NaN never does).
    redundant = np.triu(corr.values >= threshold, k=1).any(axis=0)
    return features[features.columns[~redundant]]

if __name__ == "__main__":
    data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.absDWT.txt')
    # The first two columns are the sgRNA ID and the score; every later column
    # is a feature. The earlier `2:-1` slice also discarded the last feature,
    # and the first feature was label-encoded before correlating; both steps
    # are removed so that all features are compared on their real values.
    data = data.iloc[:, 2:]
    data = drop_correlated_features(data)
    # The row index is still written, so the header starts with an empty field.
    data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.absDWT.corr.csv")
# R
# Shell: source the conda loader and activate the shared "test" environment
# before starting R (presumably this environment provides R; confirm).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The commented-out shell command below copies the first line (the column
# header) of matrix.absDWT.corr.csv into matrix.absDWT.corr.header.csv.
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.absDWT.corr.csv > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/matrix.absDWT.corr.header.csv
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var")
# Names of the features that survived the correlation filter: the header line
# of matrix.absDWT.corr.csv. That file is written with a row index, so the
# first field is an empty label and the remaining fields are feature names.
df.noncor <- read.delim("matrix.absDWT.corr.header.csv", header=F, sep=",")
ncol(df.noncor)
# 788
df <- read.delim("matrix.absDWT.txt", header=T, sep="\t")
# 2038
df.subset <- df[ , which(names(df) %in% df.noncor[1,])]
# Every column of df.subset is a feature: the header never lists sgRNAID or
# the score, because the filter step starts at the third column. Keep them
# all; the former [,2:ncol(df.subset)] silently dropped the first retained
# feature.
df.mat <- as.matrix(df.subset)
# Put the sample IDs back in front of the retained features.
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
ncol(df.mat.id)
# 787 (recorded before the fix above; one more column is expected now)
# Correlation-filtered absDWT matrix: with IDs (two file names, same content)
# and without IDs.
write.table(df.mat.id, "matrix.absDWT.noncor.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "matrix.absDWT.noncor.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "matrix.absDWT.noncor.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# Unfiltered absDWT matrix without column 2, with and without the ID column.
write.table(df[,c(1,3:ncol(df))], "matrix.absDWT.features.txt", quote=F, row.names=F, sep="\t")
write.table(df[,c(1,3:ncol(df))], "matrix.absDWT.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df[,c(3:ncol(df))], "matrix.absDWT.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# Same three files for the absDWT-only matrix; df is re-read from that file.
df <- read.delim("matrix.absDWTonly.txt", header=T, sep="\t")
write.table(df[,c(1,3:ncol(df))], "matrix.absDWTonly.features.txt", quote=F, row.names=F, sep="\t")
write.table(df[,c(1,3:ncol(df))], "matrix.absDWTonly.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df[,c(3:ncol(df))], "matrix.absDWTonly.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
module load python/3.7-anaconda3
irf_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
irf_setup=/gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
matrix_dir=$irf_root/Cas9.allFeatures.corr.var
y_file=$irf_root/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.pam.score.txt

#######################################
# Create the directory of one run and run the iRF builder script inside it.
# Globals:   irf_root, irf_setup, matrix_dir, y_file (read)
# Arguments: $1 - run name, also the directory name under $irf_root
#            $2 - feature matrix file name inside $matrix_dir
# Returns:   non-zero without starting the builder when the run directory
#            cannot be created or entered. mkdir -p makes a repeated call
#            succeed when the directory already exists.
#######################################
create_irf_run() {
  mkdir -p "$irf_root/$1" || return
  cd "$irf_root/$1" || return
  python "$irf_setup" --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName "$1" --bypass --Prediction "$matrix_dir/$2" "$y_file"
}

create_irf_run Cas9.allFeatures.absDWT matrix.absDWT.features.txt
### same matrix with removing highly correlated features
create_irf_run Cas9.allFeatures.absDWT.noncor matrix.absDWT.noncor.features.txt
### DWT (absolute value) only
create_irf_run Cas9.allFeatures.absDWTonly matrix.absDWTonly.features.txt
# Summit
# Submit the full, train and test job scripts of the three absDWT runs, stage
# by stage.
irf_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
irf_runs=(
  Cas9.allFeatures.absDWT
  Cas9.allFeatures.absDWT.noncor
  Cas9.allFeatures.absDWTonly
)
cd "$irf_root/Cas9.allFeatures.absDWT"
module load python/3.7.0-anaconda3-5.3.0
# full
for run_name in "${irf_runs[@]}"; do
  bsub "$irf_root/$run_name/Submits/submit_full_${run_name}_0.sh"
done
# train
for run_name in "${irf_runs[@]}"; do
  bsub "$irf_root/$run_name/Submits/submit_train_${run_name}_0.sh"
done
# once the train submissions are done run the test submissions
# test
for run_name in "${irf_runs[@]}"; do
  bsub "$irf_root/$run_name/Submits/submit_test_${run_name}_0.sh"
done
# Andes
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWT
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.absDWT
#
sort -k3rg topVarEdges/cut.score_top95.txt | head
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWT/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.absDWT_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
#
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWT.noncor
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.absDWT.noncor
#
sort -k3rg topVarEdges/cut.score_top95.txt | head
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWT.noncor/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.absDWT.noncor_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
#
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWTonly
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.absDWTonly
#
sort -k3rg topVarEdges/cut.score_top95.txt | head
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWTonly/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.absDWTonly_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
#
--> Run again with the target node size specified (per Jonathan & Ashley's recommendation); doing it here so the results can be compared directly (same matrices, different settings).
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
# Set up the same three iRF-LOOP runs again with --targetNodeSize 50, each in
# its own "<run>.targetnode" directory (the run names themselves are unchanged):
#   absDWT        - full matrix
#   absDWT.noncor - same matrix with highly correlated features removed
#   absDWTonly    - DWT (absolute value) features only
# mkdir -p keeps a re-run from failing on an existing directory, and the &&
# chain stops the setup script from being launched from the previous run's
# directory when mkdir or cd fails.
module load python/3.7-anaconda3
irf_run_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
for feature_set in absDWT absDWT.noncor absDWTonly; do
  mkdir -p "${irf_run_root}/Cas9.allFeatures.${feature_set}.targetnode" \
    && cd "${irf_run_root}/Cas9.allFeatures.${feature_set}.targetnode" \
    && python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
      --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 \
      --Account SYB105 --NumTrees 1000 --NumIterations 5 \
      --RunName "Cas9.allFeatures.${feature_set}" --bypass --Prediction \
      --targetNodeSize 50 \
      "${irf_run_root}/Cas9.allFeatures.corr.var/matrix.${feature_set}.features.txt" \
      "${irf_run_root}/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.pam.score.txt"
done
# Summit: submit the generated LSF scripts for the three target-node-size runs.
# The run directories carry the ".targetnode" suffix; the script names do not.
irf_run_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
cd "${irf_run_root}/Cas9.allFeatures.absDWT.targetnode"
module load python/3.7.0-anaconda3-5.3.0
# full
for run_name in Cas9.allFeatures.absDWT Cas9.allFeatures.absDWT.noncor Cas9.allFeatures.absDWTonly; do
  bsub "${irf_run_root}/${run_name}.targetnode/Submits/submit_full_${run_name}_0.sh"
done
# train
for run_name in Cas9.allFeatures.absDWT Cas9.allFeatures.absDWT.noncor Cas9.allFeatures.absDWTonly; do
  bsub "${irf_run_root}/${run_name}.targetnode/Submits/submit_train_${run_name}_0.sh"
done
# once the train submissions are done run the test submissions
# test
for run_name in Cas9.allFeatures.absDWT Cas9.allFeatures.absDWT.noncor Cas9.allFeatures.absDWTonly; do
  bsub "${irf_run_root}/${run_name}.targetnode/Submits/submit_test_${run_name}_0.sh"
done
# Andes
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWT.targetnode
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.absDWT
# 0.2372319863023856
sort -k3rg topVarEdges/cut.score_top95.txt | head
p20homo_lumo_energygapraw cut.score 0.054740564994858486
p15.CCsgRNA.raw cut.score 0.024235184178713697
p18xz_quadrupoleraw cut.score 0.01771718069144364
p19.GGsgRNA.raw cut.score 0.017564616079933988
sgRNA.tempsgRNA.raw cut.score 0.015984303044398625
p20xz_quadrupoleraw cut.score 0.01596801626758395
p20yz_quadrupoleraw cut.score 0.015845475975071785
sgRNA.gcsgRNA.raw cut.score 0.012879087420534355
p18homo_lumo_energygapraw cut.score 0.009764382978873705
p18yz_quadrupoleraw cut.score 0.009147822258328542
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWT.targetnode/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.absDWT_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.4862182
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWT.noncor.targetnode
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.absDWT.noncor
#
sort -k3rg topVarEdges/cut.score_top95.txt | head
p20homo_lumo_energygapraw cut.score 0.057981511852906405
p20xz_quadrupoleraw cut.score 0.03886744229994534
p18xz_quadrupoleraw cut.score 0.03830156764420709
sgRNA.gcsgRNA.raw cut.score 0.032114099079561864
sgRNA_structure.dwtd21.x cut.score 0.029592869940389795
p19rot_constants_zraw cut.score 0.028991084710952984
p15.CCsgRNA.raw cut.score 0.023768635184783008
p19rot_constants_yraw cut.score 0.021731340297680965
sgRNA_ipd.dwtd20.x cut.score 0.018803879643221014
sgRNA_rna.dwtd21.x cut.score 0.01832739783981275
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWT.noncor.targetnode/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.absDWT.noncor_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.4829281
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWTonly.targetnode
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.allFeatures.absDWTonly
#
sort -k3rg topVarEdges/cut.score_top95.txt | head
PAM_pam.dwtd1 cut.score 0.042558302477903404
PAM_pam.dwtd3 cut.score 0.015820081525189444
PAM_pam.dwtd2 cut.score 0.013755565099031648
PAM_pam.dwtd4 cut.score 0.011181639622788253
sgRNA_structure.dwtd21.y cut.score 0.009123781601190236
sgRNA_structure.dwtd21 cut.score 0.009077912592000668
PAM_pam.dwtd6 cut.score 0.008528429810768264
PAM_pam.dwtd7 cut.score 0.00826982066576757
PAM_pam.dwtd8 cut.score 0.008265916045809677
PAM_pam.dwtd5 cut.score 0.007904841452861315
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.absDWTonly.targetnode/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("Cas9.allFeatures.absDWTonly_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.2037773
library(ranger)
#iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
# Iterative random forest (iRF) on top of ranger.
#
# Each iteration fits a ranger forest, turns its corrected impurity importances
# into split-selection weights (negative importances become weight 0) and uses
# those weights for the next fit. Progress is printed with cat()/print().
#
# Arguments:
#   xmat           predictor columns; combined with y through cbind()
#   y              response, one value per row of xmat
#   ntree          number of trees per forest
#   iter           maximum number of iterations
#   classification passed to ranger; FALSE prints r^2, TRUE prints the confusion matrix
#   threads        number of threads passed to ranger
#   alwayssplits   passed to ranger as always.split.variables
#   saveall        TRUE: return a list holding the forest of every iteration;
#                  FALSE: return only the forest of the last iteration that ran
#   min.node.size  passed to ranger as min.node.size (default 50, the value
#                  that used to be hard-coded here)
#
# Returns: a list of ranger objects (saveall=TRUE), a single ranger object
# (saveall=FALSE), or an empty list when no iteration ran.
iRF <- function(xmat, y, ntree=100, iter=1, classification=F, threads=1, alwayssplits=NULL, saveall=T, min.node.size=50)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with the same weight for every predictor column
  rfs <- list()
  rf <- NULL
  for(i in seq_len(iter)) # seq_len(): iter = 0 runs nothing, whereas 1:0 would iterate over 1 and 0
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    mtry = 0.5*sum(wt>0) # half of the predictors that still carry a positive weight
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         min.node.size = min.node.size,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    wt <- rf$variable.importance / sum(abs(rf$variable.importance)) # normalise by the total absolute importance
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if(saveall) rfs[[i]] <- rf
    # stop early once fewer than max(1% of (ncol(xmat)-1), 10) predictors keep a positive weight
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  # With saveall=FALSE the last forest used to be returned only after an early
  # stop; a run that used up all iterations came back as an empty list.
  if(!saveall && !is.null(rf)) rfs <- rf
  return(rfs)
}
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/")
df <- read.delim("matrix.absDWTonly.txt", header=T, sep="\t", stringsAsFactors = F)
iRF(df[,3:ncol(df)], df$cut.score)
# mtry: 251.5
# prediction error: 108.7599
# r^2: 0.01584152
# cor(y,yhat): 0.1547343
# SNPs with importance > 0: 179
### include leaf node size = 50
# mtry: 251.5
# prediction error: 107.8222
# r^2: 0.02432663
# cor(y,yhat): 0.1664798
# SNPs with importance > 0: 223
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var/")
df <- read.delim("matrix.absDWT.txt", header=T, sep="\t", stringsAsFactors = F)
iRF(df[,3:ncol(df)], df$cut.score)
# mtry: 1018
# prediction error: 86.296
# r^2: 0.2191152
# cor(y,yhat): 0.4684943
# SNPs with importance > 0: 1395
# Re-run iRF on the columns of df whose names are listed in the first row of
# the non-correlated header file.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var")
# Read every field as a plain string and flatten the row to a character vector.
# A one-row data frame on the right of %in% is compared column by column, and
# factor columns (the read.delim default before R 4.0) are compared by their
# integer codes instead of the names, so no column would be selected.
df.noncor <- read.delim("matrix.absDWT.corr.header.csv", header=F, sep=",", colClasses = "character")
noncor.names <- as.character(unlist(df.noncor[1,]))
df.subset <- df[ , which(names(df) %in% noncor.names)]
# NOTE(review): assumes the first two kept columns are still the ID and
# cut.score columns, as in the other iRF() calls - confirm against the header file.
iRF(df.subset[,3:ncol(df.subset)], df.subset$cut.score)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.corr.var")
df <- read.delim("matrix.txt", header=T, sep="\t")
df.DWT <- df[,c(1,2,1530:2009,2016:2038)]
iRF(df.DWT[,3:ncol(df.DWT)], df.DWT$cut.score)
# mtry: 251.5
# prediction error: 108.3767
# r^2: 0.01930901
# cor(y,yhat): 0.1647539
# SNPs with importance > 0: 180
df.noDWT <- df[,c(1:1529,2010:2015)]
iRF(df.noDWT[,3:ncol(df.noDWT)], df.noDWT$cut.score)
# mtry: 766.5
# prediction error: 85.44252
# r^2: 0.2268383
# cor(y,yhat): 0.4771703
# SNPs with importance > 0: 961
iRF(df[,3:ncol(df)], df$cut.score)
# mtry: 1018
# prediction error: 85.76362
# r^2: 0.2239326
# cor(y,yhat): 0.4737196
# SNPs with importance > 0: 1353
salloc -A SYB105 -p gpu -N 1 -t 2:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(ggplot2)
library(gridExtra)
library(reshape2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gene <- read.table("ecoli.gene.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.table("20sliding.protein.structure.fa", header=F, sep="\t", stringsAsFactors = F)
nuc <- read.table("nucleotide_counts_20sliding_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.table("ecoli.rnaseq.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
gatc <- read.table("ecoli.gatc.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
ipd <- read.table("ecoli.ipd.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
colnames(window) <- c("chr", "start", "end")
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
window.guide <- left_join(window, score.df, by=c("chr", "start", "end"))
window.v <- window[,1:3]
colnames(window.v) <- c("V1", "V2", "V3")
# Count GATC sites and genes per window on the raw tables first, then join the
# counts onto the full window list and zero-fill the windows without a hit.
# Joining first and counting afterwards gave every window at least one row, so
# n() reported at least one GATC site and one gene for every window.
gatc.bin <- gatc %>% group_by(V1, V2, V3) %>% mutate(gatc.count = n())
gatc.hits <- unique(gatc.bin[,c("V1", "V2", "V3", "gatc.count")])
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.hits <- unique(gene.bin[,c("V1", "V2", "V3", "gene.count")])
gatc.win <- left_join(window.v, gatc.hits, by=c("V1", "V2", "V3"))
gatc.win[is.na(gatc.win)] <- 0
gene.win <- left_join(window.v, gene.hits, by=c("V1", "V2", "V3"))
gene.win[is.na(gene.win)] <- 0
# IPD and RNA-seq are per-window values rather than counts: join and zero-fill only.
ipd.win <- left_join(window.v, ipd, by=c("V1", "V2", "V3"))
ipd.win[is.na(ipd.win)] <- 0
rnaseq.win <- left_join(window.v, rnaseq, by=c("V1", "V2", "V3"))
rnaseq.win[is.na(rnaseq.win)] <- 0
# The later joins use gatc.count / gene.count keyed by V1, V2, V3; they now
# hold one row per window, with 0 where nothing was found.
gatc.count <- gatc.win
gene.count <- gene.win
library(tidyr)
structure.coord <- separate(structure, V1, c("V1", "V2"), sep=":")
structure.df <- separate(structure.coord, V2, c("V2", "V3"), sep="-")
structure.df$V2 <- as.numeric(structure.df$V2)
structure.df$V3 <- as.numeric(structure.df$V3)
nuc.coord <- separate(nuc, Window, c("V1", "V2"), sep=":")
nuc.df <- separate(nuc.coord, V2, c("V2", "V3"), sep="-")
nuc.df$V2 <- as.numeric(nuc.df$V2)
nuc.df$V3 <- as.numeric(nuc.df$V3)
score.df$V1 <- score.df$chr
score.df$V2 <- score.df$start + 1
score.df$V3 <- score.df$end + 1
window.score.df <- left_join(window.v, score.df[,5:8], by=c("V1", "V2", "V3"))
window.score.temp <- left_join(window.score.df, nuc.df[,c(1:3,10)], by=c("V1", "V2", "V3"))
window.temp.gc <- left_join(window.score.temp, nuc.df[,c(1:3,9)], by=c("V1", "V2", "V3"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.df[,c(1:4)], by=c("V1", "V2", "V3"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rnaseq.win, by=c("V1", "V2", "V3"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.count, by=c("V1", "V2", "V3"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.count, by=c("V1", "V2", "V3"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.win, by=c("V1", "V2", "V3"))
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd, id=c("V1", "V2", "V3"))
colnames(df.melt) <- c("chr", "start", "end", "variable", "value")
df <- na.omit(df.melt)
pdf("ecoli.genome.pdf")
ggplot(df, aes(x=start, y=value, color=variable)) + geom_point() + theme_classic() + facet_grid(variable ~ .)
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.genome.pdf /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/.
#!/bin/bash
# Slurm batch script that runs ecoli.genome.distribution.R non-interactively.
# Directives below: -A account, -J job name, -N node count, -p partition,
# -t wall-time limit.
#SBATCH -A SYB105
#SBATCH -J genome.distribution
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH -t 24:00:00
# NOTE(review): --mem-per-cpu=0 presumably lifts the per-CPU memory cap - confirm in the sbatch documentation.
#SBATCH --mem-per-cpu=0
# NOTE(review): loadcondaandes.sh presumably makes conda available on Andes; its contents are not shown here.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The R script is given as a relative path, so it is resolved against this directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
R CMD BATCH ecoli.genome.distribution.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.genome.distribution.sh
Because of the order in which I merged the files, every bin was counted as having at least one of each feature; this needs to be fixed before running the DWT.
To do: create figures showing large-scale and small-scale regions of the DWT.
To do: output DWT values at all scales for chromosome regions.
# salloc -A SYB105 -N 2 -p gpu -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(tidyr)
library(wmtsa)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gatc <- read.table("ecoli.gatc.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
ipd <- read.table("ecoli.ipd.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
gene <- read.table("ecoli.gene.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
#structure <- read.table("20sliding.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
structure <- read.table("ecoli.20sliding.ViennaRNA.output.value.id.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.table("nucleotide_counts_20sliding_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.table("ecoli.rnaseq.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
pam <- read.table("NGG.PAM.20bp.sliding.windows.bed", header=F, sep="\t", stringsAsFactors = F)
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
gatc.bin <- gatc %>% group_by(V1, V2, V3) %>% mutate(gatc.count = n())
gatc.count <- unique(gatc.bin[,c(1:3,8)])
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c(1:3,14)])
pam.bin <- pam %>% group_by(V1, V2, V3) %>% mutate(pam.count = n())
pam.count <- unique(pam.bin[,c(1:3,12)])
window.v <- window[,1:3]
colnames(window.v) <- c("V1", "V2", "V3")
gatc.win <- left_join(window.v, gatc.count, by=c("V1", "V2", "V3"))
gatc.win[is.na(gatc.win)] <- 0
gene.win <- left_join(window.v, gene.count, by=c("V1", "V2", "V3"))
gene.win[is.na(gene.win)] <- 0
ipd.win <- left_join(window.v, ipd, by=c("V1", "V2", "V3"))
ipd.win[is.na(ipd.win)] <- 0
rnaseq.win <- left_join(window.v, rnaseq, by=c("V1", "V2", "V3"))
rnaseq.win[is.na(rnaseq.win)] <- 0
pam.win <- left_join(window.v, pam.count, by=c("V1", "V2", "V3"))
pam.win[is.na(pam.win)] <- 0
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/counts")
write.table(gatc.win, "GATC.20bp.sliding.windows.count.bed", quote=F, row.names=F, sep="\t")
write.table(gene.win, "Gene.20bp.sliding.windows.count.bed", quote=F, row.names=F, sep="\t")
write.table(pam.win, "NGG.PAM.20bp.sliding.windows.count.bed", quote=F, row.names=F, sep="\t")
write.table(ipd.win, "IPD.20bp.sliding.windows.count.bed", quote=F, row.names=F, sep="\t")
write.table(structure, "Structure.20bp.sliding.windows.count.bed", quote=F, row.names=F, sep="\t")
write.table(nuc, "Nucleotide.20bp.sliding.windows.count.bed", quote=F, row.names=F, sep="\t")
write.table(rnaseq.win, "RNAseq.20bp.sliding.windows.count.bed", quote=F, row.names=F, sep="\t")
gene.df <- gene.win$gene.count
gatc.df <- gatc.win$gatc.count
pam.df <- pam.win$pam.count
ipd.df <- ipd.win[,4]
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq.win[,4]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
# Run wavMODWT() on one per-window signal and write the result as a long table.
#   signal     vector with one value per window
#   value.name name for the coefficient column (e.g. "temp.dwt")
#   out.file   tab-separated output file, written to the working directory
# Returns the data.table that was written; its columns are
# label, <value.name>, scale, window.
modwt.table <- function(signal, value.name, out.file)
{
  coef.mat <- as.matrix(wavMODWT(signal))
  coef.dt <- as.data.table(data.frame(label = row.names(coef.mat), coef.mat))
  # Row labels are split on runs of non-alphanumeric characters; the two
  # pieces become the scale and window columns.
  coef.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
  # setnames() renames in place; colnames<- on a data.table copies it first.
  setnames(coef.dt, c("label", value.name, "scale", "window"))
  write.table(coef.dt, out.file, quote=F, row.names=F, sep="\t")
  coef.dt
}
temp.modwt.name <- modwt.table(temp.df, "temp.dwt", "temp.modwt.txt")
gc.modwt.name <- modwt.table(gc.df, "gc.dwt", "gc.modwt.txt")
# The structure signal now comes from the ViennaRNA table, so it is written to
# structure.vienna.modwt.txt rather than the earlier structure.modwt.txt.
structure.modwt.name <- modwt.table(structure.df, "structure.dwt", "structure.vienna.modwt.txt")
rna.modwt.name <- modwt.table(rna.df, "rna.dwt", "rnaseq.modwt.txt")
ipd.modwt.name <- modwt.table(ipd.df, "ipd.dwt", "ipd.modwt.txt")
gene.modwt.name <- modwt.table(gene.df, "gene.dwt", "gene.density.modwt.txt")
gatc.modwt.name <- modwt.table(gatc.df, "gatc.dwt", "gatc.density.modwt.txt")
pam.modwt.name <- modwt.table(pam.df, "pam.dwt", "pam.density.modwt.txt")
colnames(window) <- c("chr", "start", "end")
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
window$start <- as.numeric(window$start)
window$end <- as.numeric(window$end - 1)
write.table(window, "window.coordinates.txt", quote=F, row.names=F, sep="\t")
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
write.table(window.score.df, "window.score.txt", quote=F, row.names=F, sep="\t")
#!/bin/bash
# Slurm batch script that runs dwt.R non-interactively.
# Directives below: -A account, -J job name, -N node count, -p partition,
# -t wall-time limit.
#SBATCH -A SYB105
#SBATCH -J dwt
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 48:00:00
# NOTE(review): --mem-per-cpu=0 presumably lifts the per-CPU memory cap - confirm in the sbatch documentation.
#SBATCH --mem-per-cpu=0
# NOTE(review): loadcondaandes.sh presumably makes conda available on Andes; its contents are not shown here.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The R script is given as a relative path, so it is resolved against this directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# NOTE(review): two nodes are requested above, but only this single R process is started here - confirm -N 2 is needed.
R CMD BATCH dwt.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/dwt.sh
salloc -A SYB105 -N 2 -p gpu -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(tidyr)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
temp.modwt.name <- read.table("temp.modwt.txt", header=T, sep="\t", stringsAsFactors = F)
gc.modwt.name <- read.table("gc.modwt.txt", header=T, sep="\t", stringsAsFactors = F)
#structure.modwt.name <- read.table("structure.modwt.txt", header=T, sep="\t", stringsAsFactors = F)
structure.modwt.name <- read.table("structure.vienna.modwt.txt", header=T, sep="\t", stringsAsFactors = F)
rna.modwt.name <- read.table("rnaseq.modwt.txt", header=T, sep="\t", stringsAsFactors = F)
ipd.modwt.name <- read.table("ipd.modwt.txt", header=T, sep="\t", stringsAsFactors = F)
gene.modwt.name <- read.table("gene.density.modwt.txt", header=T, sep="\t", stringsAsFactors = F)
gatc.modwt.name <- read.table("gatc.density.modwt.txt", header=T, sep="\t", stringsAsFactors = F)
pam.modwt.name <- read.table("pam.density.modwt.txt", header=T, sep="\t", stringsAsFactors = F)
gene.modwt.name$window <- as.integer(gene.modwt.name$window)
gatc.modwt.name$window <- as.integer(gatc.modwt.name$window)
pam.modwt.name$window <- as.integer(pam.modwt.name$window)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
colnames(window) <- c("chr", "start", "end")
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
window$start <- as.numeric(window$start)
window$end <- as.numeric(window$end - 1)
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
window.score.df$window <- as.integer(window.score.df$window)
window.score.temp <- left_join(window.score.df, temp.modwt.name[,c(3,4,2)], by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd.pam <- left_join(window.temp.gc.structure.rna.gene.gatc.ipd, pam.modwt.name[,c(3,4,2)], by=c("window", "scale"))
# 1293773
# Keep only the rows that have a cut score. Comparing against the string "NA"
# filtered these rows only because the comparison itself evaluates to NA,
# which subset() drops; is.na() states the intent directly.
window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd.pam, !is.na(cut.score))
# 931362
write.table(window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA, "ecoli.20sliding.exact.DWTall.txt", quote=F, row.names=F, sep="\t")
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA[,c(4,5,7:15)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40468
#write.table(df.dcast.na, "ecoli.20sliding.exact.DWTall.dcast.txt", quote=F, row.names=F, sep="\t")
write.table(df.dcast.na, "ecoli.20sliding.exact.DWTall.vienna.dcast.txt", quote=F, row.names=F, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# ## upstream bin
# score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
# colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# score.df <- score[,c(1:4,8)]
#
# score.df$chr <- score.df$chr
# score.df$end <- score.df$start
# score.df$start <- score.df$start - 19
#
# window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
# window.score.df$window <- as.integer(window.score.df$window)
# window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
# window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene.gatc.ipd.pam <- left_join(window.temp.gc.structure.rna.gene.gatc.ipd, pam.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd.pam, window.temp.gc.structure.rna.gene.gatc.ipd.pam$cut.score != "NA")
#
# df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA[,c(4,5,7:15)], id=c("cut.score", "scale", "sgRNA"))
# df <- na.omit(df.melt)
# colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
#
# df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
# df.dcast.na <- na.omit(df.dcast)
# #
# write.table(df.dcast.na, "ecoli.20sliding.up20.DWTall.dcast.txt", quote=F, row.names=F, sep="\t")
#
#
#
# ## downstream bin
# score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
# colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# score.df <- score[,c(1:4,8)]
#
# score$chr <- score$chr
# score$start <- score$end
# score$end <- score$start + 19
#
# window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
# window.score.df$window <- as.integer(window.score.df$window)
# window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
# window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene.gatc.ipd.pam <- left_join(window.temp.gc.structure.rna.gene.gatc.ipd, pam.modwt.name, by=c("window", "scale"))
# window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd.pam, window.temp.gc.structure.rna.gene.gatc.ipd.pam$cut.score != "NA")
#
# df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA[,c(4,5,7:15)], id=c("cut.score", "scale", "sgRNA"))
# df <- na.omit(df.melt)
# colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
#
# df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
# df.dcast.na <- na.omit(df.dcast)
# #
# write.table(df.dcast.na, "ecoli.20sliding.down20.DWTall.dcast.txt", quote=F, row.names=F, sep="\t")
--> Combine to generate the full feature matrix
# combine regional DWT with other features
# Reads the per-sgRNA DWT matrix and the existing feature matrix, joins them on
# the sgRNA/ID key, and writes the combined matrix to a new file.
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# The earlier (non-"vienna") DWT file is kept commented out for reference.
#df.dcast.na <- read.delim("ecoli.20sliding.exact.DWTall.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.na <- read.delim("ecoli.20sliding.exact.DWTall.vienna.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# Split the "sgRNA_ID" key into two columns so it can be matched below.
df.dcast.sep <- df.dcast.na %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
# Everything from column 4 onward is treated as a DWT feature and gets the
# "sgRNA_" prefix; the first three columns are kept unprefixed.
df.dcast.dwt <- df.dcast.sep[,c(4:ncol(df.dcast.sep))]
colnames(df.dcast.dwt) <- paste0('sgRNA_', colnames(df.dcast.dwt))
df.dcast <- cbind(df.dcast.sep[,1:3], df.dcast.dwt)
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# df.dcast.na2 <- read.delim("ecoli.20sliding.up20.DWTall.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# df.dcast.sep2 <- df.dcast.na2 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
# df.dcast.dwt2 <- df.dcast.sep2[,c(4:ncol(df.dcast.sep2))]
# colnames(df.dcast.dwt2) <- paste0('sgRNA_', colnames(df.dcast.dwt2))
# df.dcast2 <- cbind(df.dcast.sep2[,1:3], df.dcast.dwt2)
#
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# df.dcast.na3 <- read.delim("ecoli.20sliding.down20.DWTall.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# df.dcast.sep3 <- df.dcast.na3 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
# df.dcast.dwt3 <- df.dcast.sep3[,c(4:ncol(df.dcast.sep3))]
# colnames(df.dcast.dwt3) <- paste0('sgRNA_', colnames(df.dcast.dwt3))
# df.dcast3 <- cbind(df.dcast.sep3[,1:3], df.dcast.dwt3)
# Existing feature matrix; only columns 1:1649 and 2133:2138 are kept
# (NOTE(review): positional selection -- confirm against the file's header).
df <- read.delim("ecoli.20sliding.pam.all.features.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df <- df[,c(1:1649,2133:2138)]
# Split "sgRNA_ID_type" so the first two parts can serve as the join key.
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep)
# 118140
ncol(df.sep)
# 1657
# Join on sgRNA + ID. Column 3 of df.dcast is left out so that only the key and
# the prefixed DWT columns are added (ncol(df.dcast.sep) equals ncol(df.dcast)
# because df.dcast was built from it with cbind).
df.sep.region <- inner_join(df.sep, df.dcast[,c(1,2,4:ncol(df.dcast.sep))], by=c("sgRNA", "ID"))
# df.sep.region2 <- inner_join(df.sep.region, df.dcast2[,c(1,2,4:ncol(df.dcast.sep2))], by=c("sgRNA", "ID"))
# df.sep.region3 <- inner_join(df.sep.region2, df.dcast3[,c(1,2,4:ncol(df.dcast.sep3))], by=c("sgRNA", "ID"))
# ## note that dwt.x = sgRNA dwt, dwt.y = upstream dwt, dwt = downstream dwt
# df.sep.region.id <- df.sep.region3 %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
# nrow(df.sep.region.id)
# #
# Rebuild the single "sgRNA_ID_type" identifier column before writing.
df.sep.region.id <- df.sep.region %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep.region.id)
# 118140
ncol(df.sep.region.id)
# 1839
#write.table(df.sep.region.id, "ecoli.20sliding.dwtexact.all.features.dcast.26oct.txt", quote=F, row.names=F, sep="\t")
write.table(df.sep.region.id, "ecoli.20sliding.dwtexact.all.features.dcast.5nov.txt", quote=F, row.names=F, sep="\t")
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J iRF.test
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
# SLURM batch job: activate the conda environment and run the two iRF R
# scripts one after the other from the e.coli project directory.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Abort if the project directory is unavailable; otherwise R CMD BATCH would
# look for the scripts (and write its .Rout files) in the submission directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ || {
  echo "iRF.test: cannot cd to project directory" >&2
  exit 1
}
R CMD BATCH iRF.test.R
R CMD BATCH iRF.test.full.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.test.sh
#!/bin/bash -l
#BSUB -P SYB105
#BSUB -W 04:00
#BSUB -J iRF.test_0
#BSUB -nnodes 50
# LSF batch job: load R and run the full iRF script from the e.coli project
# directory.
# NOTE(review): 50 nodes are requested but R is started directly, with no
# visible parallel launcher -- confirm the script itself distributes the work.
module load r
# Abort if the project directory is unavailable so R CMD BATCH does not run
# against the wrong working directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ || {
  echo "iRF.test_0: cannot cd to project directory" >&2
  exit 1
}
R CMD BATCH iRF.test.full.R
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.test.summit.sh
# salloc -A SYB105 -p gpu -N 1 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# For each dated feature matrix, write three files next to it:
#   <base>.cas9.txt         rows whose line contains "_Cas9"
#   <base>.header.txt       the header line only
#   <base>.cas9.header.txt  header followed by the Cas9 rows
# Nothing is written unless the cd succeeds. No `exit` is used because these
# commands are run inside an interactive allocation.
if cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/; then
  for tag in 26oct 5nov; do
    base="ecoli.20sliding.dwtexact.all.features.dcast.${tag}"
    grep '_Cas9' "${base}.txt" > "${base}.cas9.txt"
    # Stop after the first line instead of scanning the whole matrix.
    awk 'NR==1{print; exit}' "${base}.txt" > "${base}.header.txt"
    cat "${base}.header.txt" "${base}.cas9.txt" > "${base}.cas9.header.txt"
  done
fi
R
library(ranger)
# Iterative random forest (iRF) built on ranger.
#
# Fits up to `iter` forests in sequence. The first forest uses equal split
# weights for every column of `xmat`; each later forest uses the previous
# forest's corrected-impurity importances (scaled by the sum of their absolute
# values, negatives set to zero) as split.select.weights. Progress is printed
# with cat()/print() on every iteration.
#
# Arguments:
#   xmat            feature table; one column per predictor.
#   y               response; bound to xmat as column "Y".
#   ntree           number of trees per forest (ranger num.trees).
#   iter            maximum number of iterations.
#   classification  passed to ranger; FALSE also prints r^2, TRUE prints the
#                   confusion matrix.
#   threads         passed to ranger as num.threads.
#   alwayssplits    passed to ranger as always.split.variables.
#   saveall         TRUE: return a list with the fit of every iteration;
#                   FALSE: return only the last fit.
#
# Returns: a list of ranger fits (saveall=TRUE), or the last ranger fit
#          (saveall=FALSE); an empty list if no iteration ran.
iRF <- function(xmat, y, ntree=200, iter=5, classification=FALSE, threads=1, alwayssplits=NULL, saveall=TRUE)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weight per feature column
  rfs <- list()
  rf <- NULL
  for(i in seq_len(iter)) # seq_len: iter = 0 runs no iterations (1:0 would run two)
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # NOTE(review): mtry can be fractional (recorded runs show e.g. 11.5);
    # presumably ranger truncates it -- confirm.
    mtry = 0.5*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = TRUE,
                         always.split.variables = alwayssplits)
    imp <- rf$variable.importance
    imp.total <- sum(abs(imp))
    # Scale importances into weights. If every importance is zero, use all-zero
    # weights (which triggers the stop rule below) instead of dividing by zero.
    wt <- if(imp.total > 0) imp / imp.total else rep(0, length(imp))
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if(saveall) rfs[[i]] <- rf
    # Stop early once too few features keep a positive weight.
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  # With saveall = FALSE always hand back the last fit. Previously it was
  # returned only when the early-stop rule fired; otherwise the result was an
  # empty list.
  if(!saveall && !is.null(rf)) rfs <- rf
  return(rfs)
}
# all features
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.all.features.dcast.26oct.cas9.header.txt", header=TRUE, sep="\t", stringsAsFactors = FALSE)
# Random subset of at most 10,000 rows. No seed is set, so the subset differs
# between sessions.
df.sample <- df[sample(nrow(df), min(10000, nrow(df))), ]
names(df.sample)
# sgRNAID: [,1]
# cut.score: [,2]
# one-hot independent: [,c(3:17,1645:1649,1651:1652,1654:1655)]
# one-hot dependent: [,c(18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579)]
# chemical tensors: [,c(58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)]
# raw (gc, structure, temp, gene.distance, pam.distance): [,c(1642:1644,1650,1653)]
# DWT gatc motif: [,1656:1678]
# DWT gc content: [,1679:1701]
# DWT gene density: [,1702:1724]
# DWT ipd: [,1725:1747]
# DWT PAM: [,1748:1770]
# DWT rna-seq expression: [,1771:1793]
# DWT rna structure: [,1794:1816]
# DWT melting temp: [,1817:1839]

# Column index sets, defined once from the layout above. Every combination
# below is composed from these, so a family cannot be added to the wrong run or
# listed twice.
raw.idx <- c(1642:1644,1650,1653)                              # 5 columns
onehot.ind.idx <- c(3:17,1645:1649,1651:1652,1654:1655)        # 24 columns
onehot.dep.idx <- c(18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579) # 384 columns
onehot.idx <- sort(c(onehot.ind.idx, onehot.dep.idx))          # 408 columns
tensor.idx <- c(58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641) # 1240 columns
dwt.idx <- 1656:1839                                           # 184 columns
pam.dwt.idx <- 1748:1770
gene.dwt.idx <- 1702:1724

# all DWT features (gc, temp, structure, gene density, RNAseq, IPD, GATC, PAM)
iRF(df.sample[,dwt.idx], df.sample$cut.score)
# absolute value DWT
iRF(abs(df.sample[,dwt.idx]), df.sample$cut.score)
# PAM DWT
iRF(df.sample[,pam.dwt.idx], df.sample$cut.score)
iRF(df[,pam.dwt.idx], df$cut.score)
# Gene DWT
iRF(df.sample[,gene.dwt.idx], df.sample$cut.score)
# Raw features (gc, temp, structure, location, pam)
iRF(df.sample[,raw.idx], df.sample$cut.score)
# One-hot features (dependent & independent sgRNA, PAM)
## independent
iRF(df.sample[,onehot.ind.idx], df.sample$cut.score)
## dependent
iRF(df.sample[,onehot.dep.idx], df.sample$cut.score)
## ind/dep
iRF(df.sample[,onehot.idx], df.sample$cut.score)
# Tensor features
iRF(df.sample[,tensor.idx], df.sample$cut.score)
# Raw + Onehot
iRF(df.sample[,c(raw.idx, onehot.idx)], df.sample$cut.score)
# Raw + Tensor
iRF(df.sample[,c(raw.idx, tensor.idx)], df.sample$cut.score)
# Raw + DWT
iRF(df.sample[,c(raw.idx, dwt.idx)], df.sample$cut.score)
# Onehot + Tensor
# (the raw columns were previously included here by mistake; the recorded run
# lists 1648 = 408 + 1240 features)
iRF(df.sample[,c(onehot.idx, tensor.idx)], df.sample$cut.score)
# Onehot + DWT
iRF(df.sample[,c(onehot.idx, dwt.idx)], df.sample$cut.score)
# Tensor + DWT
iRF(df.sample[,c(tensor.idx, dwt.idx)], df.sample$cut.score)
# Raw + Onehot + Tensor
# (the raw columns were previously listed twice, which duplicates them in the
# model matrix; the recorded run lists 1653 features)
iRF(df.sample[,c(raw.idx, onehot.idx, tensor.idx)], df.sample$cut.score)
# Raw + Onehot + DWT
iRF(df.sample[,c(raw.idx, onehot.idx, dwt.idx)], df.sample$cut.score)
# Raw + Tensor + DWT
iRF(df.sample[,c(raw.idx, tensor.idx, dwt.idx)], df.sample$cut.score)
# Onehot + Tensor + DWT
# (raw columns removed, as for Onehot + Tensor; the recorded run lists 1832)
iRF(df.sample[,c(onehot.idx, tensor.idx, dwt.idx)], df.sample$cut.score)
# Raw + Onehot + Tensor + DWT
# (raw columns now listed once; the recorded run lists 1837)
iRF(df.sample[,c(raw.idx, onehot.idx, tensor.idx, dwt.idx)], df.sample$cut.score)
#### noting that iteration 5 is generally NOT the best iteration... iteration 2-4 appears to be better...
## All features [subset 10,000 samples]
iRF(df.sample[,3:ncol(df.sample)], df.sample$cut.score)
## All features [full dataset, not subset]
iRF(df[,3:ncol(df)], df$cut.score)
### output for matrix including cas9, espcas9, recacas9 (sampled to 10,000 rows)
# Features Number of Features Iteration Mtry Prediction Error R^2 Cor(y,yhat) cor(y,yhat)
# DWT 184 5 10 92.09522 -0.002174842 0.09527017 0.096
# PAM DWT 23 1 11.5 92.47729 -0.006332565 0.06393301 0.064
# Gene DWT 23 1 11.5 93.14885 -0.01364039 0.05138173 0.051
# Raw 5 1 2.5 91.41068 0.00527421 0.1105522 0.111
# Independent Onehot 24 1 12 90.7956 0.01196749 0.134344 0.134
# Dependent Onehot 384 2 93.5 82.48789 0.1023715 0.3220053 0.322
# Onehot (Ind+Dep) 408 3 63.5 82.46503 0.1026203 0.3223253 0.322
# Tensors 1240 4 102 83.49506 0.09141151 0.3038641 0.304
# Raw + Onehot 413 4 43 82.06042 0.1089794 0.3329601 0.333
# Raw + Tensor 1245 3 153.5 83.0586 0.09814107 0.3144827 0.314
# Raw + DWT 189 3 19.5 90.18321 0.02078131 0.1574552 0.157
# Onehot + Tensor 1648 4 117 81.59929 0.1139864 0.3389855 0.339
# Onehot + DWT 592 4 69.5 83.3466 0.09302715 0.3054714 0.305
# Tensor + DWT 1424 4 159 84.46883 0.08081508 0.2852729 0.285
# Raw + Onehot + Tensor 1653 3 225 81.40193 0.1161293 0.3408284 0.341
# Raw + Onehot + DWT 597 3 87 82.42158 0.1050579 0.3245071 0.325
# Raw + Tensor + DWT 1429 3 238 83.87865 0.08723739 0.2957214 0.296
# Onehot + Tensor + DWT 1832 5 111 81.73391 0.1125246 0.3355977 0.336
# All Features [subset matrix] 1837 5 139.5 82.8149 0.09881302 0.3143296 0.314
### output of same subsets but only cas9 (sampled to 10,000 rows)
# Features Number of Features Iteration Mtry Prediction Error R^2 Cor(y,yhat) cor(y,yhat)
# DWT 184 4 18 107.3934 0.02450032 0.1737309 0.174
# PAM DWT 23 4 5 108.8916 0.01089157 0.1551357 0.155
# Gene DWT 23 2 5.5 110.7235 -0.005747785 0.1195365 0.12
# Raw 5 1 2.5 108.0135 0.01886812 0.154704 0.155
# Independent Onehot 24 1 12 106.0588 0.03662277 0.203435 0.203
# Dependent Onehot 384 2 126.5 86.01369 0.2187014 0.4676271 0.468
# Onehot (Ind+Dep) 408 5 48.5 86.41148 0.2150881 0.4644804 0.464
# Tensors 1240 5 132 88.51847 0.1959494 0.4456127 0.446
# Raw + Onehot 413 2 120.5 85.46009 0.2237299 0.4729325 0.473
# Raw + Tensor 1245 4 171.5 88.52899 0.1958538 0.445104 0.445
# Raw + DWT 189 5 15.5 104.2253 0.05327758 0.2391421 0.239
# Onehot + Tensor 1648 5 151 84.59786 0.231562 0.4818554 0.482
# Onehot + DWT 592 4 89.5 87.64419 0.2038909 0.4517864 0.452
# Tensor + DWT 1424 5 185 90.28895 0.1798674 0.4245436 0.425
# Raw + Onehot + Tensor 1653 5 145.5 84.68096 0.2308071 0.4816229 0.482
# Raw + Onehot + DWT 597 3 116.5 87.28943 0.2071133 0.4554919 0.455
# Raw + Tensor + DWT 1429 5 175.5 89.21264 0.189644 0.4362274 0.436
# Onehot + Tensor + DWT 1832 4 248 86.11713 0.2177618 0.4676488 0.468
# Raw + Onehot + Tensor + DWT 1837 4 270.5 86.39748 0.2152153 0.465161 0.465
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J GBR.test
#SBATCH -N 1
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
# SLURM batch job: activate the conda environment and run the GBR R script
# from the e.coli project directory.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Abort if the project directory is unavailable so R CMD BATCH does not run
# against the wrong working directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ || {
  echo "GBR.test: cannot cd to project directory" >&2
  exit 1
}
R CMD BATCH GBR.test.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/GBR.test.sh
# salloc -A SYB105 -N 2 -t 4:00:00 -p gpu
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Gradient-boosted regression (caret method "gbm") on the Cas9-only matrix.
library(dplyr)
library(caret)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
df <- read.delim("ecoli.20sliding.dwtexact.all.features.dcast.26oct.cas9.header.txt", header=T, sep="\t")
## run GBR on just the raw (gc content, melting temp, structure) + one-hot (all) to try and replicate the current published model [0.54]
# Column 2 is the response (cut.score); the remaining indices select columns
# 1642:1644 plus the one-hot column ranges.
data <- df[,c(2,1642:1644,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655)]
# Seeded split: 75% of rows for training, the rest held out for testing.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
# Separate seed for the resampling/fit step.
set.seed(825)
gbmFit1 <- train(cut.score ~ ., data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
# Print the resampling summary (recorded below).
gbmFit1
# Stochastic Gradient Boosting
#
# 30352 samples
# 411 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 24282, 24281, 24281, 24282, 24282, 24282, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.857762 0.1634593 8.131042
# 1 100 9.601999 0.1935950 7.869301
# 1 150 9.463976 0.2077306 7.720368
# 2 50 9.587247 0.1965702 7.856366
# 2 100 9.355660 0.2222891 7.607528
# 2 150 9.238397 0.2372502 7.481856
# 3 50 9.447621 0.2124407 7.709178
# 3 100 9.232704 0.2392526 7.479256
# 3 150 9.127956 0.2527890 7.366385
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
head(summary(gbmFit1))
# var rel.inf
# p20.TsgRNA.raw p20.TsgRNA.raw 9.122611
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 8.955031
# p19.GGsgRNA.raw p19.GGsgRNA.raw 5.878595
# p20.AsgRNA.raw p20.AsgRNA.raw 5.360743
# p18.GsgRNA.raw p18.GsgRNA.raw 4.816805
# p18.CCsgRNA.raw p18.CCsgRNA.raw 4.667845
# Evaluate on the held-out rows: drop test rows with missing values, predict
# with the fitted model, and correlate predicted with observed cut scores.
test.df <- na.omit(testing)
pred <- predict(gbmFit1, newdata = test.df)
pred.df <- data.frame(pred.score = pred, exp.score = test.df$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.5090717
#### when run with first 10,000 rows --> 0.5311444 ... with iRF --> 0.488
## run GBR on all features (minus DWT)
# Columns 2:1655: cut.score (column 2) plus every predictor before the DWT
# block. Reuses `df` loaded earlier in the session and overwrites data,
# training, testing, fitControl and gbmFit1.
data <- df[,2:1655]
# Same seeds and 75/25 split as the previous run.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score ~ ., data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
# Print the resampling summary (recorded below).
gbmFit1
# Stochastic Gradient Boosting
#
# 30352 samples
# 1653 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 24282, 24281, 24281, 24282, 24282, 24282, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.781608 0.1618878 8.046404
# 1 100 9.554316 0.1934875 7.813870
# 1 150 9.429544 0.2086376 7.679018
# 2 50 9.544578 0.1968047 7.806329
# 2 100 9.329932 0.2236894 7.575133
# 2 150 9.219098 0.2391426 7.457800
# 3 50 9.418402 0.2138478 7.673794
# 3 100 9.214167 0.2408310 7.453793
# 3 150 9.110381 0.2550280 7.342311
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
head(summary(gbmFit1))
# var rel.inf
# p20homo_lumo_energygapraw p20homo_lumo_energygapraw 11.920842
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 7.118342
# p19.GGsgRNA.raw p19.GGsgRNA.raw 6.611344
# p18xz_quadrupoleraw p18xz_quadrupoleraw 4.710569
# p15.CCsgRNA.raw p15.CCsgRNA.raw 3.834847
# CCsgRNA.raw CCsgRNA.raw 3.514927
# Evaluate on the held-out rows: drop test rows with missing values, predict
# with the fitted model, and correlate predicted with observed cut scores.
test.df <- na.omit(testing)
pred <- predict(gbmFit1, newdata = test.df)
pred.df <- data.frame(pred.score = pred, exp.score = test.df$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.5061305
## run GBR on all features (including DWT)
# Column 2 (cut.score) through the last column; only the sgRNAID column is
# left out. Reuses `df` loaded earlier in the session and overwrites data,
# training, testing, fitControl and gbmFit1.
data <- df[,2:ncol(df)]
# Same seeds and 75/25 split as the previous runs.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score ~ ., data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
# Print the resampling summary (recorded below).
gbmFit1
# Stochastic Gradient Boosting
#
# 30352 samples
# 1837 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 24282, 24281, 24281, 24282, 24282, 24282, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.758421 0.1724696 8.025384
# 1 100 9.498191 0.2093477 7.760344
# 1 150 9.351016 0.2264783 7.603725
# 2 50 9.494041 0.2122208 7.758401
# 2 100 9.241511 0.2419625 7.488522
# 2 150 9.117110 0.2588154 7.358337
# 3 50 9.352195 0.2308743 7.611062
# 3 100 9.114351 0.2603672 7.356772
# 3 150 9.003742 0.2754201 7.243558
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
head(summary(gbmFit1))
# var rel.inf
# p20homo_lumo_energygapraw p20homo_lumo_energygapraw 11.712740
# p19.GGsgRNA.raw p19.GGsgRNA.raw 6.271819
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 5.926826
# p15.CCsgRNA.raw p15.CCsgRNA.raw 3.494608
# p20homo_energyraw p20homo_energyraw 3.249573
# CCsgRNA.raw CCsgRNA.raw 3.109106
# Evaluate on the held-out rows: drop test rows with missing values, predict
# with the fitted model, and correlate predicted with observed cut scores.
test.df <- na.omit(testing)
pred <- predict(gbmFit1, newdata = test.df)
pred.df <- data.frame(pred.score = pred, exp.score = test.df$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.5257828
Binary encoding of presence/absence per bp, followed by a Haar wavelet transform. References: https://journals.plos.org/plosone/article/figures?id=10.1371/journal.pone.0157243 https://www.rdocumentation.org/packages/wmtsa/versions/2.0-3/topics/wavMODWT https://rdrr.io/cran/wmtsa/man/wavDaubechies.html
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J pam.wavelets.test
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
#SBATCH -o pam.modwt-%j.o
#SBATCH -e pam.modwt-%j.e
# SLURM batch job: activate the conda environment and run the three PAM
# wavelet R scripts in sequence from the e.coli project directory.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Abort if the project directory is unavailable so R CMD BATCH does not run
# against the wrong working directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ || {
  echo "pam.wavelets.test: cannot cd to project directory" >&2
  exit 1
}
R CMD BATCH pam.wavelets.test.R
R CMD BATCH pam.wavelets.counts.R
R CMD BATCH pam.wavelets.smooth.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/pam.wavelets.test.sh
# Interactive session used for the steps below:
# salloc -A SYB105 -p gpu -N 1 -t 1:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Drop the first line of the 20-bp sliding-window file and emit, for every
# remaining line, a tab-separated record "col1, col2, col2" (start == end).
awk 'BEGIN { OFS = "\t" } NR > 1 { print $1, $2, $2 }' ecoli.20bp.sliding.bed \
  > ecoli.1bp.window.bed
# Report every overlap between those positions and the sorted NGG PAM sites.
bedtools intersect -wo \
  -a ecoli.1bp.window.bed \
  -b NGG.PAM.sorted.bed \
  > NGG.PAM.1bp.windows.bed
# R
library(dplyr)
library(reshape2)
library(tidyr)
library(wmtsa)
library(data.table)
library(prospectr)
# PAM counts per 20-bp sliding window -> Haar MODWT -> long-format table
# (label, pam.dwt, scale, window) written to pam.modwt.counts.haar.txt.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
pam    <- read.table("NGG.PAM.20bp.sliding.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
window <- read.table("ecoli.20bp.sliding.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
score  <- read.table("sgRNA.coord.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
names(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[, c(1:4, 8)]  # chr, start, end, sgRNA, cut.score

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
# Count the rows of `pam` that share each (V1, V2, V3) window.
pam.bin <- pam %>%
  group_by(V1, V2, V3) %>%
  mutate(pam.count = n())
pam.count <- unique(pam.bin[, c(1:3, 12)])

# Attach the counts to every window; windows without a match get 0.
window.v <- window[, 1:3]
names(window.v) <- c("V1", "V2", "V3")
pam.win <- left_join(window.v, pam.count, by = c("V1", "V2", "V3"))
pam.win[is.na(pam.win)] <- 0
pam.df <- pam.win[["pam.count"]]

pam.modwt    <- wavMODWT(pam.df, wavelet = "haar")
pam.modwt.df <- as.matrix(pam.modwt)
# The coefficient row names are split on runs of non-alphanumeric characters
# into two new columns, which are then named "scale" and "window".
pam.modwt.label <- data.frame(label = row.names(pam.modwt.df), pam.modwt.df)
pam.modwt.dt    <- as.data.table(pam.modwt.label)
pam.modwt.name  <- pam.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(pam.modwt.name) <- c("label", "pam.dwt", "scale", "window")
write.table(pam.modwt.name, "pam.modwt.counts.haar.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# PAM presence/absence per 1-bp position -> MODWT with three wavelets
# (s8, haar, d2), each written as a long-format table
# (label, pam.dwt, scale, window).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
pam <- read.table("NGG.PAM.1bp.windows.bed", header=F, sep="\t", stringsAsFactors = F)
window <- read.table("ecoli.1bp.window.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
# Count the rows of `pam` that share each (V1, V2, V3) position.
pam.bin <- pam %>% group_by(V1, V2, V3) %>% mutate(pam.count = n())
pam.count <- unique(pam.bin[,c(1:3,12)])
window.v <- window[,1:3]
colnames(window.v) <- c("V1", "V2", "V3")
pam.win <- left_join(window.v, pam.count, by=c("V1", "V2", "V3"))
pam.win[is.na(pam.win)] <- 0
# Collapse counts to a 0/1 indicator and save it (V1, V2, V3, pam.binary).
pam.win.binary <- pam.win %>% mutate(pam.binary = ifelse(pam.count == 0, 0, 1))
pam.df <- pam.win.binary$pam.binary
write.table(pam.win.binary[,c(1:3,5)], "NGG.PAM.1bp.windows.count.bed", quote=F, row.names=F, sep="\t")

# One pass per wavelet. Every pass writes `pam.modwt.name`; the earlier haar
# pass wrote `pam.count` by mistake, so pam.modwt.binary.haar.txt held window
# counts instead of wavelet coefficients.
modwt.outputs <- c(s8   = "pam.modwt.binary.s8.txt",
                   haar = "pam.modwt.binary.haar.txt",
                   d2   = "pam.modwt.binary.d2.txt")
for (wavelet.name in names(modwt.outputs)) {
  pam.modwt <- wavMODWT(pam.df, wavelet=wavelet.name)
  pam.modwt.df <- as.matrix(pam.modwt)
  pam.modwt.label <- data.frame(label = row.names(pam.modwt.df), pam.modwt.df)
  pam.modwt.dt <- as.data.table(pam.modwt.label)
  # Split the row label on non-alphanumeric runs into the scale/window columns.
  pam.modwt.name <- pam.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
  colnames(pam.modwt.name) <- c("label", "pam.dwt", "scale", "window")
  write.table(pam.modwt.name, modwt.outputs[[wavelet.name]], quote=F, row.names=F, sep="\t")
}
# Savitzky-Golay smoothing (m = 1, p = 3, w = 21) of the current `pam.df`
# signal, followed by s8 and haar MODWTs of the smoothed series.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
library(prospectr)
pam.smooth <- savitzkyGolay(X = pam.df, m = 1, p = 3, w = 21)
write.table(pam.smooth, "NGG.PAM.1bp.windows.count.savitzkyGolay.1.3.21.txt",
            quote = FALSE, row.names = FALSE, sep = "\t")

# Output file per wavelet, processed in the order s8 then haar.
smooth.outputs <- c(s8   = "pam.savitzkyGolay.1.3.21.modwt.binary.s8.txt",
                    haar = "pam.savitzkyGolay.1.3.21.modwt.binary.haar.txt")
for (wv in names(smooth.outputs)) {
  pam.modwt       <- wavMODWT(pam.smooth, wavelet = wv)
  pam.modwt.df    <- as.matrix(pam.modwt)
  pam.modwt.label <- data.frame(label = row.names(pam.modwt.df), pam.modwt.df)
  pam.modwt.dt    <- as.data.table(pam.modwt.label)
  pam.modwt.name  <- pam.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
  colnames(pam.modwt.name) <- c("label", "pam.dwt", "scale", "window")
  write.table(pam.modwt.name, smooth.outputs[[wv]],
              quote = FALSE, row.names = FALSE, sep = "\t")
}
### Smooth the 20-bp PAM counts first, then apply the Haar MODWT.
## Signature used below: savitzkyGolay(X, m, p, w, delta.wav)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
pam    <- read.table("NGG.PAM.20bp.sliding.windows.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
window <- read.table("ecoli.20bp.sliding.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
score  <- read.table("sgRNA.coord.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
names(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[, c(1:4, 8)]

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
# Rows of `pam` per (V1, V2, V3) window; windows without a match become 0.
pam.bin <- pam %>%
  group_by(V1, V2, V3) %>%
  mutate(pam.count = n())
pam.count <- unique(pam.bin[, c(1:3, 12)])
window.v <- window[, 1:3]
names(window.v) <- c("V1", "V2", "V3")
pam.win <- left_join(window.v, pam.count, by = c("V1", "V2", "V3"))
pam.win[is.na(pam.win)] <- 0
pam.df <- pam.win[["pam.count"]]
write.table(pam.win, "NGG.PAM.20bp.sliding.windows.count.txt",
            quote = FALSE, row.names = FALSE, sep = "\t")

library(prospectr)
pam.smooth <- savitzkyGolay(X = pam.df, m = 1, p = 3, w = 21)
write.table(pam.smooth, "NGG.PAM.20bp.sliding.windows.count.savitzkyGolay.1.3.21.txt",
            quote = FALSE, row.names = FALSE, sep = "\t")

pam.modwt       <- wavMODWT(pam.smooth, wavelet = "haar")
pam.modwt.df    <- as.matrix(pam.modwt)
pam.modwt.label <- data.frame(label = row.names(pam.modwt.df), pam.modwt.df)
pam.modwt.dt    <- as.data.table(pam.modwt.label)
pam.modwt.name  <- pam.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(pam.modwt.name) <- c("label", "pam.dwt", "scale", "window")
write.table(pam.modwt.name, "pam.savitzkyGolay.1.3.21.modwt.haar.txt",
            quote = FALSE, row.names = FALSE, sep = "\t")
## make a plot of the counts and smoothed data together
library(prospectr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/modwt")
counts <- read.delim("NGG.PAM.20bp.sliding.windows.count.bed", header=T, sep="\t")
smooth <- read.delim("NGG.PAM.20bp.sliding.windows.count.savitzkyGolay.1.3.21.txt", header=T, sep="\t")

# Window index of every row. savitzkyGolay() returns fewer points than it is
# given (the filter trims both ends), so row i of `smooth` belongs to window
# i + sg.offset. The offset is derived from the row counts of the two files,
# and is 0 if they happen to have the same length.
counts$window <- seq_len(nrow(counts))
sg.offset <- (nrow(counts) - nrow(smooth)) %/% 2
smooth$window <- seq_len(nrow(smooth)) + sg.offset

# Plot the same genomic windows in both panels.
plot.windows <- 2320800:2320900
counts.sub <- counts[counts$window %in% plot.windows, ]
smooth.sub <- smooth[smooth$window %in% plot.windows, ]
opar <- par(no.readonly = TRUE)
par(mfrow = c(2, 1), mar = c(4, 4, 2, 2))
matplot(counts.sub$window, counts.sub$pam.count, type = "l", xlim = range(plot.windows))
matplot(smooth.sub$window, smooth.sub$x, type = "l", xlim = range(plot.windows))

# Re-smooth in memory and compare raw vs. smoothed counts for the first 100
# windows. The filter output is padded with NA at both ends so that it lines up
# with `df` row for row (assigning the shorter vector directly fails).
df <- counts[, c("window", "pam.count")]
sg <- savitzkyGolay(X = df$pam.count, m = 1, p = 3, w = 21)
n.trim <- nrow(df) - length(sg)
n.lead <- n.trim %/% 2
df$pam.smooth <- c(rep(NA_real_, n.lead), sg, rep(NA_real_, n.trim - n.lead))
# `counts.smooth.sub` was previously used without being defined.
counts.smooth.sub <- df[1:100, ]
matplot(counts.smooth.sub$window, counts.smooth.sub$pam.count, type = "l")
matplot(counts.smooth.sub$window, counts.smooth.sub$pam.smooth, type = "l")
par(opar)  # restore the graphics settings saved above
# Plot the 1-bp PAM presence/absence signal and its Savitzky-Golay-smoothed
# version for the first 100 positions, one panel each.
library(prospectr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/modwt")
counts <- read.delim("NGG.PAM.1bp.windows.count.bed", header=T, sep="\t")
smooth <- read.delim("NGG.PAM.1bp.windows.count.savitzkyGolay.1.3.21.txt", header=T, sep="\t")

# savitzkyGolay() trims both ends of the series, so row i of `smooth` belongs
# to position i + sg.offset. The offset is derived from the row counts of the
# two files, and is 0 if they happen to have the same length.
counts$window <- seq_len(nrow(counts))
sg.offset <- (nrow(counts) - nrow(smooth)) %/% 2
smooth$window <- seq_len(nrow(smooth)) + sg.offset

# Select the same positions from both tables and share the x-axis range, so
# that the two panels can be compared position by position.
plot.windows <- 1:100
counts.sub <- counts[counts$window %in% plot.windows, ]
smooth.sub <- smooth[smooth$window %in% plot.windows, ]
opar <- par(no.readonly = TRUE)
par(mfrow = c(2, 1), mar = c(4, 4, 2, 2))
matplot(counts.sub$window, counts.sub$pam.binary, type = "l", xlim = range(plot.windows))
matplot(smooth.sub$window, smooth.sub$x, type = "l", xlim = range(plot.windows))
par(opar)  # restore the graphics settings saved above
##### make gene counts haar wavelet
library(tidyr)
library(wmtsa)
library(data.table)
# Gene counts per 20-bp sliding window -> Haar MODWT -> long-format table
# (label, gene.dwt, scale, window) written to gene.modwt.counts.haar.txt.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.table("ecoli.20bp.sliding.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
score  <- read.table("sgRNA.coord.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
names(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[, c(1:4, 8)]

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/counts")
gene.win <- read.table("Gene.20bp.sliding.windows.count.bed", header = TRUE, sep = "\t", stringsAsFactors = FALSE)
gene.df  <- gene.win[["gene.count"]]

# Transform and save the raw coefficient matrix, row names included.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
gene.modwt    <- wavMODWT(gene.df, wavelet = "haar")
gene.modwt.df <- as.matrix(gene.modwt)
write.table(gene.modwt.df, "gene.modwt.df.haar.txt",
            quote = FALSE, row.names = TRUE, sep = "\t")

# Reload the saved matrix (first column holds the row names) and reshape it:
# the row label is split on non-alphanumeric runs into scale and window.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
gene.modwt.df <- read.table("gene.modwt.df.haar.txt", header = TRUE, sep = "\t",
                            stringsAsFactors = FALSE, row.names = 1)
gene.modwt.label <- data.frame(label = row.names(gene.modwt.df), gene.modwt.df)
gene.modwt.dt    <- as.data.table(gene.modwt.label)
gene.modwt.name  <- gene.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gene.modwt.name) <- c("label", "gene.dwt", "scale", "window")
write.table(gene.modwt.name, "gene.modwt.counts.haar.txt",
            quote = FALSE, row.names = FALSE, sep = "\t")
################################################################################
## Wavelets Visualisation
################################################################################
# if (!requireNamespace("remotes", quietly = TRUE))
# install.packages("remotes")
# remotes::install_github("ivanek/Gviz")
library(data.table)
library(Gviz)
# Load the annotation, count and wavelet tables used for the Gviz tracks.
options(ucscChromosomeNames=FALSE)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/")
genes <- rtracklayer::import("GCF_000005845.2_ASM584v2_genomic.gene.gff", format="gff")
pamsites <- rtracklayer::import("modwt/NGG.PAM.sorted.bed", format="bed", extraCols=c("PAM"="character"))
pamcounts <- data.table::fread("modwt/NGG.PAM.20bp.sliding.windows.count.bed", col.names = c("Chr","Start","End","pam.count"))
pambinary <- data.table::fread("modwt/NGG.PAM.1bp.windows.count.bed", col.names = c("Chr","Start","End","pam.binary"))
genecounts <- data.table::fread("modwt/Gene.20bp.sliding.windows.count.bed", col.names = c("Chr","Start","End","gene.count"))
# Wavelet table to display: exactly one of the following is active.
#pamwaves <- data.table::fread("modwt/pam.density.modwt.txt")
#pamwaves <- data.table::fread("modwt/pam.modwt.counts.haar.txt")
#pamwaves <- data.table::fread("modwt/pam.savitzkyGolay.1.3.21.modwt.haar.txt")
#pamwaves <- data.table::fread("modwt/pam.modwt.binary.haar.txt")
#pamwaves <- data.table::fread("modwt/pam.modwt.binary.s8.txt")
#pamwaves <- data.table::fread("modwt/gene.density.modwt.txt")
pamwaves <- data.table::fread("modwt/gene.modwt.counts.haar.txt")
# Everything downstream reads the coefficients from a column called `pam.dwt`,
# but the gene tables store them as `gene.dwt`. Rename a single "*.dwt" column
# to `pam.dwt` so that any of the tables above can be plotted; without this,
# `pamwaves$pam.dwt` is NULL for the gene tables and abs() fails.
dwt.col <- grep("\\.dwt$", names(pamwaves), value = TRUE)
if (length(dwt.col) == 1L && dwt.col != "pam.dwt") {
  data.table::setnames(pamwaves, dwt.col, "pam.dwt")
}
pamwaves$abs.pam.dwt <- abs(pamwaves$pam.dwt)
# One table per wavelet scale that gets its own line track.
pamwaves.d1 <- subset(pamwaves, pamwaves$scale == "d1")
pamwaves.d2 <- subset(pamwaves, pamwaves$scale == "d2")
pamwaves.d4 <- subset(pamwaves, pamwaves$scale == "d4")
pamwaves.d6 <- subset(pamwaves, pamwaves$scale == "d6")
pamwaves.d8 <- subset(pamwaves, pamwaves$scale == "d8")
pamwaves.d12 <- subset(pamwaves, pamwaves$scale == "d12")
pamwaves.d20 <- subset(pamwaves, pamwaves$scale == "d20")
# Build the Gviz tracks from the first 50000 rows of each table and draw
# several track combinations.
start <- 1
end <- 50000
idx <- start:end

# Line-type DataTrack on NC_000913.3 whose points start and end at `positions`.
line.track <- function(values, positions, label) {
  DataTrack(data = values, name = label,
            genome = "eschColi_K12", chromosome = "NC_000913.3",
            start = positions, end = positions,
            type = "l")
}
# Line track of the `pam.dwt` coefficients of one scale table, placed at its
# `window` values and named "pam.dwt<scale>".
wave.track <- function(waves, scale.name) {
  line.track(waves$pam.dwt[idx], waves$window[idx], paste0("pam.dwt", scale.name))
}

gtrack    <- GenomeAxisTrack(range = pamsites[idx], littleTicks = TRUE)
pamtrack  <- AnnotationTrack(range = pamsites[idx], name = "NGG",
                             genome = "eschColi_K12", stacking = "dense")
genetrack <- AnnotationTrack(range = genes, name = "GENE",
                             genome = "eschColi_K12", stacking = "squish")

pamcounttrack  <- line.track(pamcounts$pam.count[idx],    pamcounts$Start[idx],  "pamcounts")
pambinarytrack <- line.track(pambinary$pam.binary[idx],   pambinary$Start[idx],  "pambinary")
genecounttrack <- line.track(genecounts$gene.count[idx],  genecounts$Start[idx], "genecounts")

pamd1track  <- wave.track(pamwaves.d1,  "d1")
pamd2track  <- wave.track(pamwaves.d2,  "d2")
pamd4track  <- wave.track(pamwaves.d4,  "d4")
pamd6track  <- wave.track(pamwaves.d6,  "d6")
pamd8track  <- wave.track(pamwaves.d8,  "d8")
pamd12track <- wave.track(pamwaves.d12, "d12")
pamd20track <- wave.track(pamwaves.d20, "d20")

# All seven scale tracks, in display order.
wave.tracks <- list(pamd1track, pamd2track, pamd4track, pamd6track,
                    pamd8track, pamd12track, pamd20track)

plotTracks(c(list(gtrack, genetrack, pamtrack, pamcounttrack, pambinarytrack), wave.tracks),
           from = 39000, to = 40000)
plotTracks(c(list(gtrack, genetrack, pamtrack, pamcounttrack), wave.tracks),
           from = 39000, to = 40000)
plotTracks(c(list(gtrack, genetrack, genecounttrack), wave.tracks),
           from = 1, to = 500)
plotTracks(c(list(gtrack, genetrack), wave.tracks),
           from = 39000, to = 40000)
plotTracks(c(list(gtrack, genetrack, pamtrack, pamcounttrack), wave.tracks),
           from = 1, to = 500)
# Reduced view: no axis, scales d1/d4/d6/d8/d12 only.
plotTracks(list(genetrack, pamtrack, pamd1track, pamd4track, pamd6track, pamd8track, pamd12track),
           from = 1, to = 500)
# Heatmap of the coefficients for scales d1..d12 (one heatmap row per scale).
wave.scales <- paste0("d", 1:12)
# Creates pamwaves.d1 ... pamwaves.d12, one table per scale.
for (sc in wave.scales) {
  assign(paste0("pamwaves.", sc), subset(pamwaves, pamwaves$scale == sc))
}
# Scales as rows, positions as columns.
scale.columns <- lapply(wave.scales, function(s) get(paste0("pamwaves.", s))$pam.dwt)
pamwavesbinhaar.dat <- t(do.call(cbind, unname(scale.columns)))
rownames(pamwavesbinhaar.dat) <- wave.scales
#pamwavesbinhaar.heatmap <- DataTrack(start=start:end, width=1, data=pamwavesbinhaar.dat,chromosome="NC_000913.3", genome="eschColi_K12", name="bin haar", type=c("heatmap"), gradient=c("maroon", "white","blue4"))
pamwavesbinhaar.heatmap <- DataTrack(
  start = pamwaves.d1$window, width = 1,
  data = pamwavesbinhaar.dat,
  chromosome = "NC_000913.3", genome = "eschColi_K12",
  name = "bin haar", type = c("heatmap"),
  gradient = c("maroon", "white", "blue4")
)

plotTracks(list(gtrack, genetrack, genecounttrack, pamwavesbinhaar.heatmap), from = 1, to = 500)
plotTracks(pamwavesbinhaar.heatmap)
plotTracks(pamwavesbinhaar.heatmap, from = 39000, to = 40000)
plotTracks(list(gtrack, genetrack, pamtrack, pamcounttrack,
                pamd1track, pamd2track, pamd4track, pamd6track, pamd8track, pamd12track,
                pamwavesbinhaar.heatmap),
           from = 39000, to = 40000)
# Same reduced track set at two different regions.
reduced.tracks <- list(genetrack, pamtrack, pamd1track, pamd4track, pamd6track,
                       pamd8track, pamd12track, pamwavesbinhaar.heatmap)
plotTracks(reduced.tracks, from = 1, to = 500)
plotTracks(reduced.tracks, from = 10000, to = 11000)
# salloc -A SYB105 -N 2 -p gpu -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(tidyr)
library(data.table)
# Build the per-sgRNA feature table from the Haar MODWT coefficients of the
# window whose coordinates match the sgRNA exactly:
#   1. long table  -> ecoli.20sliding.exact.pam.gene.HAAR.txt
#   2. wide table  -> ecoli.20sliding.exact.pam.gene.HAAR.dcast.txt
#      (one row per sgRNA/cut.score, one column per feature x scale)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
gene.modwt.name <- read.table("gene.modwt.counts.haar.txt", header=T, sep="\t", stringsAsFactors = F)
pam.modwt.name <- read.table("pam.modwt.counts.haar.txt", header=T, sep="\t", stringsAsFactors = F)
gene.modwt.name$window <- as.integer(gene.modwt.name$window)
pam.modwt.name$window <- as.integer(pam.modwt.name$window)

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
colnames(window) <- c("chr", "start", "end")
# Window index = row number minus one.
# NOTE(review): presumably this matches the index parsed from the MODWT row
# labels - confirm against the wavelet tables.
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
window$start <- as.numeric(window$start)
# End coordinate is reduced by one before joining on (chr, start, end).
window$end <- as.numeric(window$end - 1)
library(tidygenomics)

# sgRNA -> window (exact coordinate match) -> PAM and gene coefficients.
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
window.score.df$window <- as.integer(window.score.df$window)
window.score.pam <- left_join(window.score.df, pam.modwt.name, by="window")
window.score.pam.gene <- left_join(window.score.pam, gene.modwt.name, by=c("window", "scale"))
# Keep rows that have a cut score. (The previous `cut.score != "NA"` filter
# only worked because comparing NA yields NA, which subset() drops.)
window.score.pam.gene.sgRNA <- window.score.pam.gene[!is.na(window.score.pam.gene$cut.score), ]
#931362
write.table(window.score.pam.gene.sgRNA, "ecoli.20sliding.exact.pam.gene.HAAR.txt", quote=F, row.names=F, sep="\t")

# Long -> wide. Columns are picked by name rather than position, and the
# reshape2 functions are called explicitly: with data.table attached, melt()
# and dcast() on a plain data.frame only reach reshape2 through data.table's
# deprecated redirect.
feature.cols <- c("sgRNA", "cut.score", "pam.dwt", "scale", "gene.dwt")
df.melt <- reshape2::melt(window.score.pam.gene.sgRNA[, feature.cols], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# Feature name = variable name immediately followed by the scale.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- reshape2::dcast(df.id, sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40468
write.table(df.dcast.na, "ecoli.20sliding.exact.pam.gene.HAAR.dcast.txt", quote=F, row.names=F, sep="\t")
### take the mean of the 20bp region? Doesn't really make sense when I am already using 20bp count values...
# Same feature table as the exact-window version, but each sgRNA gets the mean
# coefficient (per scale) over all windows that intersect it.
# Relies on `window`, `window.score.df`, `pam.modwt.name` and `gene.modwt.name`
# from the exact-window section.
window$window <- as.integer(window$window)
# (scale, window, coefficient) attached to every window's coordinates.
window.pam <- left_join(window, pam.modwt.name[,c(3,4,2)], by="window")
window.gene <- left_join(window, gene.modwt.name[,c(3,4,2)], by="window")
window.pam.coord <- genome_intersect(window.score.df, window.pam, by=c("chr", "start", "end"))
window.gene.coord <- genome_intersect(window.score.df, window.gene, by=c("chr", "start", "end"))
window.pam.coord.mean <- window.pam.coord %>% group_by(sgRNA, scale) %>% mutate(pam.dwt.mean = mean(pam.dwt))
window.gene.coord.mean <- window.gene.coord %>% group_by(sgRNA, scale) %>% mutate(gene.dwt.mean = mean(gene.dwt))
window.pam.coord.uniq <- unique(window.pam.coord.mean[,c(2,3,4,6,10)])
window.gene.coord.uniq <- unique(window.gene.coord.mean[,c(2,3,4,6,10)])
colnames(window.pam.coord.uniq) <- c("sgRNA", "cut.score", "window", "scale", "pam.dwt.mean")
colnames(window.gene.coord.uniq) <- c("sgRNA", "cut.score", "window", "scale", "gene.dwt.mean")
window.score.pam <- left_join(window.score.df, window.pam.coord.uniq[,3:5], by="window")
window.score.pam.gene <- left_join(window.score.pam, window.gene.coord.uniq[,3:5], by=c("window", "scale"))
# Keep rows that have a cut score.
window.score.pam.gene.sgRNA <- window.score.pam.gene[!is.na(window.score.pam.gene$cut.score), ]
# 932604
write.table(window.score.pam.gene.sgRNA, "ecoli.20sliding.exact.pam.gene.HAAR.mean.txt", quote=F, row.names=F, sep="\t")

# Long -> wide. The column selection was left empty (`[,c()]`), which hands
# melt() a table without the id columns it is asked for; select the id and
# value columns by name instead. reshape2 is called explicitly because the
# input is a plain data.frame.
mean.feature.cols <- c("sgRNA", "cut.score", "scale", "pam.dwt.mean", "gene.dwt.mean")
df.melt <- reshape2::melt(window.score.pam.gene.sgRNA[, mean.feature.cols], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- reshape2::dcast(df.id, sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40468
write.table(df.dcast.na, "ecoli.20sliding.exact.pam.gene.HAAR.mean.dcast.txt", quote=F, row.names=F, sep="\t")
#### Previous DWT iRF correlations
# PAM DWT = 0.155
# Gene DWT = 0.12
# salloc -A SYB105 -p gpu -N 1 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#R
library(ranger)
# Iterative random forest built on ranger.
#
# Fits up to `iter` forests. The first uses equal split-selection weights for
# all predictors; each later one re-weights the predictors by the previous
# forest's (corrected impurity) importance, with negative importances set to 0.
# Iteration stops early once fewer than max(1% of predictors, 10) predictors
# keep a positive weight.
#
# Arguments:
#   xmat           predictor table (one column per predictor)
#   y              response, bound to xmat as column "Y"
#   ntree          trees per forest
#   iter           maximum number of iterations
#   classification passed to ranger(classification = )
#   threads        passed to ranger(num.threads = )
#   alwayssplits   passed to ranger(always.split.variables = )
#   saveall        TRUE: return a list with the forest of every iteration run;
#                  FALSE: return only the forest of the last iteration run
#
# Progress and fit statistics are printed with cat() on every iteration.
iRF <- function(xmat, y, ntree=200, iter=5, classification=F, threads=1, alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal weighting per predictor
  rfs <- list()
  rf <- NULL
  # seq_len() so that iter = 0 runs no iteration (1:0 would run two).
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    mtry = 0.5*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    # Scale importances to sum(|importance|) == 1. If every importance is zero
    # the division would give NaN weights and break the stopping test below;
    # use all-zero weights instead, which triggers the early stop.
    imp <- rf$variable.importance
    total.imp <- sum(abs(imp))
    wt <- if (isTRUE(total.imp > 0)) imp / total.imp else imp * 0
    wt[is.na(wt) | wt<0] <- 0 # set negative (or undefined) weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    # Correlation is only defined for numeric predictions and response.
    if(is.numeric(rf$predictions) && is.numeric(y)) cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if(saveall) rfs[[i]] <- rf
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  # With saveall = FALSE always hand back the last forest. Previously it was
  # only returned when the early stop fired; otherwise an empty list came back.
  if(!saveall) return(rf)
  return(rfs)
}
# HAAR DWT
# iRF runs on the exact-window Haar features; the recorded console output of
# the reported iteration follows each call.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.exact.pam.gene.HAAR.dcast.txt",
                 header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.sample <- df[sample(nrow(df), size = 10000), ]
names(df.sample)
# Column positions of the two feature sets in the wide table.
gene.cols <- 3:25
pam.cols <- 26:48
# PAM DWT
iRF(xmat = df.sample[, pam.cols], y = df.sample$cut.score)
# iRF iteration 1 <-- why is iteration 1 better than 2-5?? #### but does better than the s8 wavelet did (cor=0.155)
# =================
# mtry: 11.5
# prediction error: 104.8461
# r^2: 0.04090863
# cor(y,yhat): 0.2063442
# SNPs with importance > 0: 22
iRF(xmat = df[, pam.cols], y = df$cut.score)
# iRF iteration 1 <-- full dataset still better with iteration 1 so not due to the random sample (full set s8 cor=0.17, r2=0.028)
# =================
# mtry: 11.5
# prediction error: 103.9104
# r^2: 0.05970627
# cor(y,yhat): 0.2458656
# SNPs with importance > 0: 22
# absolute value PAM DWT
iRF(xmat = abs(df.sample[, pam.cols]), y = df.sample$cut.score)
# iRF iteration 4 <-- Not as good with the absolute values (interesting)
# =================
# mtry: 8.5
# prediction error: 107.9172
# r^2: 0.01281587
# cor(y,yhat): 0.1502493
# SNPs with importance > 0: 16
# Gene DWT
iRF(xmat = df.sample[, gene.cols], y = df.sample$cut.score)
# iRF iteration 1 <-- Gene DWT does worse with haar than it did with s8... but not by much (cor=0.12)
# =================
# mtry: 11.5
# prediction error: 109.4336
# r^2: -0.00105543
# cor(y,yhat): 0.1111919
# SNPs with importance > 0: 11
# absolute value Gene DWT
iRF(xmat = abs(df.sample[, gene.cols]), y = df.sample$cut.score)
# iRF iteration 2
# =================
# mtry: 6
# prediction error: 111.3872
# r^2: -0.0189266
# cor(y,yhat): 0.1042094
# SNPs with importance > 0: 5
# both pam and gene dwt
iRF(xmat = df.sample[, c(gene.cols, pam.cols)], y = df.sample$cut.score)
#### run it on two different random samples to see how much that changes things... definitely makes a difference...
# sample set 1
# iRF iteration 2
# =================
# mtry: 13
# prediction error: 106.014
# r^2: 0.04998521
# cor(y,yhat): 0.2318749
# SNPs with importance > 0: 24
# sample set 2
# iRF iteration 4
# =================
# mtry: 9.5
# prediction error: 105.605
# r^2: 0.04521568
# cor(y,yhat): 0.2258336
# SNPs with importance > 0: 19
# HAAR mean value DWT
# iRF runs on the mean-over-intersecting-windows Haar features; the recorded
# console output of the reported iteration follows each call.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.exact.pam.gene.HAAR.mean.dcast.txt",
                 header = TRUE, sep = "\t", stringsAsFactors = FALSE)
df.sample <- df[sample(nrow(df), size = 10000), ]
# Column positions of the two feature sets in the wide table.
gene.cols <- 3:25
pam.cols <- 26:48
# PAM DWT
iRF(xmat = df.sample[, pam.cols], y = df.sample$cut.score)
# iRF iteration 2 <-- interesting that the mean haar dwt from the counts does slightly better than the single window values...
# =================
# mtry: 9.5
# prediction error: 106.2595
# r^2: 0.03909658
# cor(y,yhat): 0.2169182
# SNPs with importance > 0: 17
# absolute value PAM DWT
iRF(xmat = abs(df.sample[, pam.cols]), y = df.sample$cut.score)
# iRF iteration 1
# =================
# mtry: 11.5
# prediction error: 109.8336
# r^2: 0.00677616
# cor(y,yhat): 0.1242668
# SNPs with importance > 0: 17
# Gene DWT
iRF(xmat = df.sample[, gene.cols], y = df.sample$cut.score)
# iRF iteration 1 <-- BUT gene DWT mean haar values does WORSE??
# =================
# mtry: 11.5
# prediction error: 111.6805
# r^2: -0.009925423
# cor(y,yhat): 0.09520425
# SNPs with importance > 0: 14
# absolute value Gene DWT
iRF(xmat = abs(df.sample[, gene.cols]), y = df.sample$cut.score)
# iRF iteration 2
# =================
# mtry: 5.5
# prediction error: 112.0757
# r^2: -0.01349949
# cor(y,yhat): 0.09578725
# SNPs with importance > 0: 6
# both pam and gene dwt
iRF(xmat = df.sample[, c(gene.cols, pam.cols)], y = df.sample$cut.score)
# iRF iteration 4
# =================
# mtry: 12.5
# prediction error: 106.2645
# r^2: 0.03905116
# cor(y,yhat): 0.2131038
# SNPs with importance > 0: 19
############# MONDAY #############
# Next: calculate the Haar wavelet for the rest of the features and feed them back into the full feature matrix, then run iRF on the full matrix and on the feature-selection subsets.
# Use the count-based Haar wavelets (the combination of features appears to improve the model for these).
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J dwt.haar
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
#
# Batch job: run dwt.haar.3nov.R from the e.coli project directory inside the
# project conda environment. The job's exit status is that of R CMD BATCH.

readonly PROJECT_ROOT=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm

# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/loadcondaandes.sh"
conda activate "${PROJECT_ROOT}/Libraries/andes/envs/test"

# Abort if the working directory is unavailable; otherwise R CMD BATCH would
# run in whatever directory the job happened to start in.
cd "${PROJECT_ROOT}/projects/seed/e.coli/" || {
  printf 'cannot cd to %s\n' "${PROJECT_ROOT}/projects/seed/e.coli/" >&2
  exit 1
}

#R CMD BATCH dwt.haar.R
R CMD BATCH dwt.haar.3nov.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/dwt.haar.sh
# Interactive session: request a 4-hour allocation, activate the conda environment,
# then start R for the commands that follow.
salloc -A SYB105 -N 2 -p gpu -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Disabled one-off step: convert the ViennaRNA id file (':' and '-' separated ids) into
# a tab-delimited bed file keeping fields 1-3 and 9.
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# tr -s ':' '\t' < ecoli.20sliding.ViennaRNA.output.value.id.txt | tr -s '-' '\t' | cut -f 1-3,9 > ecoli.20sliding.ViennaRNA.output.value.id.bed
R
library(dplyr)
library(reshape2)
library(tidyr)
library(wmtsa)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Input tracks for the 20 bp sliding windows, plus the sgRNA coordinates and scores.
gatc <- read.table("ecoli.gatc.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
ipd <- read.table("ecoli.ipd.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
gene <- read.table("ecoli.gene.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.table("ecoli.20sliding.ViennaRNA.output.value.id.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.table("nucleotide_counts_20sliding_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.table("ecoli.rnaseq.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
pam <- read.table("NGG.PAM.20bp.sliding.windows.bed", header=F, sep="\t", stringsAsFactors = F)
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Keep chr, start, end, sgRNA and cut.score.
score.df <- score[,c(1:4,8)]
# Count the records that share a window (V1, V2, V3) and keep one row per window.
# The count column is selected by name instead of by position (previously 8, 14 and 12),
# so the result no longer depends on how many columns each bed file has.
gatc.bin <- gatc %>% group_by(V1, V2, V3) %>% mutate(gatc.count = n())
gatc.count <- unique(gatc.bin[,c("V1", "V2", "V3", "gatc.count")])
gene.bin <- gene %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
gene.count <- unique(gene.bin[,c("V1", "V2", "V3", "gene.count")])
pam.bin <- pam %>% group_by(V1, V2, V3) %>% mutate(pam.count = n())
pam.count <- unique(pam.bin[,c("V1", "V2", "V3", "pam.count")])
# Left-join each track onto the full window list; windows without a record get 0.
window.v <- window[,1:3]
colnames(window.v) <- c("V1", "V2", "V3")
gatc.win <- left_join(window.v, gatc.count, by=c("V1", "V2", "V3"))
gatc.win[is.na(gatc.win)] <- 0
gene.win <- left_join(window.v, gene.count, by=c("V1", "V2", "V3"))
gene.win[is.na(gene.win)] <- 0
ipd.win <- left_join(window.v, ipd, by=c("V1", "V2", "V3"))
ipd.win[is.na(ipd.win)] <- 0
rnaseq.win <- left_join(window.v, rnaseq, by=c("V1", "V2", "V3"))
rnaseq.win[is.na(rnaseq.win)] <- 0
pam.win <- left_join(window.v, pam.count, by=c("V1", "V2", "V3"))
pam.win[is.na(pam.win)] <- 0
# One vector per track, used as input to the wavelet transform.
gene.df <- gene.win$gene.count
gatc.df <- gatc.win$gatc.count
pam.df <- pam.win$pam.count
ipd.df <- ipd.win[,4]
# NOTE(review): structure and nuc are used exactly as read, without a join onto window.v;
# this assumes their rows already follow the window order -- confirm.
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq.win[,4]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
# Run the Haar MODWT on one track, reshape the coefficients into a long table and
# write that table to a tab-delimited file in the current working directory.
#   signal    - vector to transform (one of the *.df track vectors)
#   value.col - name given to the coefficient column in the output, e.g. "temp.dwt"
#   out.file  - name of the output file
# Returns (invisibly) the data.table that was written; its columns are
# label, <value.col>, scale, window.
write.modwt.haar <- function(signal, value.col, out.file)
{
  coef.mat <- as.matrix(wavMODWT(signal, wavelet="haar"))
  coef.dt <- as.data.table(data.frame(label = row.names(coef.mat), coef.mat))
  # Split every coefficient label on runs of non-alphanumeric characters into two new
  # columns; they are renamed to "scale" and "window" just below.
  coef.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
  setnames(coef.dt, c("label", value.col, "scale", "window"))
  write.table(coef.dt, out.file, quote=F, row.names=F, sep="\t")
  invisible(coef.dt)
}
# One table per track. Only the *.modwt.name objects are kept in the session; the
# intermediate transform/matrix objects now live inside the helper.
temp.modwt.name <- write.modwt.haar(temp.df, "temp.dwt", "temp.modwt.haar.txt")
gc.modwt.name <- write.modwt.haar(gc.df, "gc.dwt", "gc.modwt.haar.txt")
structure.modwt.name <- write.modwt.haar(structure.df, "structure.dwt", "structure.modwt.haar.txt")
rna.modwt.name <- write.modwt.haar(rna.df, "rna.dwt", "rnaseq.modwt.haar.txt")
ipd.modwt.name <- write.modwt.haar(ipd.df, "ipd.dwt", "ipd.modwt.haar.txt")
gene.modwt.name <- write.modwt.haar(gene.df, "gene.dwt", "gene.density.modwt.haar.txt")
gatc.modwt.name <- write.modwt.haar(gatc.df, "gatc.dwt", "gatc.density.modwt.haar.txt")
pam.modwt.name <- write.modwt.haar(pam.df, "pam.dwt", "pam.density.modwt.haar.txt")
# Re-load the per-track MODWT tables written to the modwt directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/modwt")
temp.modwt.name <- read.delim("temp.modwt.haar.txt", header=T, sep="\t", stringsAsFactors = F)
gc.modwt.name <- read.delim("gc.modwt.haar.txt", header=T, sep="\t", stringsAsFactors = F)
structure.modwt.name <- read.delim("structure.modwt.haar.txt", header=T, sep="\t", stringsAsFactors = F)
rna.modwt.name <- read.delim("rnaseq.modwt.haar.txt", header=T, sep="\t", stringsAsFactors = F)
gene.modwt.name <- read.delim("gene.density.modwt.haar.txt", header=T, sep="\t", stringsAsFactors = F)
gatc.modwt.name <- read.delim("gatc.density.modwt.haar.txt", header=T, sep="\t", stringsAsFactors = F)
ipd.modwt.name <- read.delim("ipd.modwt.haar.txt", header=T, sep="\t", stringsAsFactors = F)
pam.modwt.name <- read.delim("pam.density.modwt.haar.txt", header=T, sep="\t", stringsAsFactors = F)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
colnames(window) <- c("chr", "start", "end")
# Give every window a 0-based running index; the MODWT tables are joined on it below.
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
window$start <- as.numeric(window$start)
# The window end is shifted by -1 before joining on the sgRNA coordinates.
window$end <- as.numeric(window$end - 1)
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
window.score.df$window <- as.integer(window.score.df$window)
# Columns c(3,4,2) of each MODWT table are scale, window and the coefficient value.
# The first join is on window only (it introduces the scale column); the rest use both keys.
window.score.temp <- left_join(window.score.df, temp.modwt.name[,c(3,4,2)], by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd.pam <- left_join(window.temp.gc.structure.rna.gene.gatc.ipd, pam.modwt.name[,c(3,4,2)], by=c("window", "scale"))
nrow(window.temp.gc.structure.rna.gene.gatc.ipd.pam)
# 1293773
# Drop rows without a cut score. is.na() states the intent directly; the former
# comparison against the string "NA" only worked because subset() discards NA conditions.
window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd.pam, !is.na(cut.score))
# Report the size of the FILTERED table (this used to re-print the unfiltered one;
# the count recorded below comes from that earlier, unfiltered call).
nrow(window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA)
# 1293773
write.table(window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA, "ecoli.20sliding.exact.DWT.haar.txt", quote=F, row.names=F, sep="\t")
# Long format (sgRNA, cut.score, scale + the eight DWT columns), then wide again with
# one column per feature/scale combination, averaging duplicate entries.
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.pam.sgRNA[,c(4,5,7:15)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
nrow(df.dcast.na)
# 40468
write.table(df.dcast.na, "ecoli.20sliding.exact.DWT.haar.dcast.txt", quote=F, row.names=F, sep="\t")
# combine regional DWT with other features
# Reads the exact-window DWT table, prefixes its feature columns with "sgRNA_", joins
# them onto the existing all-features table by sgRNA and ID, and writes the result.
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast.na <- read.delim("ecoli.20sliding.exact.DWT.haar.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# Split the combined sgRNA identifier at "_" into sgRNA and ID.
df.dcast.sep <- df.dcast.na %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
# Columns 1-3 are sgRNA, ID and cut.score; everything from column 4 on is a DWT feature.
df.dcast.dwt <- df.dcast.sep[,c(4:ncol(df.dcast.sep))]
colnames(df.dcast.dwt) <- paste0('sgRNA_', colnames(df.dcast.dwt))
df.dcast <- cbind(df.dcast.sep[,1:3], df.dcast.dwt)
df <- read.delim("ecoli.20sliding.pam.all.features.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# NOTE(review): hard-coded column ranges; which features columns 1650-2132 hold is not
# visible here -- confirm against the header of the input file.
df <- df[,c(1:1649,2133:2138)]
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep)
# 118140
ncol(df.sep)
# 1657
# Column 3 (cut.score) of df.dcast is left out of the join. ncol(df.dcast.sep) equals
# ncol(df.dcast) here because df.dcast was built from all columns of df.dcast.sep.
df.sep.region <- inner_join(df.sep, df.dcast[,c(1,2,4:ncol(df.dcast.sep))], by=c("sgRNA", "ID"))
df.sep.region.id <- df.sep.region %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep.region.id)
# 118140
ncol(df.sep.region.id)
# 1839
write.table(df.sep.region.id, "ecoli.20sliding.dwtexact.haar.features.dcast.txt", quote=F, row.names=F, sep="\t")
–> test features in iRF
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J iRF.haar.test
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
# Batch job: build the Cas9-only feature table (header line + rows containing "_Cas9")
# and run the iRF test script on it.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Stop the job if the project directory is unreachable; every path below is relative.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ || exit 1
grep '_Cas9' ecoli.20sliding.dwtexact.haar.features.dcast.txt > ecoli.20sliding.dwtexact.haar.features.dcast.cas9.txt
# head stops after the first line instead of scanning the whole table as awk did.
head -n 1 ecoli.20sliding.dwtexact.haar.features.dcast.txt > ecoli.20sliding.dwtexact.haar.features.dcast.header.txt
cat ecoli.20sliding.dwtexact.haar.features.dcast.header.txt ecoli.20sliding.dwtexact.haar.features.dcast.cas9.txt > ecoli.20sliding.dwtexact.haar.features.dcast.cas9.header.txt
R CMD BATCH iRF.haar.test.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.haar.test.sh
–> add in raw counts for the sgRNA 20bp bin for gene, gatc, and pam
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# (the lines from here on are R, to be run inside an R session)
library(dplyr)
library(tidyr)
library(data.table)
# melt()/dcast() below are applied to plain data.frames; load reshape2 explicitly so
# this section does not depend on a package left attached by an earlier section.
library(reshape2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(1:4,8)]
# Per-window counts of GATC sites, genes and NGG PAMs.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/counts")
gatc <- read.table("GATC.20bp.sliding.windows.count.bed", header=T, sep="\t", stringsAsFactors = F)
gene <- read.table("Gene.20bp.sliding.windows.count.bed", header=T, sep="\t", stringsAsFactors = F)
pam <- read.table("NGG.PAM.20bp.sliding.windows.count.bed", header=T, sep="\t", stringsAsFactors = F)
colnames(gatc) <- c("chr", "start", "end", "gatc.count")
colnames(gene) <- c("chr", "start", "end", "gene.count")
colnames(pam) <- c("chr", "start", "end", "pam.count")
# Shift the count-table starts by +1 before joining on the sgRNA coordinates
# (presumably 0-based bed starts vs 1-based sgRNA starts -- confirm).
gatc$start <- gatc$start + 1
gene$start <- gene$start + 1
pam$start <- pam$start + 1
score.gatc <- left_join(score.df, gatc, by=c("chr", "start", "end"))
score.gatc.gene <- left_join(score.gatc, gene, by=c("chr", "start", "end"))
score.gatc.gene.pam <- left_join(score.gatc.gene, pam, by=c("chr", "start", "end"))
#### add ViennaRNA instead of iFeature
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
viennarna <- read.delim("ecoli.gRNA.ViennaRNA.output.value.id.txt", header=F, sep="\t", stringsAsFactors = F)
colnames(viennarna) <- c("sgRNA", "vienna.rna")
score.gatc.gene.pam.vienna <- left_join(score.gatc.gene.pam, viennarna, by=c("sgRNA"))
# A constant scale of 0 gives the raw features names of the form <feature>0 after unite().
score.gatc.gene.pam.vienna$scale <- 0
# Columns 4:10 are sgRNA, cut.score, the three counts, vienna.rna and scale.
df.melt <- melt(score.gatc.gene.pam.vienna[,4:10], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40468
write.table(df.dcast.na, "ecoli.sgRNA.counts.dcast.txt", quote=F, row.names=F, sep="\t")
## combine raw with dwt
# Joins one column of the raw sgRNA-window table onto the DWT feature table by sgRNA
# and ID, then writes a Cas9-only subset and the full (all Cas types) table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.sgRNA.counts.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep <- df.dcast %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
# NOTE(review): after separate() the columns are sgRNA, ID, cut.score, then the raw
# features, so c(1,2,4) keeps only the FIRST feature column (presumably gatc.count0 if
# dcast orders them alphabetically). The recorded column count rising from 1839 to 1840
# agrees with a single added column -- confirm that leaving out the others is intended.
df.dcast.sep <- df.dcast.sep[,c(1,2,4)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
df.location <- inner_join(df.sep, df.dcast.sep, by=c("sgRNA", "ID"))
# test <- df.location %>% unite(sgRNAID, c("sgRNA", "ID"), sep="_")
# test.cas9 <- test[1:40468,]
df.location.id <- df.location %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.location.id)
# 118140
ncol(df.location.id)
# 1840
# NOTE(review): the Cas9 subset is taken by position (first 40468 rows), which assumes
# all Cas9 rows come first in the joined table -- verify, e.g. against the sgRNAID suffix.
df.cas9 <- df.location.id[1:40468,]
nrow(df.cas9)
# 40468
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# The same Cas9 table is written under two file names.
write.table(df.cas9, "ecoli.20sliding.dwtexact.haar.features.counts.dcast.txt", quote=F, row.names=F, sep="\t")
write.table(df.cas9, "ecoli.20sliding.dwtexact.haar.features.counts.cas9.dcast.txt", quote=F, row.names=F, sep="\t")
write.table(df.location.id, "ecoli.20sliding.dwtexact.haar.features.counts.allcas.dcast.txt", quote=F, row.names=F, sep="\t")
–> remove highly correlated features
# python
# Drop one feature from every pair whose pairwise correlation is >= 0.9 and write the
# remaining columns to a comma-separated file.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.dwtexact.haar.features.counts.dcast.txt')
# Drop the first two columns and the last one before computing correlations.
data = data.iloc[:,2:-1]
# NOTE(review): the first remaining column is label-encoded; confirm this is intended
# for a feature column rather than a leftover from a classification example.
label_encoder = LabelEncoder()
data.iloc[:,0] = label_encoder.fit_transform(data.iloc[:,0]).astype('float64')
corr = data.corr()
# Plain ndarray: element access in the double loop is far cheaper than corr.iloc[i, j].
corr_values = corr.values
n_features = corr_values.shape[0]
# columns[j] stays True while feature j is kept.
columns = np.full((n_features,), True, dtype=bool)
for i in range(n_features):
    for j in range(i+1, n_features):
        # Feature j is dropped when it correlates at >= 0.9 with any earlier feature i.
        if corr_values[i,j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = data.columns[columns]
data = data[selected_columns]
data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.dwtexact.haar.counts.features.cas9.pythoncorrelation.txt")
# R
# Subset the feature table to the columns kept by the python correlation filter and
# write the matrices used for the iRF runs.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The header file read below is produced by this (currently disabled) head command:
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.dwtexact.haar.counts.features.cas9.pythoncorrelation.txt > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.dwtexact.haar.counts.features.cas9.pythoncorrelation.header.txt
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
# Read the kept column names as plain strings (never factors) ...
df.noncor <- read.delim("ecoli.20sliding.dwtexact.haar.counts.features.cas9.pythoncorrelation.header.txt", header=F, sep=",", stringsAsFactors = F)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
# NOTE(review): this input name differs from the file the python filter read
# (ecoli.20sliding.dwtexact.haar.features.counts.dcast.txt) -- confirm it is the intended table.
df <- read.delim("ecoli.20sliding.features.counts.dcast.txt", header=T, sep="\t")
# ... and flatten the one-row data.frame into a character vector, so %in% compares
# column names with column names instead of relying on how a data.frame is coerced.
noncor.names <- as.character(unlist(df.noncor[1,], use.names=FALSE))
df.subset <- df[ , which(names(df) %in% noncor.names)]
df.mat <- as.matrix(df.subset)
# NOTE(review): column 633 is dropped by position; the reason is not recorded here.
df.mat.2 <- as.data.frame(df.mat[,c(1:632,634:ncol(df.mat))])
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat.2)
ncol(df.mat.id)
# 787
write.table(df.mat.id, "matrix.haar.counts.noncor.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "matrix.haar.counts.noncor.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.2, "matrix.haar.counts.noncor.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# Same features with the id and score columns in front.
sgRNA.score <- as.data.frame(df[,1:2])
colnames(sgRNA.score) <- c("sgRNAID", "cut.score")
df.subset.id <- cbind(sgRNA.score, df.mat.2)
write.table(df.subset.id, "ecoli.20sliding.dwtexact.haar.counts.features.cas9.noncorrelated.txt", quote=F, row.names=F, sep="\t")
write.table(df.subset.id[,c(1,3:ncol(df.subset.id))], "matrix.haar.counts.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.subset.id[,c(1,3:ncol(df.subset.id))], "matrix.haar.counts.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.subset.id[,3:ncol(df.subset.id)], "matrix.haar.counts.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
–> run iRF (R based 10,000 sample test)
# salloc -A SYB105 -p gpu -N 1 -t 4:00:00
# Prepare the Cas9-only table for the R test: header line first, then every row that
# contains "_Cas9", then start R.
# NOTE(review): the input is named ...haar.counts.features.dcast.txt, while the tables
# written earlier in this notebook are named ...haar.features.counts.dcast.txt -- confirm.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
head -n 1 ecoli.20sliding.dwtexact.haar.counts.features.dcast.txt > ecoli.20sliding.dwtexact.haar.counts.features.header.txt
grep '_Cas9' ecoli.20sliding.dwtexact.haar.counts.features.dcast.txt > ecoli.20sliding.dwtexact.haar.counts.features.cas9.txt
cat ecoli.20sliding.dwtexact.haar.counts.features.header.txt ecoli.20sliding.dwtexact.haar.counts.features.cas9.txt > ecoli.20sliding.dwtexact.haar.counts.features.cas9.header.txt
R
library(ranger)
# Iterative random forest: fit ranger repeatedly, each time re-weighting the features
# by the (corrected impurity) importance from the previous fit.
#   xmat           - feature table/matrix, one column per feature
#   y              - response vector, bound to xmat as column "Y"
#   ntree          - number of trees per forest
#   iter           - maximum number of iterations
#   classification - passed to ranger; FALSE prints r^2, TRUE prints the confusion matrix
#   threads        - passed to ranger as num.threads
#   alwayssplits   - passed to ranger as always.split.variables
#   saveall        - TRUE: return a list with the forest of every iteration;
#                    FALSE: return only the forest of the last iteration that ran
# Iteration stops early once fewer than max(1% of the features, 10) features keep a
# positive weight.
iRF <- function(xmat, y, ntree=200, iter=5, classification=F, threads=1, alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal sample weighting per SNP
  rfs <- list()
  # seq_len() runs zero iterations for iter = 0 (1:iter would count 1, 0).
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # NOTE(review): mtry can be fractional (e.g. 9.5 in the recorded logs); confirm how
    # ranger handles a non-integer mtry.
    mtry = 0.5*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    imp.total <- sum(abs(rf$variable.importance))
    if(is.finite(imp.total) && imp.total > 0)
    {
      wt <- rf$variable.importance / imp.total # scale importance to range(0,1)
    } else {
      # No feature carries any importance: dividing by zero would give NaN weights and
      # make the stop test below fail, so use all-zero weights (which triggers the stop).
      wt <- rep(0, length(rf$variable.importance))
    }
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    # With saveall=FALSE keep the latest forest on every iteration, so a run that
    # never hits the early stop still returns a forest instead of an empty list.
    if(saveall) rfs[[i]] <- rf else rfs <- rf
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  return(rfs)
}
# Compare iRF on three Cas9 feature tables, each on a random subsample of up to 10,000
# rows. The commented numbers are console output recorded from earlier, unseeded runs;
# the seeds added below make re-runs repeatable but will not reproduce them exactly.
# all features
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.counts.features.cas9.header.txt", header=T, sep="\t", stringsAsFactors = F)
# Same seed before every subsample: tables with the same number of rows get the same
# row indices. min() keeps sample() from failing on a table with fewer than 10,000 rows.
set.seed(123)
df.sample <- df[sample(nrow(df), min(10000, nrow(df))), ]
iRF(df.sample[,3:ncol(df.sample)], df.sample$cut.score)
# iteration 4
# mtry: 302
# prediction error: 87.01491
# r^2: 0.2166524
# cor(y,yhat): 0.4673733
# SNPs with importance > 0: 459
# non-correlated features
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.counts.features.cas9.noncorrelated.txt", header=T, sep="\t", stringsAsFactors = F)
set.seed(123)
df.sample <- df[sample(nrow(df), min(10000, nrow(df))), ]
# NOTE(review): this call reads cut.score.x while the full-data call below reads
# cut.score from the same table -- check which column the file really contains and
# that no second score column is left among the features in 3:ncol(df).
iRF(df.sample[,3:ncol(df.sample)], df.sample$cut.score.x)
# iteration 2
# mtry: 251.5
# prediction error: 84.8334
# r^2: 0.2177831
# cor(y,yhat): 0.4678087
# SNPs with importance > 0: 328
iRF(df[,3:ncol(df)], df$cut.score)
# iteration 3
# mtry: 111
# prediction error: 82.52249
# r^2: 0.2532614
# cor(y,yhat): 0.5038021
# SNPs with importance > 0: 175
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.counts.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
set.seed(123)
df.sample <- df[sample(nrow(df), min(10000, nrow(df))), ]
iRF(df.sample[,3:ncol(df.sample)], df.sample$cut.score)
# iteration 5
# mtry: 82.5
# prediction error: 84.73241
# r^2: 0.1173544
# cor(y,yhat): 0.342927
# SNPs with importance > 0: 121
iRF(df[,3:ncol(df)], df$cut.score)
# Matrices for the python iRF runs: id + features, and features only (column 2, the
# score, is left out of both).
write.table(df[,c(1,3:ncol(df))], "ecoli.20sliding.dwtexact.haar.features.counts.txt", quote=F, row.names=F, sep="\t")
write.table(df[,c(1,3:ncol(df))], "ecoli.20sliding.dwtexact.haar.features.counts_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df[,3:ncol(df)], "ecoli.20sliding.dwtexact.haar.features.counts_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
–> run iRF (python submits)
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
module load python/3.7-anaconda3

#######################################
# Create the directory for one iRF run and call the builder script from inside it.
# Globals:   IRF_RUN_ROOT (read, optional) - parent directory for run directories;
#            defaults to the project's iRF.run directory
# Arguments: $1 - run name (also used as the run directory name)
#            $2 - feature matrix handed to the builder script
# Returns:   non-zero if the run directory cannot be created or entered, otherwise
#            the builder script's exit status
# Note:      leaves the shell inside the run directory, as the plain cd did before.
#######################################
setup_irf_run() {
  local run_name=$1
  local feature_matrix=$2
  local seed_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
  local run_root=${IRF_RUN_ROOT:-${seed_dir}/iRF.run}
  local run_dir=${run_root}/${run_name}

  # -p makes a re-run harmless when the directory already exists; the guards keep the
  # builder from writing into the wrong directory if either step fails.
  mkdir -p -- "${run_dir}" || return
  cd -- "${run_dir}" || return
  python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
    --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
    --NumTrees 1000 --NumIterations 5 --RunName "${run_name}" --bypass \
    --targetNodeSize 50 --Prediction "${feature_matrix}" \
    "${seed_dir}/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.pam.score.txt"
}

setup_irf_run Cas9.haar.counts.noncor /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/matrix.haar.counts.noncor.features.txt
setup_irf_run Cas9.haar.counts /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/matrix.haar.counts.features.txt
setup_irf_run Cas9.haar.counts.vienna /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.dwtexact.haar.features.counts.txt
# Summit
# Submit the generated LSF scripts for the three runs: first all "full" jobs, then all
# "train" jobs, and finally the "test" jobs.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.noncor
module load python/3.7.0-anaconda3-5.3.0
irf_run_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run
# full, then train
for stage in full train; do
  for run_name in Cas9.haar.counts.noncor Cas9.haar.counts Cas9.haar.counts.vienna; do
    bsub "${irf_run_root}/${run_name}/Submits/submit_${stage}_${run_name}_0.sh"
  done
done
# once the train submissions are done run the test submissions
# test
for run_name in Cas9.haar.counts.noncor Cas9.haar.counts Cas9.haar.counts.vienna; do
  bsub "${irf_run_root}/${run_name}/Submits/submit_test_${run_name}_0.sh"
done
# Andes
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.noncor
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.haar.counts.noncor
# 0.2395267896029956
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20homo_lumo_energygapraw cut.score 0.05954956505859089
# p18xz_quadrupoleraw cut.score 0.039654539452662466
# p20xz_quadrupoleraw cut.score 0.03672003413361222
# sgRNA.gcsgRNA.raw cut.score 0.027824253296751224
# p19rot_constants_zraw cut.score 0.02533383206618757
# ipd.dwtd22 cut.score 0.025101854928008732
# p19rot_constants_yraw cut.score 0.02457576978847325
# p15.CCsgRNA.raw cut.score 0.023617654167064146
# pam.dwtd6 cut.score 0.021865132817698564
# pam.dwtd5 cut.score 0.020222993284043336
# Pearson correlation between measured and predicted cut.score (fold9 / Set4 test split)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.noncor/cut.score/foldRuns/fold9/Runs/Set4")
# measured values
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=TRUE, sep="\t")
# iRF predictions for the same split
pred <- read.delim("Cas9.haar.counts.noncor_Set4_test.prediction", header=TRUE, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.4892823
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.haar.counts
# 0.23978870160766935
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20homo_lumo_energygapraw cut.score 0.05732839462698965
# p18xz_quadrupoleraw cut.score 0.038261397392095185
# p20xz_quadrupoleraw cut.score 0.036703723267508356
# sgRNA.gcsgRNA.raw cut.score 0.02797136635004197
# p19rot_constants_zraw cut.score 0.026374777751191965
# ipd.dwtd22 cut.score 0.024969147515260026
# p15.CCsgRNA.raw cut.score 0.023751372705742988
# p19rot_constants_yraw cut.score 0.023073983414535273
# pam.dwtd6 cut.score 0.02205622024149846
# pam.dwtd5 cut.score 0.020178469090326737
# Pearson correlation between measured and predicted cut.score (fold9 / Set4 test split)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts/cut.score/foldRuns/fold9/Runs/Set4")
# measured values
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=TRUE, sep="\t")
# iRF predictions for the same split
pred <- read.delim("Cas9.haar.counts_Set4_test.prediction", header=TRUE, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.4896945
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.haar.counts.vienna
#
sort -k3rg topVarEdges/cut.score_top95.txt | head
# sgRNA_ipd.dwtd8 cut.score 0.01885659287452323
# sgRNA_structure.dwtd7 cut.score 0.018419146225350912
# sgRNA_ipd.dwtd6 cut.score 0.017737060406787897
# sgRNA_structure.dwtd4 cut.score 0.017728092617030615
# sgRNA_ipd.dwtd3 cut.score 0.01755391465287573
# sgRNA_ipd.dwtd5 cut.score 0.017460713695041093
# sgRNA_structure.dwtd6 cut.score 0.016950403754090838
# sgRNA_ipd.dwtd7 cut.score 0.016545999139798036
# sgRNA_structure.dwtd5 cut.score 0.016525394575236656
# sgRNA_ipd.dwtd4 cut.score 0.016425894812691542
# Pearson correlation between measured and predicted cut.score (fold9 / Set4 test split)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna/cut.score/foldRuns/fold9/Runs/Set4")
# measured values
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=TRUE, sep="\t")
# iRF predictions for the same split
pred <- read.delim("Cas9.haar.counts.vienna_Set4_test.prediction", header=TRUE, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.1017838
*** HAAR wavelets actually appear to make the dataset worse?? But that shouldn't influence the "basic rules" feature set, since that set only includes one-hot, GC content, melting temperature and structure... so is it ViennaRNA (instead of iFeature) that is making it worse??? --> Testing the "basic rules" feature set on the first 10,000 rows: the iFeature.py matrix gives GBR 0.53 and the ViennaRNA matrix gives GBR 0.75 --> so why did the newer matrix with haar and ViennaRNA somehow only reach GBR 0.35??? --> Is it using a different set of samples?
# confused... testing the different data matrices
library(dplyr)
library(caret)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.counts.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
vienna <- df[,c(1,2,1642:1644,3:17,1645:1649,1651:1652,1654:1655,18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579)]
ncol(vienna)
# 413
nrow(vienna)
# 40468
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.all.features.dcast.26oct.cas9.header.txt", header=T, sep="\t", stringsAsFactors = F)
iFeature <- df[,c(1,2,1642:1644,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655)]
ncol(iFeature)
# 413
nrow(iFeature)
# 40467
iFeature.subset <- iFeature[1:10000,]
data <- iFeature.subset[,2:ncol(iFeature.subset)]
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score ~ ., data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
test.df <- na.omit(testing)
pred <- predict(gbmFit1, newdata = test.df)
pred.df <- data.frame(pred.score = pred, exp.score = test.df$cut.score)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.5311444
vienna.rows <- left_join(iFeature[,1:2], vienna, by="sgRNAID")
vienna.subset <- vienna.rows[1:10000,c(1,3:ncol(vienna.rows))]
data <- vienna.subset[,2:ncol(vienna.subset)]
set.seed(998)
inTraining <- createDataPartition(data$cut.score.y, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score.y ~ ., data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
test.df <- na.omit(testing)
pred <- predict(gbmFit1, newdata = test.df)
pred.df <- data.frame(pred.score = pred, exp.score = test.df$cut.score.y)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.3830305
# Compare the column names of the two feature matrices.
iFeature.col <- data.frame(col = names(iFeature))
vienna.col <- data.frame(col = names(vienna))
# length() of the logical vector returned by %in% is always ncol(iFeature) (413),
# whether or not the names match; sum() counts the names that are really shared.
sum(iFeature.col$col %in% vienna.col$col)
# names found in only one of the two matrices (both results are empty when every column matches)
setdiff(iFeature.col$col, vienna.col$col)
setdiff(vienna.col$col, iFeature.col$col)
# 413 was recorded with length(), which cannot fail --> re-check with sum()/setdiff()... and what about values?
iFeature.dat.order <- iFeature.subset[ , order(names(iFeature.subset))]
vienna.dat.order <- vienna.subset[ , order(names(vienna.subset))]
table(iFeature.dat.order == vienna.dat.order, useNA = 'ifany')
# TRUE
# 4100000
# cut.score.x (the iFeature score brought in by the join) is not a column of vienna.subset:
# only columns 1 and 3:ncol of vienna.rows were kept when it was built. Take it from
# vienna.rows for the same first 10,000 rows; useNA shows rows without a match in the join.
table(vienna.rows$cut.score.x[1:10000] == vienna.subset$cut.score.y, useNA = 'ifany')
# TRUE
# 10000
### how can the dataframes match exactly and yet return two very different GBR predictions? Does the order of the features somehow matter?
table(vienna.subset$sgRNA.structuresgRNA.raw == iFeature.subset$sgRNA.structuresgRNA.raw)
data <- vienna.dat.order[,2:ncol(vienna.dat.order)]
set.seed(998)
inTraining <- createDataPartition(data$cut.score.y, p = .75, list = FALSE)
training <- data[ inTraining,]
testing <- data[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score.y ~ ., data = training,
method = "gbm",
trControl = fitControl,
verbose = FALSE)
test.df <- na.omit(testing)
pred <- predict(gbmFit1, newdata = test.df)
pred.df <- data.frame(pred.score = pred, exp.score = test.df$cut.score.y)
cor(pred.df$pred.score, pred.df$exp.score)
################ I am an IDIOT... the one dataframe had all types of cas, not just Cas9... so when doing randomizations or just taking the first 10,000 rows it included recA and esp which would result in a lower correlation than just the Cas9 matrix... Adjust and re-run below....
# Keep only the Cas9 rows of the dcast feature matrix, then put the header line back on top.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
matrix=ecoli.20sliding.dwtexact.haar.features.counts.dcast.txt
stem=ecoli.20sliding.dwtexact.haar.features.counts
# rows whose text contains _Cas9
grep '_Cas9' "$matrix" > "$stem.cas9.txt"
# first line of the matrix = column header
head -n 1 "$matrix" > "$stem.header.txt"
cat "$stem.header.txt" "$stem.cas9.txt" > "$stem.cas9.header.txt"
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J iRF.GBR
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
# Slurm batch job: activates the conda environment and runs R scripts non-interactively
# from the e.coli project directory.
# NOTE(review): the lines below look like the bodies of two job scripts sharing the header
# above (iRF.GBR.test.sh with the two *.test.R scripts, iRF.GBR.full.sh with the *.full.R
# script) -- confirm against the files on disk.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# test scripts
R CMD BATCH iRF.nov4.test.R
R CMD BATCH GBR.nov4.test.R
# submit with:
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.GBR.test.sh
# full script
R CMD BATCH iRF.nov4.full.R
# submit with:
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.GBR.full.sh
# salloc -A SYB105 -N 2 -t 4:00:00 -p gpu
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(caret)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.counts.cas9.header.txt", header=T, sep="\t", stringsAsFactors = F)
#df.sample <- df[sample(nrow(df), 10000), ]
df.sample <- df[1:10000,]
## traditional features including gc content, melting temp, structure, one-hot (all)
data <- df.sample[,c(2,1642:1644,3:17,1645:1649,1651:1652,1654:1655,18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579)]
# 0.3554742
# all DWT features (gc, temp, structure, gene density, RNAseq, IPD, GATC, PAM)
data <- df.sample[,c(2,1656:1839)]
# Raw features (gc, temp, structure, location, pam)
data <- df.sample[,c(2,1642:1644,1650,1653,1840)]
# One-hot features (dependent & independent sgRNA, PAM)
## independent
data <- df.sample[,c(2,3:17,1645:1649,1651:1652,1654:1655)]
## dependent
data <- df.sample[,c(2,18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579)]
## ind/dep
data <- df.sample[,c(2,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655)]
# Tensor features
data <- df.sample[,c(2,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)]
# Raw + Onehot
data <- df.sample[,c(2,1642:1644,1650,1653,1840,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655)]
# Raw + Tensor
data <- df.sample[,c(2,1642:1644,1650,1653,1840,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)]
# Raw + DWT
data <- df.sample[,c(2,1642:1644,1650,1653,1840,1656:1839)]
# Onehot + Tensor
# (the raw columns 1642:1644,1650,1653,1840 were also listed here, which made this set
#  the same as "Raw + Onehot + Tensor"; they are removed so the set matches its label)
data <- df.sample[,c(2,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)]
# Onehot + DWT
data <- df.sample[,c(2,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,1656:1839)]
# Tensor + DWT
data <- df.sample[,c(2,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641,1656:1839)]
# Raw + Onehot + Tensor
data <- df.sample[,c(2,1642:1644,1650,1653,1840,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)]
# Raw + Onehot + DWT
data <- df.sample[,c(2,1642:1644,1650,1653,1840,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,1656:1839)]
# Raw + Tensor + DWT
data <- df.sample[,c(2,1642:1644,1650,1653,1840,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641,1656:1839)]
# Onehot + Tensor + DWT
# (the raw columns 1642:1644,1650,1653,1840 were also listed here, which made this set
#  the same as "Raw + Onehot + Tensor + DWT"; they are removed so the set matches its label)
data <- df.sample[,c(2,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641,1656:1839)]
# Raw + Onehot + Tensor + DWT
data <- df.sample[,c(2,1642:1644,1650,1653,1840,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641,1656:1839)]
## all features sample
data <- df.sample[,2:ncol(df.sample)]
## all features full matrix
data <- df[,2:ncol(df)]
# Hold out 25% of the rows of `data`, tune a gbm model on the other 75% with 5-fold
# cross-validation repeated 10 times, then report the Pearson correlation between
# predicted and measured cut.score on the held-out rows.
set.seed(998)
inTraining <- createDataPartition(data$cut.score, p = 0.75, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
set.seed(825)
gbmFit1 <- train(cut.score ~ ., data = training, method = "gbm", trControl = fitControl, verbose = FALSE)
# drop held-out rows containing NA before predicting
test.df <- na.omit(testing)
pred <- predict(gbmFit1, newdata = test.df)
pred.df <- data.frame(pred.score = pred, exp.score = test.df$cut.score)
with(pred.df, cor(pred.score, exp.score))
# salloc -A SYB105 -p gpu -N 1 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(ranger)
# Iterative random forest (iRF) built on ranger.
#
# Fits up to `iter` forests in sequence. After each fit the corrected impurity importances
# are rescaled, negative values are set to zero, and the result is used as the
# split-selection weights of the next fit. Progress is printed with cat() each iteration.
#
# Arguments:
#   xmat            features, one column per feature
#   y               response, bound to xmat as column "Y" (so one value per row of xmat)
#   ntree           number of trees per forest (ranger num.trees)
#   iter            maximum number of iterations
#   classification  passed to ranger(); FALSE also prints r^2, TRUE prints the confusion matrix
#   threads         passed to ranger() as num.threads
#   alwayssplits    passed to ranger() as always.split.variables
#   saveall         TRUE: return a list with the forest of every iteration
#                   FALSE: return only the last forest that was fitted
# Returns: list of ranger objects (saveall=TRUE) or one ranger object (saveall=FALSE).
iRF <- function(xmat, y, ntree=200, iter=5, classification=F, threads=1, alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with the same split weight for every feature
  rfs <- list()
  rf <- NULL
  for(i in seq_len(iter)) # seq_len() runs zero times for iter = 0 (1:iter would run for 1 and 0)
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    # try half of the features that still have a positive weight: a whole number, at least 1
    mtry <- max(1, floor(0.5*sum(wt>0)))
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    wt <- rf$variable.importance / sum(abs(rf$variable.importance)) # scale importance to range(-1,1)
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    if(saveall) rfs[[i]] <- rf
    # stop early once too few features keep a positive weight
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  # saveall=FALSE: hand back the last forest whether or not the loop stopped early
  # (previously it was only returned on an early stop, otherwise an empty list came back)
  if(!saveall && !is.null(rf)) rfs <- rf
  return(rfs)
}
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.counts.cas9.header.txt", header=T, sep="\t", stringsAsFactors = F)
#df.sample <- df[sample(nrow(df), 10000), ]
df.sample <- df[1:10000,]
# sgRNAID: [,1]
# cut.score: [,2]
# one-hot independent: [,c(3:17,1645:1649,1651:1652,1654:1655)]
# one-hot dependent: [,c(18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579)]
# chemical tensors: [,c(58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)]
# raw (gc, structure, temp, gene.distance, pam.distance, gatc count): [,c(1642:1644,1650,1653,1840)]
# DWT gatc motif: [,1656:1678]
# DWT gc content: [,1679:1701]
# DWT gene density: [,1702:1724]
# DWT ipd: [,1725:1747]
# DWT PAM: [,1748:1770]
# DWT rna-seq expression: [,1771:1793]
# DWT rna structure: [,1794:1816]
# DWT melting temp: [,1817:1839]
## traditional features including gc content, melting temp, structure, one-hot (all)
iRF(df.sample[,c(1642:1644,3:17,1645:1649,1651:1652,1654:1655,18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579)], df.sample$cut.score)
# all DWT features (gc, temp, structure, gene density, RNAseq, IPD, GATC, PAM)
iRF(df.sample[,1656:1839], df.sample$cut.score)
# absolute value DWT
iRF(abs(df.sample[,1656:1839]), df.sample$cut.score)
# PAM DWT
iRF(df.sample[,1748:1770], df.sample$cut.score)
# Gene DWT
iRF(df.sample[,1702:1724], df.sample$cut.score)
# Raw features (gc, temp, structure, location, pam)
iRF(df.sample[,c(1642:1644,1650,1653,1840)], df.sample$cut.score)
# One-hot features (dependent & independent sgRNA, PAM)
## independent
iRF(df.sample[,c(3:17,1645:1649,1651:1652,1654:1655)], df.sample$cut.score)
## dependent
iRF(df.sample[,c(18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579)], df.sample$cut.score)
## ind/dep
iRF(df.sample[,c(3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655)], df.sample$cut.score)
# Tensor features
iRF(df.sample[,c(58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)], df.sample$cut.score)
# Raw + Onehot
iRF(df.sample[,c(1642:1644,1650,1653,1840,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655)], df.sample$cut.score)
# Raw + Tensor
iRF(df.sample[,c(1642:1644,1650,1653,1840,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)], df.sample$cut.score)
# Raw + DWT
iRF(df.sample[,c(1642:1644,1650,1653,1840,1656:1839)], df.sample$cut.score)
# Onehot + Tensor
# (the raw columns 1642:1644,1650,1653,1840 were also listed here, which made this set
#  the same as "Raw + Onehot + Tensor"; they are removed so the set matches its label)
iRF(df.sample[,c(3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)], df.sample$cut.score)
# Onehot + DWT
iRF(df.sample[,c(3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,1656:1839)], df.sample$cut.score)
# Tensor + DWT
iRF(df.sample[,c(58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641,1656:1839)], df.sample$cut.score)
# Raw + Onehot + Tensor
iRF(df.sample[,c(1642:1644,1650,1653,1840,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641)], df.sample$cut.score)
# Raw + Onehot + DWT
iRF(df.sample[,c(1642:1644,1650,1653,1840,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,1656:1839)], df.sample$cut.score)
# Raw + Tensor + DWT
iRF(df.sample[,c(1642:1644,1650,1653,1840,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641,1656:1839)], df.sample$cut.score)
# Onehot + Tensor + DWT
# (the raw columns 1642:1644,1650,1653,1840 were also listed here, which made this set
#  the same as "Raw + Onehot + Tensor + DWT"; they are removed so the set matches its label)
iRF(df.sample[,c(3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641,1656:1839)], df.sample$cut.score)
# Raw + Onehot + Tensor + DWT
iRF(df.sample[,c(1642:1644,1650,1653,1840,3:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1645:1649,1651:1652,1654:1655,58:119,140:201,222:283,304:365,386:447,468:529,550:611,632:693,714:775,796:919,944:1067,1088:1149,1170:1231,1252:1313,1334:1395,1416:1477,1498:1559,1580:1641,1656:1839)], df.sample$cut.score)
## all features sample
iRF(df.sample[,3:ncol(df.sample)], df.sample$cut.score)
## all features full matrix
iRF(df[,3:ncol(df)], df$cut.score)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.counts.cas9.header.txt", header=T, sep="\t", stringsAsFactors = F)
write.table(df[,c(1,3:ncol(df))], "ecoli.20sliding.dwtexact.haar.counts.vienna.features.txt", quote=F, row.names=F, sep="\t")
write.table(df[,c(1,3:ncol(df))], "ecoli.20sliding.dwtexact.haar.counts.vienna.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df[,3:ncol(df)], "ecoli.20sliding.dwtexact.haar.counts.vienna.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
# Build the iRF submit scripts for the Cas9.haar.counts.vienna run from the Cas9-only
# feature matrix (first file argument) and the score file (second file argument).
module load python/3.7-anaconda3
# -p: do not fail when the run directory is already there from an earlier run of this name
mkdir -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName Cas9.haar.counts.vienna --bypass --targetNodeSize 50 --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.dwtexact.haar.counts.vienna.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.pam.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna/Submits/submit_full_Cas9.haar.counts.vienna_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna/Submits/submit_train_Cas9.haar.counts.vienna_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna/Submits/submit_test_Cas9.haar.counts.vienna_0.sh
# Andes
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt Cas9.haar.counts.vienna
#
sort -k3rg topVarEdges/cut.score_top95.txt | head
sort -k2rg cut.score/foldRuns/fold9/Runs/Set4/Cas9.haar.counts.vienna_cut.score.importance4 | head
# p20homo_lumo_energygapraw: 60998.7
# sgRNA_pam.dwtd6: 21606.4
# p19.GGsgRNA.raw: 17184.9
# p15.CCsgRNA.raw: 16729.5
# sgRNA_pam.dwtd5: 14589.4
# p18xz_quadrupoleraw: 13782.4
# sgRNA_ipd.dwtd22: 11848.4
# CCsgRNA.raw: 10718.7
# p18yz_quadrupoleraw: 10354.2
# p20xz_quadrupoleraw: 10211.3
# Pearson correlation between measured and predicted cut.score (fold9 / Set4 test split)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna/cut.score/foldRuns/fold9/Runs/Set4")
# measured values
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=TRUE, sep="\t")
# iRF predictions for the same split
pred <- read.delim("Cas9.haar.counts.vienna_Set4_test.prediction", header=TRUE, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.4920532
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J RIT.run
#SBATCH -N 2
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
# Slurm batch job: runs runRIT.sh for the cut.score target of two iRF runs, each from
# inside that run's own cut.score directory.
# A failed cd stops the job instead of letting runRIT.sh start in the wrong directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna/cut.score || exit 1
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/runRIT.sh cut.score Cas9.haar.counts.vienna
# this line used to be the bare directory path without cd, which the shell cannot execute,
# so the second runRIT.sh call ran in the first run's directory
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.allFeatures.var/cut.score || exit 1
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/runRIT.sh cut.score Cas9.allFeatures.var
#### /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/runRIT.sh: line 24: 98103 Killed python /gpfs/alpine/syb105/proj-shared/Personal/jromero/PathAnalysis/preprocessPathsForRIT.py ${prename}_${feature}.paths
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Cas9.haar.counts.vienna/cut.score/RIT.run
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate shap
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# python
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
# Load the Cas9-only feature matrix
df = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.dwtexact.haar.features.counts.cas9.header.txt')
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Response is 'cut.score'; predictors are all remaining columns except the sgRNA identifier
# (column names can be listed from R with dput(colnames(df)))
Y = df['cut.score']
X = df.drop(columns=['sgRNAID', 'cut.score'])
# 80% of the rows for training, 20% held out
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
# Small random forest regressor: 10 trees, depth at most 6
model = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=0)
model.fit(X_train, Y_train)
import shap
# SHAP values of the fitted forest for every training row
shap_values = shap.TreeExplainer(model).shap_values(X_train)
# Bar summary plot. show=False stops shap from calling plt.show() itself, so the
# figure is still there to be written by savefig.
f = plt.figure()
shap.summary_plot(shap_values, X_train, plot_type="bar", show=False)
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/Cas9.haar.counts.vienna.shap_summary_plot_bar.png", bbox_inches='tight', dpi=600)
import matplotlib.pyplot as plt
# Default (per-sample) summary plot, saved the same way
f = plt.figure()
shap.summary_plot(shap_values, X_train, show=False)
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/Cas9.haar.counts.vienna.shap_summary_plot_varimp.png", bbox_inches='tight', dpi=600)
# directionality of feature importance
def ABS_SHAP(df_shap,df):
    """Draw a horizontal bar plot of the mean absolute SHAP value of every feature.

    Each bar is colored by the sign of the correlation between the feature's values
    and its SHAP values: 'red' when the correlation is positive, 'blue' otherwise
    (an undefined correlation is treated as 0, hence 'blue').

    Parameters
    ----------
    df_shap : array-like
        SHAP values, one row per sample and one column per feature, with the columns
        in the same order as the columns of ``df``.
    df : pandas.DataFrame
        Feature values the SHAP values were computed for.

    Returns
    -------
    None
        The plot is drawn through ``DataFrame.plot.barh``.
    """
    # SHAP values in a frame whose columns carry the feature names
    shap_v = pd.DataFrame(df_shap)
    feature_list = df.columns
    shap_v.columns = feature_list
    # copy of the features with a fresh 0..n-1 index so rows line up with shap_v
    df_v = df.copy().reset_index().drop('index',axis=1)
    # Correlation of every feature with its own SHAP values; its sign picks the bar color
    corr_list = list()
    for i in feature_list:
        b = np.corrcoef(shap_v[i],df_v[i])[1][0]
        corr_list.append(b)
    corr_df = pd.concat([pd.Series(feature_list),pd.Series(corr_list)],axis=1).fillna(0)
    # Column 1 is the feature, column 2 is the correlation coefficient
    corr_df.columns = ['Variable','Corr']
    corr_df['Sign'] = np.where(corr_df['Corr']>0,'red','blue')
    # Mean absolute SHAP value per feature
    shap_abs = np.abs(shap_v)
    k=pd.DataFrame(shap_abs.mean()).reset_index()
    k.columns = ['Variable','SHAP_abs']
    k2 = k.merge(corr_df,left_on = 'Variable',right_on='Variable',how='inner')
    # ascending sort so the bars are drawn in order of increasing mean |SHAP|
    k2 = k2.sort_values(by='SHAP_abs',ascending = True)
    colorlist = k2['Sign']
    ax = k2.plot.barh(x='Variable',y='SHAP_abs',color = colorlist, figsize=(5,6),legend=False)
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")
# The bar chart is drawn by pandas on a figure it opens itself, so a figure
# created before the call stays empty. Take the figure that is current after
# the call and save that one.
ABS_SHAP(shap_values,X_train)
f = plt.gcf()
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/Cas9.haar.counts.vienna.shap_summary_plot_abs.png", bbox_inches='tight', dpi=600)
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/Cas9.haar.counts.vienna.shap_summary_plot_bar.png /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/SHAP/.
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/Cas9.haar.counts.vienna.shap_summary_plot_varimp.png /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/SHAP/.
Looking back at the manuscript to understand the differences between their feature matrix and mine: 1) they encode 16 features for the PAM, i.e. the NN sequence of NGGN (the two variable positions); 2) they calculate the Tm of the DNA/RNA duplex with the Biopython (version 1.66) Tm_staluc function (DNA duplex version) <– compared to my values, not too different; 3) they also took the melting temperature of T5 (five nucleotides immediately proximal to the PAM), T8 (eight nucleotides adjacent to the 5′ side of T5), and T7 (seven nucleotides at the 5′ end of the 20-mer).
–> NGGN
# encode NN of PAM
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli

# Run fastaRegexFinder.py once per NGGN variant (16 in total), one output
# file per variant. The loop order reproduces the order in which the
# per-variant files are concatenated below.
regex_finder=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/fastaRegexFinder.py
genome_fasta=genome/GCF_000005845.2_ASM584v2_genomic.fna
pam_files=()
for first in A T C G; do
  for last in A C T G; do
    pam="${first}GG${last}"
    python "$regex_finder" -q -f "$genome_fasta" -r "$pam" > "${pam}.PAM.txt"
    pam_files+=("${pam}.PAM.txt")
  done
done

# Pool all variants and order by column 1, then numerically by column 2.
cat "${pam_files[@]}" > NGGN.PAM.txt
sort -k 1,1 -k 2,2n NGGN.PAM.txt > NGGN.PAM.sorted.bed

# intersect with sliding windows in the genome to get density for DWT
bedtools intersect -wo -a ecoli.20bp.sliding.bed -b NGGN.PAM.sorted.bed > NGGN.PAM.20bp.sliding.windows.bed

# closest with gRNAs to identify distance (downstream, strand)
# NOTE(review): the "+"-annotated copy written here is not the file passed to
# bedtools closest on the next line; confirm which input was intended.
awk -v OFS='\t' '{print $0, "+"}' sgRNA.coord.bed > sgRNA.coord.strand.txt
bedtools closest -a sgRNA.coord.bed -b NGGN.PAM.sorted.bed -io -iu -D a > ecoli.sgRNA.closestNGGN.bed
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Build per-sgRNA PAM features: one 0/1 indicator per NGGN variant plus the
# distance reported by bedtools closest, join them to the cut scores, reshape
# to one row per sgRNA, and finally merge with the existing location feature
# table.
library(dplyr)
library(tidyr)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# NOTE(review): `window` is read here but not referenced again in this section.
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# keep chr, start, end, sgRNA and cut.score
score.df <- score[,c(1:4,8)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
sgRNA.pam <- read.table("ecoli.sgRNA.closestNGGN.bed", header=F, sep="\t", stringsAsFactors = F)
# Columns are picked by position and named on the next line.
# NOTE(review): confirm columns 4/11/12 of the bedtools closest output really
# are the sgRNA id, the matched PAM sequence and the distance.
sgRNA.pam.sub <- sgRNA.pam[,c(4,11,12)]
colnames(sgRNA.pam.sub) <- c("sgRNA", "pam.code", "pam.distance")
# One indicator column per NGGN variant, named PAM.<first N><last N>.
sgRNA.pam.onehot <- sgRNA.pam.sub %>% mutate(PAM.AA = ifelse(pam.code == "AGGA", 1, 0), PAM.AC = ifelse(pam.code == "AGGC", 1, 0), PAM.AG = ifelse(pam.code == "AGGG", 1, 0), PAM.AT = ifelse(pam.code == "AGGT", 1, 0), PAM.CA = ifelse(pam.code == "CGGA", 1, 0), PAM.CC = ifelse(pam.code == "CGGC", 1, 0), PAM.CG = ifelse(pam.code == "CGGG", 1, 0), PAM.CT = ifelse(pam.code == "CGGT", 1, 0), PAM.GA = ifelse(pam.code == "GGGA", 1, 0), PAM.GC = ifelse(pam.code == "GGGC", 1, 0), PAM.GG = ifelse(pam.code == "GGGG", 1, 0), PAM.GT = ifelse(pam.code == "GGGT", 1, 0), PAM.TA = ifelse(pam.code == "TGGA", 1, 0), PAM.TC = ifelse(pam.code == "TGGC", 1, 0), PAM.TG = ifelse(pam.code == "TGGG", 1, 0), PAM.TT = ifelse(pam.code == "TGGT", 1, 0))
# drop column 2 (pam.code); keep sgRNA, pam.distance and the 16 indicators
sgRNA.pam.df <- sgRNA.pam.onehot[,c(1,3:19)]
score.location <- left_join(score.df, sgRNA.pam.df, by=c("sgRNA"))
# constant "scale" of 0; it is pasted onto each feature name by unite() below
score.location$scale <- 0
# long format: one row per (sgRNA, feature)
# NOTE(review): melt/dcast are called unqualified on a data.frame; which
# package's method is dispatched depends on what is attached. Confirm.
df.melt <- melt(score.location[,4:23], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# back to wide format; repeated (sgRNA, feature) entries are averaged
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40468
write.table(df.dcast.na, "ecoli.sgRNA.NGGN.dcast.txt", quote=F, row.names=F, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Re-read the table just written and merge it with the location features.
df.dcast <- read.delim("ecoli.sgRNA.NGGN.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep <- df.dcast %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
# NOTE(review): positional selection keeps columns 1, 2 and 4-8 only (column 3
# and everything after column 8 are dropped); confirm this subset is intended.
df.dcast.sep <- df.dcast.sep[,c(1,2,4:8)]
df <- read.delim("ecoli.20sliding.location.all.features.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
df.location <- inner_join(df.sep, df.dcast.sep, by=c("sgRNA", "ID"))
# rebuild the original sgRNAID key from its three parts
df.location.id <- df.location %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.location.id)
# 118140
write.table(df.location.id, "ecoli.20sliding.NGGN.all.features.dcast.txt", quote=F, row.names=F, sep="\t")
–> Tm
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Turn the sgRNA table (header row skipped) into FASTA files: one for the
# full sequence in column 3, and one per sub-region cut out by byte range.
sgrna_table=Ecoli.allCas9.txt

tail -n +2 "$sgrna_table" | awk '{print ">"$1"\n"$3}' > Ecoli.allCas9.fasta
tail -n +2 "$sgrna_table" | cut -f 1 > Ecoli.allCas9.ID.txt

# label:byte-range of each sub-region within the sequence column.
# NOTE(review): the notes before this section describe T5 as the five
# nucleotides next to the PAM and T7 as the 5' end of the 20-mer, while here
# T5 is bytes 1-5 and T7 is bytes 14-20. Confirm the orientation of the
# sequences stored in column 3.
for region in T5:1-5 T8:6-13 T7:14-20; do
  label=${region%%:*}
  byte_range=${region##*:}
  tail -n +2 "$sgrna_table" | cut -f 3 | cut -b "$byte_range" -n > "Ecoli.allCas9.${label}seq.txt"
  paste Ecoli.allCas9.ID.txt "Ecoli.allCas9.${label}seq.txt" \
    | awk '{print ">"$1"\n"$2}' > "Ecoli.allCas9.${label}.fasta"
done
### melting temp
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
from Bio import SeqIO


def write_nuc_counts(fasta_path, tsv_path):
    """Write per-record nucleotide counts of a FASTA file as a TSV table.

    One row is written per FASTA record: record name, counts of A, C, G and
    T, sequence length, and (C + G) / length. The last column is headed
    'CG%' but holds a fraction, not a percentage. An empty record gets a
    fraction of 0.0 instead of raising ZeroDivisionError.

    Parameters
    ----------
    fasta_path : path of the FASTA file to read.
    tsv_path : path of the tab-separated table to (over)write.
    """
    # Context managers close both files even if parsing fails midway.
    with open(fasta_path, 'r') as input_file, open(tsv_path, 'w') as output_file:
        output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
        for cur_record in SeqIO.parse(input_file, "fasta"):
            gene_name = cur_record.name
            A_count = cur_record.seq.count('A')
            C_count = cur_record.seq.count('C')
            G_count = cur_record.seq.count('G')
            T_count = cur_record.seq.count('T')
            length = len(cur_record.seq)
            cg_percentage = float(C_count + G_count) / length if length else 0.0
            output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
                (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
            output_file.write(output_line)


# Full-length sequences plus the three sub-regions, all in one interpreter
# session. Previously exit() followed the first file, so the remaining three
# copies of the loop never ran in that session.
for prefix in ('Ecoli.allCas9', 'Ecoli.allCas9.T5', 'Ecoli.allCas9.T8', 'Ecoli.allCas9.T7'):
    write_nuc_counts(prefix + '.fasta', prefix + '_nuc_counts.tsv')
exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")

# Read one nucleotide-count table and return its Window column together with
# the melting-temperature estimate
#   64.9 + 41 * (nG + nC - 16.4) / (nA + nT + nG + nC)
# stored under the column name given in `tm.col`.
tm.table <- function(count.file, tm.col) {
  counts <- read.delim(count.file, header=T, sep="\t")
  gc <- counts[["G"]] + counts[["C"]]
  total <- counts[["A"]] + counts[["T"]] + counts[["G"]] + counts[["C"]]
  counts[[tm.col]] <- 64.9 + 41 * (gc - 16.4) / total
  counts[, c("Window", tm.col)]
}

# Full 20-mer first, then the sub-regions, joined on Window in that order.
tm.all <- tm.table("Ecoli.allCas9_nuc_counts.tsv", "Tm.T20") %>%
  left_join(tm.table("Ecoli.allCas9.T5_nuc_counts.tsv", "Tm.T5"), by="Window") %>%
  left_join(tm.table("Ecoli.allCas9.T8_nuc_counts.tsv", "Tm.T8"), by="Window") %>%
  left_join(tm.table("Ecoli.allCas9.T7_nuc_counts.tsv", "Tm.T7"), by="Window")

write.table(tm.all, "Ecoli.allCas9.T5.T8.T7.nuc.count.txt", quote=F, row.names=F, sep="\t")
q()
–> combine matrix
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# ecoli.20sliding.dwtexact.haar.features.counts.dcast.txt
# ecoli.20sliding.dwtexact.haar.features.counts.cas9.header.txt
# Append the melting-temperature columns and the NGGN PAM columns to the main
# feature matrix (joined on sgRNAID) and export the combined table in the
# layouts used downstream.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.counts.cas9.header.txt", header=T, sep="\t", stringsAsFactors = F)
temp <- read.delim("Ecoli.allCas9.T5.T8.T7.nuc.count.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(temp) <- c("sgRNAID", "Tm.T20", "Tm.T5", "Tm.T8", "Tm.T7")
nggn <- read.delim("ecoli.sgRNA.NGGN.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# the main matrix keys rows as "<sgRNA>_Cas9", so build the same key here
nggn$sgRNAID <- paste(nggn$sgRNA, "_Cas9", sep="")
df.temp <- left_join(df, temp, by="sgRNAID")
# columns 3:20 of nggn = the PAM feature columns plus the sgRNAID key just added
df.temp.nggn <- left_join(df.temp, nggn[,3:20], by="sgRNAID")
nrow(df.temp.nggn)
# 13850
ncol(df.temp.nggn)
# 1861
names(df.temp.nggn)
write.table(df.temp.nggn, "ecoli.20sliding.dwtexact.haar.features.counts.nggn.temp.cas9.txt", quote=F, row.names=F, sep="\t")
# Feature-only exports (column 2 is left out of all three). The column range
# must run to the width of the joined table; using ncol(df), the width before
# the joins, silently dropped the Tm and NGGN columns these files are named for.
n.all <- ncol(df.temp.nggn)
write.table(df.temp.nggn[,c(1,3:n.all)], "ecoli.20sliding.dwtexact.haar.counts.vienna.nggn.temp.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.temp.nggn[,c(1,3:n.all)], "ecoli.20sliding.dwtexact.haar.counts.vienna.nggn.temp.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.temp.nggn[,3:n.all], "ecoli.20sliding.dwtexact.haar.counts.vienna.nggn.temp.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# salloc -A SYB105 -N 2 -t 4:00:00 -p gpu
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(caret)
library(ranger)
# Iterative random forest built on ranger.
#
# Each iteration fits a forest whose split-selection weights are the
# (non-negative, rescaled) corrected-impurity importances of the previous
# iteration, starting from equal weights. mtry is half the number of
# variables that still carry a positive weight. Iteration stops early once
# fewer than max(1% of the variables, 10) variables keep a positive weight.
#
# Arguments:
#   xmat           predictor table (one column per variable)
#   y              response, bound to xmat as column "Y"
#   ntree          number of trees per forest
#   iter           maximum number of iterations
#   classification passed on to ranger; also selects which summary is printed
#   threads        number of ranger threads
#   alwayssplits   passed to ranger as always.split.variables
#   saveall        TRUE: return a list with the forest of every iteration;
#                  FALSE: return only the forest of the last iteration run
#
# Returns: see `saveall`.
iRF <- function(xmat, y, ntree=200, iter=5, classification=F, threads=1, alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal sample weighting per SNP
  rfs <- list()
  # seq_len() runs zero times for iter = 0, where 1:iter would count 1, 0
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    mtry = 0.5*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)
    wt <- rf$variable.importance / sum(abs(rf$variable.importance)) # scale importance to range(0,1)
    wt[wt<0] <- 0 # set negative weights to zero
    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    # a correlation is only defined for numeric predictions and response
    if(is.numeric(rf$predictions) && is.numeric(y)) cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")
    # Track the result on every iteration. Before, saveall=FALSE only stored
    # the forest on the early-stop path, so a run that used all iterations
    # returned an empty list.
    if(saveall) rfs[[i]] <- rf else rfs <- rf
    if(sum(wt>0) < max(0.01*(ncol(xmat)-1), 10)) break
  }
  return(rfs)
}
# Run iRF on a fixed random subsample of 10,000 rows of the combined feature
# matrix: once on a hand-picked column subset, once on every feature column.
# Neither result is stored; the commented lines are the console output that
# was recorded for each run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.counts.nggn.temp.cas9.txt", header=T, sep="\t", stringsAsFactors = F)
# fixed seed so the same 10,000 rows are drawn every time
set.seed(2458)
df.sample <- df[sample(nrow(df), 10000), ]
## traditional features including gc content, one-hot (20bp sequence), one-hot (NGGN), melting temp (T20,T5,T8,T7),
# NOTE(review): columns are selected by position; the mapping of these index
# ranges to the feature groups named above cannot be checked from here.
iRF(df.sample[,c(1642,3:17,1645:1649,1651:1652,1654:1655,18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1841:1861)], df.sample$cut.score)
# iRF iteration 2
# =================
# mtry: 135
# prediction error: 85.30685
# r^2: 0.2429945
# cor(y,yhat): 0.4939241
# SNPs with importance > 0: 167
# all columns from the third onward as predictors
iRF(df.sample[,3:ncol(df.sample)], df.sample$cut.score)
# iRF iteration 4
# =================
# mtry: 266.5
# prediction error: 87.37643
# r^2: 0.2246293
# cor(y,yhat): 0.4745373
# SNPs with importance > 0: 409
# Fit a gradient-boosting model (caret "gbm") on a 75% partition of `data`
# using 5-fold cross-validation repeated ten times, then predict the held-out
# rows that have no missing values. Returns the fitted model and a table of
# predicted vs. observed cut scores.
fit.gbm.cv <- function(data) {
  in.train <- createDataPartition(data$cut.score, p = .75, list = FALSE)
  training <- data[ in.train,]
  testing <- data[-in.train,]
  cv.control <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
  fit <- train(cut.score ~ ., data = training,
               method = "gbm",
               trControl = cv.control,
               verbose = FALSE)
  held.out <- na.omit(testing)
  predicted <- predict(fit, newdata = held.out)
  list(fit = fit,
       pred.df = data.frame(pred.score = predicted, exp.score = held.out$cut.score))
}

# Run 1: cut.score (column 2) plus the hand-picked feature subset.
gbm.subset <- fit.gbm.cv(df.sample[,c(2,1642,3:17,1645:1649,1651:1652,1654:1655,18:57,120:139,202:221,284:303,366:385,448:467,530:549,612:631,694:713,776:795,920:943,1068:1087,1150:1169,1232:1251,1314:1333,1396:1415,1478:1497,1560:1579,1841:1861)])
gbmFit1 <- gbm.subset$fit
pred.df <- gbm.subset$pred.df
cor(pred.df$pred.score, pred.df$exp.score)
# 0.4964133
head(summary(gbmFit1))
# var rel.inf
# p15.CCsgRNA.raw p15.CCsgRNA.raw 5.753905
# p20.TsgRNA.raw p20.TsgRNA.raw 4.578749
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 4.526943
# p18.GsgRNA.raw p18.GsgRNA.raw 4.054237
# CCsgRNA.raw CCsgRNA.raw 4.043466
# p18.CCsgRNA.raw p18.CCsgRNA.raw 3.997656

# Run 2: cut.score plus every feature column.
gbm.all <- fit.gbm.cv(df.sample[,2:ncol(df.sample)])
gbmFit1 <- gbm.all$fit
pred.df <- gbm.all$pred.df
cor(pred.df$pred.score, pred.df$exp.score)
# 0.4880126
head(summary(gbmFit1))
# var rel.inf
# sgRNA.gcsgRNA.raw sgRNA.gcsgRNA.raw 5.074761
# p20homo_energyraw p20homo_energyraw 4.822907
# p15.CCsgRNA.raw p15.CCsgRNA.raw 4.423948
# p20homo_lumo_energygapraw p20homo_lumo_energygapraw 3.735059
# p19.GGsgRNA.raw p19.GGsgRNA.raw 3.513272
# p20xz_quadrupoleraw p20xz_quadrupoleraw 3.006151
https://www.r-bloggers.com/2021/04/deep-neural-network-in-r/
# Dense neural-network regression of cut.score on all feature columns (keras).
library(keras)
library(mlbench)
library(dplyr)
library(magrittr)
library(neuralnet)
library(tensorflow)
use_condaenv("r-tensorflow")
# input data
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.counts.cas9.header.txt", header=T, sep="\t", stringsAsFactors = F)
ncol(df)
# 1840
# drop column 1; cut.score becomes column 1 of `data`, features are 2:1839
data <- df[,2:1840]
data %<>% mutate_if(is.factor, as.numeric)
# data partition
set.seed(123)
ind <- sample(2, nrow(data), replace = T, prob = c(.7, .3))
training <- data[ind==1,2:1839]
test <- data[ind==2,2:1839]
trainingtarget <- data[ind==1,1]
testtarget <- data[ind==2,1]
str(trainingtarget)
# num [1:9736] 0.2 25.39 31.5 3.98 28.7 ...
str(testtarget)
# num [1:4114] 27.3 33 34.6 25.1 17.4 ...
# scaling (test data is scaled with the training mean and sd)
m <- colMeans(training)
s <- apply(training, 2, sd)
# a constant column has sd 0 and would turn into NaN/Inf when divided by it;
# dividing by 1 instead leaves it at 0 after centering
s[!is.na(s) & s == 0] <- 1
training <- scale(training, center = m, scale = s)
test <- scale(test, center = m, scale = s)
# model creation
model <- keras_model_sequential()
# the input layer must be as wide as the feature matrix; the hard-coded 13
# did not match the number of feature columns selected above
model %>% layer_dense(units = 5, activation = 'relu', input_shape = c(ncol(training))) %>% layer_dense(units = 1)
### Error: Python module tensorflow.keras was not found.
# model compilation
model %>%
  compile(loss = 'mse', optimizer = 'rmsprop', metrics = 'mae')
# model fitting
mymodel <- model %>%
  fit(training,trainingtarget,epochs = 100,batch_size = 32,validation_split = 0.2)
# prediction
model %>% evaluate(test, testtarget)
pred <- model %>% predict(test)
# correlation
cor(testtarget, pred)
# https://www.kdnuggets.com/2016/08/begineers-guide-neural-networks-r.html/2
# Feed-forward network (neuralnet, three hidden layers of 10 units) on the
# first 10,000 rows; reports the correlation between predicted and observed
# (scaled) cut.score on a 30% hold-out.
library(dplyr)     # mutate_if
library(magrittr)  # %<>%
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.dwtexact.haar.features.counts.cas9.header.txt", header=T, sep="\t", stringsAsFactors = F)
df.sample <- df[1:10000,]
data <- df.sample[,2:1840]
data %<>% mutate_if(is.factor, as.numeric)
library(caTools)
set.seed(101)
# Create Split (any column is fine)
split = sample.split(data$cut.score, SplitRatio = 0.70)
# Split based off of split Boolean Vector
train = subset(data, split == TRUE)
test = subset(data, split == FALSE)
train[is.na(train)] <- 0
test[is.na(test)] <- 0
# scale every column (cut.score included) with the training mean and sd
m <- colMeans(train)
s <- apply(train, 2, sd)
train <- scale(train, center = m, scale = s)
test <- scale(test, center = m, scale = s)
# Drop columns that are entirely NA after scaling. The mask is derived from
# the training set and applied to both sets so they keep identical columns;
# filtering each set on its own could leave them misaligned.
keep <- colSums(is.na(train)) < nrow(train)
train <- train[, keep]
test <- test[, keep]
# every remaining column except the response is a predictor
feats <- setdiff(colnames(train), "cut.score")
# Concatenate strings
f <- paste(feats,collapse=' + ')
f <- paste('cut.score ~',f)
# Convert to formula
f <- as.formula(f)
f
library(neuralnet)
# linear.output=TRUE: the response is a continuous (z-scored) value, so the
# activation function must not be applied to the output neuron
nn <- neuralnet(f,train,hidden=c(10,10,10),linear.output=TRUE)
# Compute Predictions off Test Set (predictor columns only; called with the
# namespace so dplyr::compute cannot be picked up instead)
predicted.nn.values <- neuralnet::compute(nn,test[, feats])
# Check out net.result
print(head(predicted.nn.values$net.result))
pred <- predicted.nn.values$net.result
# Compare against the response of THIS hold-out set; `testtarget` belongs to
# the split made in the keras section and does not line up with these rows.
test.pred <- data.frame(pred = pred, score = test[, "cut.score"])
cor(test.pred$pred, test.pred$score)
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6500161/#s8 http://rna.urmc.rochester.edu/RNAstructureDownload.html
–> go back up and adjust to ViennaRNA
# Plot cut score against binned PAM distance: a violin plot with distances
# 1-5 as individual groups, then a box plot with coarser ranges.
library(dplyr)
library(ggplot2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.PAMonly.features.dcast.txt", header=T, sep="\t")
# Distances 1..5 each get their own group; the "5-10" label is reached only
# after those are handled; everything above 10 is ">10". Because numeric and
# string labels are mixed in one ifelse chain, `group` ends up as character.
df.group <- df %>% mutate(group = ifelse(pam.distance0 == 1, 1, ifelse(pam.distance0 == 2, 2, ifelse(pam.distance0 == 3, 3, ifelse(pam.distance0 == 4, 4, ifelse(pam.distance0 == 5, 5, ifelse(pam.distance0 <= 10, "5-10", ifelse(pam.distance0 > 10, ">10", "NA"))))))))
pdf("pam.distance.vioilin.pdf")
ggplot(df.group, aes(x=group, y=cut.score, fill=group)) + geom_violin() + theme_classic()
dev.off()
# Coarser bins; distances above 50 fall through to the literal string "NA".
# NOTE(review): `group` is a plain character column, so ggplot orders the
# groups alphabetically rather than by distance.
df.group <- df %>% mutate(group = ifelse(pam.distance0 <= 5, "1-5", ifelse(pam.distance0 <= 10, "5-10", ifelse(pam.distance0 <= 20, "10-20", ifelse(pam.distance0 <= 50, "20-50", "NA")))))
pdf("pam.distance.boxplot.pdf")
ggplot(df.group, aes(x=group, y=cut.score, fill=group)) + geom_boxplot() + theme_classic()
dev.off()
# Exploratory plots of individual raw features against the cut score, run on
# the local copy of the data (the cluster working directory is commented out).
# NOTE(review): feature columns are picked by position throughout; the
# positions are only valid for this exact version of the input table.
library(dplyr)
library(ggplot2)
library(reshape2)
#setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/")
df <- read.delim("ecoli.20sliding.features.counts.dcast.txt", header=T, sep="\t")
# --- several raw features at once, one facet per feature ---
df.raw <- df[,c(1,2,1522:1524,1530,1533)]
df.raw.melt <- melt(df.raw, id=c("sgRNAID", "cut.score.x"))
pdf("raw.features.score.plot.pdf")
ggplot(df.raw.melt, aes(x=cut.score.x, y=value, color=variable)) + geom_point() + theme_classic() + facet_grid(variable ~ ., scales="free")
dev.off()
# --- HOMO-LUMO energy gap at position 20 ---
homolumo <- df[,c(1,2,879)]
ggplot(homolumo, aes(x=cut.score.x, y=p20homo_lumo_energygapraw)) + geom_point() + theme_classic()
# --- melting temperature, with a linear fit of score on temperature ---
temp <- df[,c(1,2,1524)]
ggplot(temp, aes(x=cut.score.x, y=sgRNA.tempsgRNA.raw)) + geom_point() + theme_classic()
temp.lm <- lm(cut.score.x ~ sgRNA.tempsgRNA.raw, temp)
ggplot(temp, aes(x = sgRNA.tempsgRNA.raw, y = cut.score.x)) +
geom_point() +
geom_abline(slope = coef(temp.lm)[[2]], intercept = coef(temp.lm)[[1]]) +
theme_classic()
# same plot with jittered points and axis labels
ggplot(temp, aes(x = sgRNA.tempsgRNA.raw, y = cut.score.x)) +
geom_jitter() +
geom_abline(slope = coef(temp.lm)[[2]], intercept = coef(temp.lm)[[1]]) +
theme_classic() +
xlab("sgRNA Temperature of Melting (Tm)") +
ylab("Cutting Efficiency Score")
# --- PAM distance, with a linear fit of score on distance ---
pamdistance <- df[,c(1,2,1533)]
ggplot(pamdistance, aes(x=cut.score.x, y=pam.distance0)) + geom_point() + theme_classic()
pamdistance.lm <- lm(cut.score.x ~ pam.distance0, pamdistance)
ggplot(pamdistance, aes(x = pam.distance0, y = cut.score.x)) +
geom_point() +
geom_abline(slope = coef(pamdistance.lm)[[2]], intercept = coef(pamdistance.lm)[[1]])
# --- PAM sequence indicators: score distribution per PAM ---
pamseq <- df[,c(1,2,1531,1532,1534,1535)]
colnames(pamseq) <- c("sgRNAID", "cut.score.x", "AGG", "CGG", "GGG", "TGG")
pamseq.melt <- melt(pamseq, id=c("sgRNAID", "cut.score.x"))
# keep only the indicator that is set for each sgRNA
pamseq.df <- subset(pamseq.melt, pamseq.melt$value == 1)
ggplot(pamseq.df, aes(x=variable, y=cut.score.x, fill=variable)) + geom_boxplot() + theme_classic()
# --- single-nucleotide counts: score by most frequent nucleotide ---
seq <- df[,c(1,2,6,11,16,1528)]
colnames(seq) <- c("sgRNAID", "cut.score.x", "A", "C", "G", "T")
seq.melt <- melt(seq, id=c("sgRNAID", "cut.score.x"))
#ggplot(seq.melt, aes(x=variable, y=cut.score.x, fill=variable)) + geom_boxplot() + theme_classic()
# count divided by 20 (presumably the sgRNA length; TODO confirm)
seq.melt$proportion <- seq.melt$value / 20
seq.group <- seq.melt %>% group_by(sgRNAID) %>% mutate(major.nucleotide = max(proportion))
# rows whose proportion equals the per-sgRNA maximum; a tie keeps several rows
seq.group.uniq <- subset(seq.group, seq.group$major.nucleotide == seq.group$proportion)
ggplot(seq.group.uniq, aes(x=variable, y=cut.score.x, fill=variable)) + geom_boxplot() + theme_classic() + xlab("Majority Nucleotide in sgRNA sequence") + ylab("Cutting Efficiency Score")
# --- dinucleotide columns: score for rows where the value equals 1 ---
dep.seq <- df[,c(1:5,7:10,12:15,17,1525:1527,1529)]
colnames(dep.seq) <- c("sgRNAID", "cut.score.x", "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", "GA", "GC", "GG", "GT", "TA", "TC", "TG", "TT")
dep.seq.melt <- melt(dep.seq, id=c("sgRNAID", "cut.score.x"))
dep.seq.df <- subset(dep.seq.melt, dep.seq.melt$value == 1)
ggplot(dep.seq.df, aes(x=variable, y=cut.score.x, fill=variable)) + geom_boxplot() + theme_classic()
## temp by high/low score
temp <- df[,c(1,2,1524)]
summary(temp$cut.score.x)
summary(temp$sgRNA.tempsgRNA.raw)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 39.48 49.73 51.78 52.11 55.88 66.13
temp.lm <- lm(cut.score.x ~ sgRNA.tempsgRNA.raw, temp)
# quartile cut-offs are the 1st Qu. / Median / 3rd Qu. values printed above
temp.class <- temp %>% mutate(quartile = ifelse(sgRNA.tempsgRNA.raw <= 49.73, "Q1", ifelse(sgRNA.tempsgRNA.raw <= 51.78, "Q2", ifelse(sgRNA.tempsgRNA.raw <= 55.88, "Q3", "Q4"))))
ggplot(temp.class, aes(x = quartile, y = cut.score.x, fill=quartile)) +
geom_boxplot() +
theme_classic() +
xlab("Tm Quartile") +
ylab("Cutting Efficiency Score")
# Split a leading "chrom:start-end" identifier into three tab-separated
# columns and pass the rest of each line through unchanged. Lines that do not
# start with such an identifier are printed as they are.
# Reads stdin, writes stdout.
split_region_id() {
  awk '{
    if (match($0, /^[^: \t]+:[0-9]+-[0-9]+/)) {
      id = substr($0, 1, RLENGTH)
      rest = substr($0, RLENGTH + 1)
      colon = index(id, ":")
      span = substr(id, colon + 1)
      dash = index(span, "-")
      $0 = substr(id, 1, colon - 1) "\t" substr(span, 1, dash - 1) "\t" substr(span, dash + 1) rest
    }
    print
  }'
}

# sgRNA coordinates: first four columns, header row removed
cut -f 1-4 sgRNA.coord.txt | sed '1d' > sgRNA.coord.bed
# GC (column 9) and melting temperature (column 10) per window; the column
# numbers refer to the layout after ':' and '-' have been turned into tabs
tr -s ':' '\t' < Nucleotide.20bp.sliding.windows.count.bed | tr -s '-' '\t' | cut -f 1-3,9 | sed '1d' > GC.20bp.sliding.windows.count.bed
tr -s ':' '\t' < Nucleotide.20bp.sliding.windows.count.bed | tr -s '-' '\t' | cut -f 1-3,10 | sed '1d' > Temp.20bp.sliding.windows.count.bed
# ViennaRNA values: only the window identifier is split. Translating every
# '-' to a tab (as done above) would also strip the minus sign from negative
# values. NOTE(review): assumes the identifier is the first field of each
# line - confirm against the input file.
split_region_id < ecoli.20sliding.ViennaRNA.output.value.id.txt > ecoli.20sliding.ViennaRNA.output.value.id.bed
# Genome-browser style figure (Gviz): gene, sgRNA and PAM annotation tracks
# plus line tracks of per-window values, drawn for positions 39000-40000.
library(data.table)
library(Gviz)
options(ucscChromosomeNames=FALSE)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/")

# --- annotation inputs ---
genes <- rtracklayer::import("GCF_000005845.2_ASM584v2_genomic.gene.gff", format="gff")
pamsites <- rtracklayer::import("modwt/NGG.PAM.sorted.bed", format="bed", extraCols=c("PAM"="character"))
sgRNA <- rtracklayer::import("modwt/sgRNA.coord.bed", format="bed", extraCols=c("sgRNA"="character"))

# --- per-window value tables (Chr, Start, End + one value column) ---
read.windows <- function(file, value.col) {
  data.table::fread(file, col.names = c("Chr","Start","End",value.col))
}
viennarna <- read.windows("modwt/ecoli.20sliding.ViennaRNA.output.value.id.bed", "vienna.rna")
pamcounts <- read.windows("modwt/NGG.PAM.20bp.sliding.windows.count.bed", "pam.count")
genecounts <- read.windows("modwt/Gene.20bp.sliding.windows.count.bed", "gene.count")
gatccounts <- read.windows("modwt/GATC.20bp.sliding.windows.count.bed", "gatc.count")
gccounts <- read.windows("modwt/GC.20bp.sliding.windows.count.bed", "gc.count")
tempcounts <- read.windows("modwt/Temp.20bp.sliding.windows.count.bed", "temp.count")

# rows/ranges of each input that are turned into tracks
start = 1
end = 50000
rows <- start:end

# --- annotation tracks ---
gtrack <- GenomeAxisTrack(range = pamsites[rows], littleTicks=TRUE)
sgrnatrack <- AnnotationTrack(range=sgRNA[rows], name = "sgRNA", genome = "eschColi_K12", stacking = "dense")
pamtrack <- AnnotationTrack(range=pamsites[rows], name = "NGG", genome = "eschColi_K12", stacking = "dense")
genetrack <- AnnotationTrack(range=genes, name = "GENE", genome = "eschColi_K12", stacking = "squish")

# Line track of one value column; each value is placed at its window's Start
# (Start is used for both ends of the range).
line.track <- function(tbl, value.col, label) {
  DataTrack(data = tbl[[value.col]][rows], name = label,
            genome = "eschColi_K12", chromosome = "NC_000913.3",
            start = tbl$Start[rows], end = tbl$Start[rows],
            type = "l")
}
pamcounttrack <- line.track(pamcounts, "pam.count", "PAM")
genecounttrack <- line.track(genecounts, "gene.count", "GeneDensity")
gatccounttrack <- line.track(gatccounts, "gatc.count", "GATCdomains")
gccounttrack <- line.track(gccounts, "gc.count", "GCcontent")
tempcounttrack <- line.track(tempcounts, "temp.count", "MeltingTemp")
viennatrack <- line.track(viennarna, "vienna.rna", "ViennaRNA MFE")

plotTracks(list(gtrack,genetrack,sgrnatrack,pamcounttrack,gatccounttrack,tempcounttrack,viennatrack), from = 39000, to = 40000)
https://davetang.org/muse/2013/10/01/position-weight-matrix/
# Position-weight-matrix score for a nucleotide count.
#   freq  : observed count(s) of a nucleotide at a position
#   total : number of sequences the counts were taken from
#   bg    : background probability of the nucleotide (default 0.25)
# A pseudocount of sqrt(total)/4 per nucleotide is added before the count is
# turned into a probability; the result is the log2 ratio of that probability
# to the background.
pwm <- function(freq, total, bg=0.25){
  pseudocount <- sqrt(total) * 1/4
  prob <- (freq + pseudocount) / (total + (4 * pseudocount))
  log2(prob/bg)
}
#define the frequencies of nucleotides
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
onehot <- read.delim("Ecoli.allCas9_dep1.txt", header=T, sep=" ")
onehot.mat <- as.matrix(onehot[,2:ncol(onehot)])
onehot.sum <- colSums(onehot.mat)
a <- c(31883, 32088, 30158, 32206, 32416, 30152, 32156, 33593, 30483, 30820, 32393, 29847, 30856, 33112, 31109, 32242, 32368, 29878, 33717, 30940)
c <- c(34215, 33046, 35693, 34166, 32584, 34951, 35247, 33308, 36701, 36223, 32959, 35368, 34784, 31366, 34614, 35441, 32924, 34505, 31282, 41990)
t <- c(28683, 30734, 30439, 29013, 31542, 30945, 29728, 28382, 29613, 29007, 30972, 31733, 30712, 32263, 30890, 29101, 31999, 34370, 30872, 24968)
g <- c(31401, 30314, 29892, 30797, 29640, 30134, 29051, 30899, 29385, 30132, 29858, 29234, 29830, 29441, 29569, 29398, 28891, 27429, 30311, 28284)
m <- matrix(data=c(a,c,g,t),nrow=4,byrow=T,dimnames=list(c('a','c','g','t')))
mm <- pwm(m,20)
mm
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("seqLogo")
library(seqLogo)
df <- data.frame(a,c,g,t)
# Scale a numeric vector so its elements sum to 1 (counts -> proportions).
proportion <- function(x){
  x / sum(x)
}
# Build the PWM: each row of df (one position) is turned into base
# proportions, giving a 4 x positions matrix for makePWM().
mef2 <- apply(df, 1, proportion)
mef2 <- makePWM(mef2)
# Logo scaled by information content, then by plain probability.
seqLogo(mef2)
seqLogo(mef2, ic.scale = FALSE)
#### High vs. low cut-score sgRNAs: split on fixed thresholds (>= 32 and <= 17)
#### chosen after inspecting summary() below.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# sed 's/_Cas9//g' Ecoli.allCas9_dep1.txt > Ecoli.allCas9_dep1_sgRNAID.txt
onehot <- read.delim("Ecoli.allCas9_dep1_sgRNAID.txt", header = T, sep = " ")
score <- read.table("sgRNA.coord.txt", header = T, sep = "\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# keep only the sgRNA id (column 4) and its cut score (column 8)
score.df <- score[, c(4, 8)]
summary(score.df$cut.score)
score.high <- subset(score.df, cut.score >= 32)
score.low <- subset(score.df, cut.score <= 17)
# one-hot rows belonging to each group
onehot.high <- subset(onehot, sgRNAID %in% score.high$sgRNA)
onehot.low <- subset(onehot, sgRNAID %in% score.low$sgRNA)
# drop the id column, leaving the numeric one-hot columns
onehot.high.mat <- as.matrix(onehot.high[, 2:ncol(onehot.high)])
onehot.low.mat <- as.matrix(onehot.low[, 2:ncol(onehot.low)])
onehot.high.sum <- data.frame(colSums(onehot.high.mat))
# p1.A p1.C p1.T p1.G p2.A p2.C p2.T p2.G p3.A p3.C p3.T p3.G p4.A
# 3003 2949 2711 2798 2881 2943 2940 2697 2734 3206 2800 2721 2991
# p4.C p4.T p4.G p5.A p5.C p5.T p5.G p6.A p6.C p6.T p6.G p7.A p7.C
# 3033 2656 2781 2997 2821 2992 2651 2749 3142 2788 2782 3044 3134
# p7.T p7.G p8.A p8.C p8.T p8.G p9.A p9.C p9.T p9.G p10.A p10.C p10.T
# 2649 2634 3332 2933 2521 2675 2936 3276 2703 2546 2796 3399 2651
# p10.G p11.A p11.C p11.T p11.G p12.A p12.C p12.T p12.G p13.A p13.C p13.T p13.G
# 2615 3154 2723 2981 2603 2688 3048 2957 2768 2903 2846 2944 2768
# p14.A p14.C p14.T p14.G p15.A p15.C p15.T p15.G p16.A p16.C p16.T p16.G p17.A
# 2996 2408 3333 2724 2766 2699 3036 2960 3106 2986 2789 2580 2799
# p17.C p17.T p17.G p18.A p18.C p18.T p18.G p19.A p19.C p19.T p19.G p20.A p20.C
# 3352 2821 2489 2926 2826 3901 1808 2690 2218 3287 3266 3486 2841
# p20.T p20.G
# 3450 1684
# Column totals of the one-hot matrix for the low-score group, stored as a
# one-column data frame (one row per one-hot column).
onehot.low.sum <- data.frame(colSums(onehot.low.mat))
# p1.A p1.C p1.T p1.G p2.A p2.C p2.T p2.G p3.A p3.C p3.T p3.G p4.A
# 2477 2834 2253 2373 2541 2632 2392 2372 2441 2821 2287 2388 2438
# p4.C p4.T p4.G p5.A p5.C p5.T p5.G p6.A p6.C p6.T p6.G p7.A p7.C
# 2816 2208 2475 2515 2666 2368 2388 2333 2821 2379 2404 2445 2778
# p7.T p7.G p8.A p8.C p8.T p8.G p9.A p9.C p9.T p9.G p10.A p10.C p10.T
# 2400 2314 2374 2734 2299 2530 2262 2927 2253 2495 2459 2683 2146
# p10.G p11.A p11.C p11.T p11.G p12.A p12.C p12.T p12.G p13.A p13.C p13.T p13.G
# 2649 2282 2916 2230 2509 2391 2938 2318 2290 2327 3035 2160 2415
# p14.A p14.C p14.T p14.G p15.A p15.C p15.T p15.G p16.A p16.C p16.T p16.G p17.A
# 2627 2879 2045 2386 2532 3296 2069 2040 2434 3061 2034 2408 2721
# p17.C p17.T p17.G p18.A p18.C p18.T p18.G p19.A p19.C p19.T p19.G p20.A p20.C
# 2210 2515 2491 2088 3064 1913 2872 3081 3015 1858 1983 1841 3921
# p20.T p20.G
# 1025 3150
## high score
# Per-position base counts for the HIGH-score group, taken straight from
# onehot.high.sum. The one-hot columns are ordered p1.A p1.C p1.T p1.G p2.A ...,
# so each base is every 4th row starting at its own offset.
# (The earlier version indexed onehot.low.sum here and then overwrote the
# result with hand-copied numbers whose position-14 C/T/G entries were shifted
# by one column: 3333/2724/2766 instead of 2408/3333/2724.)
pos.first <- seq(1, 77, by = 4)
a <- onehot.high.sum[pos.first, ]
c <- onehot.high.sum[pos.first + 1, ]
t <- onehot.high.sum[pos.first + 2, ]
g <- onehot.high.sum[pos.first + 3, ]
df <- data.frame(a,c,g,t)
# rows of df are positions; proportion() turns each into base frequencies
mef2 <- apply(df, 1, proportion)
mef2 <- makePWM(mef2)
seqLogo(mef2)
seqLogo(mef2, ic.scale=FALSE)
## low score
# Per-position base counts for the LOW-score group, taken straight from
# onehot.low.sum. The one-hot columns are ordered p1.A p1.C p1.T p1.G p2.A ...,
# so each base is every 4th row starting at its own offset.
# (The earlier version indexed onehot.high.sum here and then overwrote the
# result with hand-copied numbers whose position-14 C/T/G entries were shifted
# by one column: 2045/2386/2532 instead of 2879/2045/2386.)
pos.first <- seq(1, 77, by = 4)
a <- onehot.low.sum[pos.first, ]
c <- onehot.low.sum[pos.first + 1, ]
t <- onehot.low.sum[pos.first + 2, ]
g <- onehot.low.sum[pos.first + 3, ]
df <- data.frame(a,c,g,t)
# rows of df are positions; proportion() turns each into base frequencies
mef2 <- apply(df, 1, proportion)
mef2 <- makePWM(mef2)
seqLogo(mef2)
seqLogo(mef2, ic.scale=FALSE)
### look at more complicated dependent 2
#define the frequencies of nucleotides
# Column totals of the dinucleotide ("dep2") one-hot table; the vectors below
# are hand-entered, one per dinucleotide, with five values each.
# NOTE(review): presumably these are positions p1-p5 copied from onehot.sum
# (the dump that follows starts at p5.GC, whose value matches gc[5]) - confirm.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
onehot <- read.delim("Ecoli.allCas9_dep2.txt", header=T, sep=" ")
onehot.mat <- as.matrix(onehot[,2:ncol(onehot)])
onehot.sum <- colSums(onehot.mat)
aa <- c(8695, 8323, 8245, 9130, 8925)
ac <- c(7431, 7487, 7147, 7534, 7594)
at <- c(8601, 8726, 7744, 8718, 8468)
ag <- c(7156, 7552, 7022, 6824, 7429)
ca <- c(11041, 9718, 10788, 10682, 9439)
cc <- c(7382, 8707, 8866, 7843, 8256)
ct <- c(6463, 5769, 6129, 6242, 5683)
cg <- c(9329, 8852, 9910, 9399, 9206)
ta <- c(5256, 5455, 6008, 5348, 5189)
tc <- c(7430, 8698, 8127, 6902, 8911)
tt <- c(8587, 8910, 8555, 9476, 9789)
tg <- c(7410, 7671, 7749, 7287, 7653)
ga <- c(7096, 6662, 7165, 7256, 6599)
gc <- c(10803, 10801, 10026, 10305, 10190)
gt <- c(7083, 7034, 6585, 7106, 7005)
gg <- c(6419, 5817, 6116, 6130, 5846)
# p5.GC p5.GT p5.GG p6.AA p6.AC p6.AT p6.AG p6.CA p6.CC p6.CT p6.CG
# 10190 7005 5846 8711 7008 7725 6708 10547 8862 6271 9271
# p6.TA p6.TC p6.TT p6.TG p6.GA p6.GC p6.GT p6.GG p7.AA p7.AC p7.AT
# 5809 8305 9138 7693 7089 11072 6594 5379 9015 7624 8479
# p7.AG p7.CA p7.CC p7.CT p7.CG p7.TA p7.TC p7.TT p7.TG p7.GA p7.GC
# 7038 11554 7792 6214 9687 5687 7245 8993 7803 7337 10647
# p7.GT p7.GG p8.AA p8.AC p8.AT p8.AG p8.CA p8.CC p8.CT p8.CG p8.TA
# 4696 6371 8870 8263 9099 7361 9811 8761 5892 8844 4999
# p8.TC p8.TT p8.TG p8.GA p8.GC p8.GT p8.GG p9.AA p9.AC p9.AT p9.AG
# 8717 7358 7308 6803 10960 7264 5872 8418 7357 7656 7052
# p9.CA p9.CC p9.CT p9.CG p9.TA p9.TC p9.TT p9.TG p9.GA p9.GC p9.GT
# 10666 9701 6313 10021 5201 8370 8687 7355 6535 10795 6351
# p9.GG p10.AA p10.AC p10.AT p10.AG p10.CA p10.CC p10.CT p10.CG p10.TA p10.TC
# 5704 8637 7616 8228 6339 11480 7910 6592 10241 5259 7072
# p10.TT p10.TG p10.GA p10.GC p10.GT p10.GG p11.AA p11.AC p11.AT p11.AG p11.CA
# 9393 7283 7017 10361 6759 5995 8619 7753 8800 7221 9551
# p11.CC p11.CT p11.CG p11.TA p11.TC p11.TT p11.TG p11.GA p11.GC p11.GT p11.GG
# 8670 6014 8724 5065 8601 9627 7679 6612 10344 7292 5610
# p12.AA p12.AC p12.AT p12.AG p12.CA p12.CC p12.CT p12.CG p12.TA p12.TC p12.TT
# 8262 6920 7800 6865 9785 9301 6456 9826 5893 8699 9461
# p12.TG p12.GA p12.GC p12.GT p12.GG p13.AA p13.AC p13.AT p13.AG p13.CA p13.CC
# 7680 6916 9864 6995 5459 9030 6813 8727 6286 11293 7504
# p13.CT p13.CG p13.TA p13.TC p13.TT p13.TG p13.GA p13.GC p13.GT p13.GG p14.AA
# 6250 9737 5779 7261 9912 7760 7010 9788 7374 5658 9105
# p14.AC p14.AT p14.AG p14.CA p14.CC p14.CT p14.CG p14.TA p14.TC p14.TT p14.TG
# 7895 8638 7474 9456 7944 5488 8478 5835 8852 9553 8023
# p14.GA p14.GC p14.GT p14.GG p15.AA p15.AC p15.AT p15.AG p15.CA p15.CC p15.CT
# 6713 9923 7211 5594 8681 7751 7913 6764 10488 8663 5949
# p15.CG p15.TA p15.TC p15.TT p15.TG p15.GA p15.GC p15.GT p15.GG p16.AA p16.AC
# 9514 6121 8434 8434 7901 6952 10593 6805 5219 9316 8046
# p16.AT p16.AG p16.CA p16.CC p16.CT p16.CG p16.TA p16.TC p16.TT p16.TG p16.GA
# 8296 6584 10788 8364 6759 9530 5469 6810 9535 7287 6795
# p16.GC p16.GT p16.GG p17.AA p17.AC p17.AT p17.AG p17.CA p17.CC p17.CT p17.CG
# 9704 7409 5490 8759 7703 9271 6635 9375 8368 6571 8610
# p17.TA p17.TC p17.TT p17.TG p17.GA p17.GC p17.GT p17.GG p18.AA p18.AC p18.AT
# 5433 8430 10646 7490 6311 10004 7882 4694 8283 7015 7639
# p18.AG p18.CA p18.CC p18.CT p18.CG p18.TA p18.TC p18.TT p18.TG p18.GA p18.GC
# 6941 11165 6791 6472 10077 7570 8345 9936 8519 6699 9131
# p18.GT p18.GG p19.AA p19.AC p19.AT p19.AG p19.CA p19.CC p19.CT p19.CG p19.TA
# 6825 4774 9608 9513 7774 6822 8476 11028 3198 8580 5299
# p19.TC p19.TT p19.TG p19.GA p19.GC p19.GT p19.GG
# 10393 7234 7946 7557 11056 6762 4936
# Tile the genome into 1 bp windows with a 1 bp step.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
bedtools makewindows \
  -g ecoli.sizes.genome \
  -w 1 \
  -s 1 \
  > ecoli.1bp.sliding.bed
# --> density
# Environment for the bedtools steps below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
## genes: one output row per (window, overlapping gene) pair
bedtools intersect -wo \
  -a ecoli.1bp.sliding.bed \
  -b genome/GCF_000005845.2_ASM584v2_genomic.gene.gff \
  > ecoli.gene.1sliding.bed
## GC content: per-window nucleotide stats, header line removed
bedtools nuc \
  -fi genome/GCF_000005845.2_ASM584v2_genomic.fna \
  -bed ecoli.1bp.sliding.bed \
  | sed '1d' \
  > ecoli.GC.1sliding.bed
# --> melting temp: https://biopython.org/docs/1.75/api/Bio.SeqUtils.MeltingTemp.html
# Load the python module, then the conda setup script and the "test"
# environment used for the Biopython steps that follow.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Reference signature (Biopython):
# Bio.SeqUtils.MeltingTemp.Tm_NN(seq, check=True, strict=True, c_seq=None, shift=0, nn_table=None, tmm_table=None, imm_table=None, de_table=None, dnac1=25, dnac2=25, selfcomp=False, Na=50, K=0, Tris=0, Mg=0, dNTPs=0, saltcorr=5)
# https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/fasta_n
# summit: # conda install -c conda-forge biopython
### sliding windows (heading originally said 20bp; the BED used here is the
### 1 bp window file ecoli.1bp.sliding.bed)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Extract one FASTA record per window.
bedtools getfasta -fi genome/GCF_000005845.2_ASM584v2_genomic.fna -bed ecoli.1bp.sliding.bed -fo ecoli.1sliding.fa
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Start an interactive Python session; the lines that follow are typed into it.
python3
# Count A/C/G/T per FASTA record and write one TSV row per record:
# name, A, C, G, T, length, (C+G)/length.
# Files are opened with a context manager so they are closed even if parsing
# fails; the loop body is indented so the snippet is valid Python.
from Bio import SeqIO
with open('ecoli.1sliding.fa', 'r') as input_file, \
        open('nucleotide_counts_1sliding.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        A_count = cur_record.seq.count('A')
        C_count = cur_record.seq.count('C')
        G_count = cur_record.seq.count('G')
        T_count = cur_record.seq.count('T')
        length = len(cur_record.seq)
        # an empty record would otherwise raise ZeroDivisionError
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)

exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
# Applied row-wise to the per-window nucleotide counts; the result is written
# as an extra MeltingTemp column.
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("nucleotide_counts_1sliding.tsv", header = T, sep = "\t")
df.melt <- mutate(df, MeltingTemp = 64.9 + 41 * (G+C-16.4) / (A+T+G+C))
write.table(df.melt, file = "nucleotide_counts_1sliding_temp.txt", quote = F, row.names = F, sep = "\t")
q()
# --> structure: https://academic.oup.com/bioinformatics/article/34/14/2499/4924718 https://github.com/Superzchen/iFeature/ https://github.com/feliixx/gotranseq
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
#git clone https://github.com/Superzchen/iFeature
# Run iFeature with descriptor type AAC on the per-window FASTA.
# NOTE(review): the input is the nucleotide FASTA from bedtools getfasta -
# confirm AAC is the intended descriptor for nucleotide sequences.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py \
  --file ecoli.1sliding.fa \
  --type AAC \
  --out 1sliding.protein.structure.fa
# --> rnaseq
# Attach genomic coordinates from the GFF to the per-gene RNA-seq table,
# matching on the 7-character id that follows "EcoGene:" in GFF column 9.
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/genome")
# sed '1d' GCF_000005845.2_ASM584v2_genomic.gff | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' | sed '1d' > GCF_000005845.2_ASM584v2_genomic.txt
annotation <- read.delim("GCF_000005845.2_ASM584v2_genomic.txt", header = F, sep = "\t")
gene <- subset(annotation, V3 == "gene")
gene.id <- gene %>% separate(V9, c("id1", "id2"), sep = "EcoGene:")
gene.id$gene_id <- substr(gene.id$id2, 1, 7)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
rna <- read.delim("GSM2267479_Sample-1.genes.results.txt", header = T, sep = "\t")
rna.id <- rna %>% left_join(gene.id, by = "gene_id")
# columns 8, 11, 12 come from the GFF side of the join; 1 and 3:7 from rna
rna.id.idf <- na.omit(rna.id[, c(8, 11, 12, 1, 3:7)])
write.table(rna.id.idf, file = "GSM2267479.fpkm.coord.txt", quote = F, row.names = F, sep = "\t")
# calculate density
# NOTE(review): this reads GSM2267479.fpkm.coord.bed, but the R step above
# writes GSM2267479.fpkm.coord.txt (with a header row). The conversion from
# .txt to .bed is not shown in this file - confirm how the .bed was produced.
bedtools intersect -wo -a ecoli.1bp.sliding.bed -b GSM2267479.fpkm.coord.bed > ecoli.rnaseq.1sliding.bed
# Average column V12 over all rows sharing a window (V1, V2, V3) and keep one
# row per window.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.rnaseq.1sliding.bed", header = F, sep = "\t")
window.df <- mutate(group_by(window, V1, V2, V3), avg.fpkm = mean(V12))
# columns 1:3 are the window coordinates, 14 is the avg.fpkm column added above
window.uniq <- unique(window.df[, c(1:3, 14)])
write.table(window.uniq, file = "ecoli.rnaseq.average.1sliding.bed", quote = F, row.names = F, sep = "\t")
# --> GATC motif
# Environment for the motif search.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
#conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/envs/summit-test
## GATC motif
## try fastaregex
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/
#wget https://github.com/dariober/bioinformatics-cafe/blob/master/fastaRegexFinder/fastaRegexFinder.py?raw=true -O fastaRegexFinder.py
#chmod a+x fastaRegexFinder.py
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Search the genome FASTA for the regex 'GATC'; hits go to ecoli.gatc.bed.
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/fastaRegexFinder.py -q -f genome/GCF_000005845.2_ASM584v2_genomic.fna -r 'GATC' > ecoli.gatc.bed
# NOTE(review): the intersect reads ecoli.gatc.coord.bed, not the
# ecoli.gatc.bed written on the previous line; the step that derives
# the .coord.bed file is not shown - confirm.
bedtools intersect -wo -a ecoli.1bp.sliding.bed -b ecoli.gatc.coord.bed > ecoli.gatc.1sliding.bed
# --> IPD ratios
# Turn the IPD GFF into a 4-column BED-like table:
# chr, V4, V5, and the value after "=" in the third ";" field of V9.
library(dplyr)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("GSM3264688_Ecoli.gff", header = F, sep = "\t")
# skip the first four rows of the file
df2 <- df[5:nrow(df), ]
df.sep <- separate(df2, V9, c("coverage", "context", "IPD"), sep = ";")
df.ipd <- separate(df.sep, IPD, c("IPD", "IPD.value"), sep = "=")
df.ipd$chr <- "NC_000913.3"
# 13 = chr, 4/5 = V4/V5, 12 = IPD.value
df.coord <- df.ipd[, c(13, 4, 5, 12)]
write.table(df.coord, file = "GSM3264688_Ecoli.coord.bed", quote = F, row.names = F, col.names = F, sep = "\t")
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Pair every 1 bp window with the IPD records it overlaps (-wo appends the
# overlap length as the last column).
bedtools intersect -wo -a ecoli.1bp.sliding.bed -b GSM3264688_Ecoli.coord.bed > ecoli.ipd.1sliding.bed
# Average column V7 per window (V1, V2, V3). The new column is named avg.fpkm
# although the file being averaged here is the IPD intersect output.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
window <- read.delim("ecoli.ipd.1sliding.bed", header=F, sep="\t")
window.df <- window %>% group_by(V1, V2, V3) %>% mutate(avg.fpkm = mean(V7))
# First write: every row, with the per-window average attached.
write.table(window.df, "ecoli.ipd.average.1sliding.bed", quote=F, row.names=F, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Read the same file back, reduce to one row per window (columns 1:3 plus
# column 9, the average), and overwrite it.
window.df <- read.delim("ecoli.ipd.average.1sliding.bed", header=T, sep="\t")
window.uniq <- unique(window.df[,c(1:3,9)])
write.table(window.uniq, "ecoli.ipd.average.1sliding.bed", quote=F, row.names=F, sep="\t")
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/dwt.1bp.sliding.sh
# Interactive alternative to the batch script above: load conda and the
# "test" environment before starting R.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# salloc -A SYB105 -N 2 -p gpu -t 4:00:00
library(dplyr)
library(reshape2)
library(tidyr)
library(wmtsa)
library(data.table)
# Load every per-window feature table plus the sgRNA score table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gatc <- read.table("ecoli.gatc.1sliding.bed", header=F, sep="\t", stringsAsFactors = F)
ipd <- read.table("ecoli.ipd.average.1sliding.bed", header=T, sep="\t", stringsAsFactors = F)
gene <- read.table("ecoli.gene.1sliding.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.table("1sliding.protein.structure.fa", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.table("nucleotide_counts_1sliding_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.table("ecoli.rnaseq.average.1sliding.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.table("ecoli.1bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# coordinates, sgRNA id and cut score
score.df <- score[,c(1:4,8)]
norm <- read.table("DataS6.txt", header=T, sep="\t", stringsAsFactors = F)
# Adds column 3 of DataS6 by sgRNA. NOTE(review): score.norm.df is not
# referenced again in the visible part of this file - confirm it is needed.
score.norm.df <- left_join(score.df, norm[,c(1,3)], by="sgRNA")
# Build one value per window for each feature, in window order.
window.v <- window[,1:3]
colnames(window.v) <- c("V1", "V2", "V3")
gatc.win <- left_join(window.v, gatc, by=c("V1", "V2", "V3"))
# Count real overlaps BEFORE NAs are filled: after a left join a window with
# no hit still has one (all-NA) row, so n() reported 1 for both "no hit" and
# "one hit". V4 is the first column contributed by the right-hand table and is
# NA exactly when the window had no match.
gatc.bin <- gatc.win %>% group_by(V1, V2, V3) %>% mutate(gatc.count = sum(!is.na(V4)))
gatc.win[is.na(gatc.win)] <- 0
ipd.win <- left_join(window.v, ipd, by=c("V1", "V2", "V3"))
ipd.win[is.na(ipd.win)] <- 0
gene.win <- left_join(window.v, gene, by=c("V1", "V2", "V3"))
gene.bin <- gene.win %>% group_by(V1, V2, V3) %>% mutate(gene.count = sum(!is.na(V4)))
gene.win[is.na(gene.win)] <- 0
rnaseq.win <- left_join(window.v, rnaseq, by=c("V1", "V2", "V3"))
rnaseq.win[is.na(rnaseq.win)] <- 0
# one row per window; unique() keeps first-occurrence (window) order
gatc.count <- unique(gatc.bin[,c("V1", "V2", "V3", "gatc.count")])
gatc.df <- gatc.count$gatc.count
# column 4 is the averaged value joined from the ipd table (0 where absent)
ipd.df <- ipd.win[,4]
gene.count <- unique(gene.bin[,c("V1", "V2", "V3", "gene.count")])
gene.df <- gene.count$gene.count
structure.df <- structure[,2]
# nuc columns 7 and 8 are the C+G fraction and MeltingTemp columns
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq.win[,4]
# Run DWT instead of CWT
#wavMODWT(x, wavelet="s8", n.levels=ilogb(length(x), base=2),position=list(from=1,by=1,units=character()), units=character(),title.data=character(), documentation=character(), keep.series=FALSE)
# The same four reshaping steps were repeated for every feature; they are
# factored into one helper so the large intermediate objects (transform,
# matrix, labelled data frame) are freed after each call.
#   x          - numeric vector, one value per window
#   value.name - name for the coefficient column (e.g. "temp.dwt")
# Returns a data.table with columns label, <value.name>, scale, window, where
# scale/window are the two pieces of the coefficient row name split on
# non-alphanumeric characters.
modwt_long <- function(x, value.name) {
  coef.mat <- as.matrix(wavMODWT(x))
  coef.dt <- as.data.table(data.frame(label = row.names(coef.mat), coef.mat))
  #coef.dt %>% separate(label, c("name", "number"), sep = "[^[:alnum:]]+")
  coef.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
  colnames(coef.dt) <- c("label", value.name, "scale", "window")
  coef.dt
}
temp.modwt.name <- modwt_long(temp.df, "temp.dwt")
gc.modwt.name <- modwt_long(gc.df, "gc.dwt")
structure.modwt.name <- modwt_long(structure.df, "structure.dwt")
rna.modwt.name <- modwt_long(rna.df, "rna.dwt")
gene.modwt.name <- modwt_long(gene.df, "gene.dwt")
ipd.modwt.name <- modwt_long(ipd.df, "ipd.dwt")
gatc.modwt.name <- modwt_long(gatc.df, "gatc.dwt")
# Map sgRNAs onto windows, attach every feature's MODWT coefficients, and
# reshape to one row per sgRNA with one column per feature x scale.
colnames(window) <- c("chr", "start", "end")
# 0-based window index as character, to match the index parsed from the
# coefficient labels
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
window$start <- as.numeric(window$start)
window$end <- as.numeric(window$end - 1)
library(tidygenomics)
# was `geneome_intersect`, which is not a function in tidygenomics
window.score.df <- genome_intersect(score.df, window, by=c("chr", "start", "end"))
window.score.temp <- left_join(window.score.df, temp.modwt.name[,c(3,4,2)], by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name[,c(3,4,2)], by=c("window", "scale"))
# 1293751
# The score column of score.df is named cut.score (there is no `score`
# column), and missing values are tested with is.na() rather than != "NA".
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, !is.na(cut.score))
# 931340
# Select by name so the result does not depend on the column order returned
# by genome_intersect().
feature.cols <- c("sgRNA", "cut.score", "scale", "temp.dwt", "gc.dwt", "structure.dwt", "rna.dwt", "gene.dwt", "gatc.dwt", "ipd.dwt")
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,feature.cols], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# e.g. variable "temp.dwt" + scale "d1" -> feature.scale "temp.dwtd1"
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40467
write.table(df.dcast.na, "ecoli.1sliding.exact.dwt.dcast.txt", quote=F, row.names=F, sep="\t")
# --> combine to generate full feature matrix
# combine regional DWT with other features
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast.na <- read.delim("ecoli.1sliding.exact.dwt.dcast.txt", header=T, sep="\t", stringsAsFactors = F)
# split the sgRNA id on "_" so it can be matched against the one-hot table
df.dcast.sep <- df.dcast.na %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
# prefix every DWT feature column (4 onward) with "sgRNA_"
df.dcast.dwt <- df.dcast.sep[,c(4:ncol(df.dcast.sep))]
colnames(df.dcast.dwt) <- paste0('sgRNA_', colnames(df.dcast.dwt))
df.dcast <- cbind(df.dcast.sep[,1:3], df.dcast.dwt)
df <- read.delim("Ecoli.allCas9.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep)
# 126182
# column 3 of df.dcast (cut.score) is left out of the join
df.sep.region <- inner_join(df.sep, df.dcast[,c(1,2,4:ncol(df.dcast.sep))], by=c("sgRNA", "ID"))
# was `df.sep.region3`, which is never defined; the joined table is df.sep.region
df.sep.region.id <- df.sep.region %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep.region.id)
#
write.table(df.sep.region.id, "ecoli.1sliding.all.features.dcast.txt", quote=F, row.names=F, sep="\t")
# Summit
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# -p: create missing parents and do not fail when the directory already
# exists (the same directory is created again later in these notes).
mkdir -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp
# Start R; the following lines are typed into the R session.
R
# Write the iRF input files for the dwt1bp run: features and score, with and
# without the sample-ID column.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.1sliding.all.features.dcast.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
# column 1 = sample ID, column 2 = score, columns 3.. = features
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp")
write.table(df.features, "dwt1bp.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt1bp.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.features, "dwt1bp.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt1bp.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- as.matrix(df[,c(3:ncol(df))])
df.score <- as.matrix(data.frame(df[,2]))
colnames(df.score) <- "cut.score"
# Was dwt20bp: the dwt1bp noSampleIDs files belong next to the other dwt1bp
# inputs written above, not in the dwt20bp run directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp")
write.table(df.features, "dwt1bp.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt1bp.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# --> non-correlated features
# salloc -A SYB105 -N 2 -t 4:00:00
# Load the conda "test" environment, then start R for the correlation
# filtering typed in below.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# Compute the feature-feature correlation matrix, list features correlated
# above 0.9, and write a feature table without them.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.1sliding.all.features.dcast.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
df.features <- as.matrix(df[,c(3:ncol(df))])
df.cor <- cor(df.features)
write.table(df.cor, "ecoli.1sliding.all.features.dcast.correlation.txt", quote=F, row.names=F, sep="\t")
######### find highly correlated variables using caret
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.cor <- read.delim("ecoli.1sliding.all.features.dcast.correlation.txt", header=T, sep="\t")
# undefined correlations are treated as 0
df.cor[is.na(df.cor)] <- 0
df.num <- as.matrix(df.cor)
# the file was written without row names; restore them from the header
features <- colnames(df.num)
rownames(df.num) <- features
library(caret)
### remove from iRF run???
df.num.remove <- findCorrelation(df.num, cutoff = .9, exact = TRUE, names = TRUE)
write.table(df.num.remove, "ecoli.1sliding.all.features.highlycorrelated.txt", quote=F, row.names=F, sep="\t")
# Was dwt20bp/: dwt1bp.features.txt is written to the dwt1bp run directory,
# and the noncor outputs belong next to it.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp/")
df <- read.delim("dwt1bp.features.txt", header=T, sep="\t")
# Logical mask instead of -which(): when nothing is flagged, -which(...) is
# an empty index and would drop every column.
df.subset <- df[ , !(names(df) %in% df.num.remove), drop=FALSE]
df.mat <- as.matrix(df.subset[,2:ncol(df.subset)])
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
write.table(df.mat.id, "dwt1bp.noncor.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "dwt1bp.noncor.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "dwt1bp.noncor.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
# -p: this directory was already created when the input files were written;
# plain mkdir would fail here.
mkdir -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp
# Generate the run layout and submit scripts; the last two arguments are the
# feature file and the score file.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt1bp --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp/dwt1bp.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp/dwt1bp.score.txt
# Summit
# Submit the generated job scripts in order: full, train, then test.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp/Submits/submit_full_dwt1bp_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp/Submits/submit_train_dwt1bp_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp/Submits/submit_test_dwt1bp_0.sh
# Andes
# The first cd is superseded by the second one two lines below.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp
# Post-process the run named dwt1bp; YNames.txt is passed as the first
# positional argument.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt1bp
# R2 =
# Show the 10 rows with the largest numeric value in column 3.
sort -k 3,3n cut.score_top95.txt | tail
# pearson correlation
# Correlate observed cut scores with the predictions for one test split
# (fold9 / Set4); cor() defaults to Pearson.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt1bp/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt1bp_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
#
##### Cas9 only
# Keep the header plus every row containing "_Cas9", then derive the
# _overlap copy and the ID-less (first column removed) variant.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
mkdir Cas9
# scores
awk 'NR == 1 || /_Cas9/' dwt20bp.score.txt \
  > dwt20bp.cas9.score.txt
cp dwt20bp.cas9.score.txt dwt20bp.cas9.score_overlap.txt
cut --complement -f 1 dwt20bp.cas9.score.txt \
  > dwt20bp.cas9.score_overlap_noSampleIDs.txt
# features
awk 'NR == 1 || /_Cas9/' dwt20bp.noncor2.features.txt \
  > dwt20bp.noncor2.cas9.features.txt
cp dwt20bp.noncor2.cas9.features.txt dwt20bp.noncor2.cas9.features_overlap.txt
cut --complement -f 1 dwt20bp.noncor2.cas9.features.txt \
  > dwt20bp.noncor2.cas9.features_overlap_noSampleIDs.txt
##### eSpCas9 only
# Keep the header plus every row containing "_eSpCas9", then derive the
# _overlap copy and the ID-less (first column removed) variant.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
mkdir eSpCas9
awk 'NR==1 || /_eSpCas9/' dwt20bp.score.txt > dwt20bp.eSpCas9.score.txt
# Copy the eSpCas9 scores (this previously copied dwt20bp.cas9.score.txt, so
# the _overlap file held Cas9 rows that do not match the eSpCas9 features).
cp dwt20bp.eSpCas9.score.txt dwt20bp.eSpCas9.score_overlap.txt
cut --complement -f 1 dwt20bp.eSpCas9.score.txt > dwt20bp.eSpCas9.score_overlap_noSampleIDs.txt
awk 'NR==1 || /_eSpCas9/' dwt20bp.noncor2.features.txt > dwt20bp.noncor2.eSpCas9.features.txt
cp dwt20bp.noncor2.eSpCas9.features.txt dwt20bp.noncor2.eSpCas9.features_overlap.txt
cut --complement -f 1 dwt20bp.noncor2.eSpCas9.features.txt > dwt20bp.noncor2.eSpCas9.features_overlap_noSampleIDs.txt
##### recACas9 only
# Keep the header plus every row containing "_recACas9", then derive the
# _overlap copy and the ID-less (first column removed) variant.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
mkdir recACas9
awk 'NR==1 || /_recACas9/' dwt20bp.score.txt > dwt20bp.recACas9.score.txt
# Copy the recACas9 scores (this previously copied dwt20bp.cas9.score.txt, so
# the _overlap file held Cas9 rows that do not match the recACas9 features).
cp dwt20bp.recACas9.score.txt dwt20bp.recACas9.score_overlap.txt
cut --complement -f 1 dwt20bp.recACas9.score.txt > dwt20bp.recACas9.score_overlap_noSampleIDs.txt
awk 'NR==1 || /_recACas9/' dwt20bp.noncor2.features.txt > dwt20bp.noncor2.recACas9.features.txt
cp dwt20bp.noncor2.recACas9.features.txt dwt20bp.noncor2.recACas9.features_overlap.txt
cut --complement -f 1 dwt20bp.noncor2.recACas9.features.txt > dwt20bp.noncor2.recACas9.features_overlap_noSampleIDs.txt
# Andes
# Create the run directory for the Cas9-only model and invoke the iRF setup
# script with the Cas9 feature table and the Cas9 score table.
module load python/3.7-anaconda3
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
  --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
  --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.noncor2.cas9 --bypass --Prediction \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.cas9.features.txt \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.cas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/Submits/submit_full_dwt20bp.noncor2.cas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/Submits/submit_train_dwt20bp.noncor2.cas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/Submits/submit_test_dwt20bp.noncor2.cas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor2.cas9
# 0.23064617681729335
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20xz_quadrupoleraw cut.score 0.029686817358311596
# gene.dwtd22.x cut.score 0.029489538370503877
# p20homo_lumo_energygapraw cut.score 0.02729263038651026
# sgRNA.gcsgRNA.raw cut.score 0.02561598838431653
# p18xz_quadrupoleraw cut.score 0.023110545239158516
# p20homo_energyraw cut.score 0.013095906817271597
# p19xy_polarizabilityraw cut.score 0.012551967356934784
# p15.CCsgRNA.raw cut.score 0.012249350565404618
# ipd.dwtd2.x cut.score 0.011976463258548734
# rna.dwtd21.x cut.score 0.011755281398032221
awk '{ sum += $2; n++ } END { if (n > 0) print sum / n; }' cut.score/foldRuns/results/importanceScores.txt
# 0.0011534
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor2.cas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.481121
#!/bin/bash -l
#BSUB -P SYB105
#BSUB -W 04:15
#BSUB -nnodes 50
#BSUB -J cas9.test_0
#BSUB -o cas9.test_0.o%J
#BSUB -e cas9.test_0.e%J
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.model.testing.sh
# Cas9 trained model --> test with eSpCas9 and recACas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
### Cas9/eSpCas9
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.eSpCas9.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.eSpCas9.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/cut.score/foldRuns/fold0/Runs/Set0/dwt20bp.noncor2.cas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix Cas9.eSpCas9 --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/eSpCas9 > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9.eSpCas9_test.o
# Cas9-trained forest applied to the eSpCas9 set: correlate observed cut.score
# with the ranger predictions and save a scatter plot.
library(ggplot2)  # load before opening the PDF device so a failed load cannot leave a device open
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
score <- read.delim("dwt20bp.eSpCas9.score_overlap_noSampleIDs.txt", header=TRUE, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/eSpCas9/")
predict <- read.delim("Cas9.eSpCas9.prediction", header=TRUE, sep="\t")
# cbind() pairs rows purely by position, so both tables must have the same length
stopifnot(nrow(score) == nrow(predict))
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# 0.6578685
pdf("Cas9.eSpCas9.prediction.scatter.pdf")
# print() explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing and yields no page when run via source() or inside a function
print(ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic())
dev.off()
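###### subset to only use 8,093 for test (NOTE: the input has a header line, so head -8093 keeps the header plus the first 8,092 data rows; use head -8094 if 8,093 samples are intended)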
head -8093 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.eSpCas9.features_overlap_noSampleIDs.txt > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.eSpCas9.subset.features_overlap_noSampleIDs.txt
head -8093 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.eSpCas9.score_overlap_noSampleIDs.txt > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.eSpCas9.subset.score_overlap_noSampleIDs.txt
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.eSpCas9.subset.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.eSpCas9.subset.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/cut.score/foldRuns/fold0/Runs/Set0/dwt20bp.noncor2.cas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix Cas9.eSpCas9.subset --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/eSpCas9 > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9.eSpCas9.subset_test.o
# Cas9-trained forest applied to the eSpCas9 subset: correlate observed
# cut.score with the ranger predictions and save a scatter plot.
library(ggplot2)  # load before opening the PDF device so a failed load cannot leave a device open
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
score <- read.delim("dwt20bp.eSpCas9.subset.score_overlap_noSampleIDs.txt", header=TRUE, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/eSpCas9/")
predict <- read.delim("Cas9.eSpCas9.subset.prediction", header=TRUE, sep="\t")
# cbind() pairs rows purely by position, so both tables must have the same length
stopifnot(nrow(score) == nrow(predict))
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# 0.6654885
pdf("Cas9.eSpCas9.subset.prediction.scatter.pdf")
# print() explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing and yields no page when run via source() or inside a function
print(ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic())
dev.off()
### Cas9/recACas9
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.recACas9.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.recACas9.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/cut.score/foldRuns/fold0/Runs/Set0/dwt20bp.noncor2.cas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix Cas9.recACas9 --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/recACas9 > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9.recACas9_test.o
# Cas9-trained forest applied to the recACas9 set: correlate observed cut.score
# with the ranger predictions and save a scatter plot.
library(ggplot2)  # load before opening the PDF device so a failed load cannot leave a device open
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
score <- read.delim("dwt20bp.recACas9.score_overlap_noSampleIDs.txt", header=TRUE, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/recACas9/")
predict <- read.delim("Cas9.recACas9.prediction", header=TRUE, sep="\t")
# cbind() pairs rows purely by position, so both tables must have the same length
stopifnot(nrow(score) == nrow(predict))
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# 0.4530023
pdf("Cas9.recACas9.prediction.scatter.pdf")
# print() explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing and yields no page when run via source() or inside a function
print(ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic())
dev.off()
# Andes
# Create the run directory for the eSpCas9-only model and invoke the iRF setup
# script with the eSpCas9 feature table and the eSpCas9 score table.
module load python/3.7-anaconda3
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
  --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
  --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.noncor2.eSpCas9 --bypass --Prediction \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.eSpCas9.features.txt \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.eSpCas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9/Submits/submit_full_dwt20bp.noncor2.eSpCas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9/Submits/submit_train_dwt20bp.noncor2.eSpCas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9/Submits/submit_test_dwt20bp.noncor2.eSpCas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor2.eSpCas9
# 0.2870554110931435
sort -k3rg topVarEdges/cut.score_top95.txt | head
# gene.dwtd22.x cut.score 0.02986598552640716
# p20xz_quadrupoleraw cut.score 0.028566848258415226
# p20homo_lumo_energygapraw cut.score 0.02370195780517289
# p19xy_polarizabilityraw cut.score 0.023684734279965758
# sgRNA.gcsgRNA.raw cut.score 0.022130870139173624
# p18xz_quadrupoleraw cut.score 0.019481045894411925
# p1.CCsgRNA.raw cut.score 0.01899851523821424
# p19rot_constants_yraw cut.score 0.014287427843400499
# p20homo_energyraw cut.score 0.012891959701092191
# gc.dwtd1.x cut.score 0.011750463013811847
awk '{ sum += $2; n++ } END { if (n > 0) print sum / n; }' cut.score/foldRuns/results/importanceScores.txt
# 0.0011534
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor2.eSpCas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.5526294
#!/bin/bash -l
#BSUB -P SYB105
#BSUB -W 04:15
#BSUB -nnodes 50
#BSUB -J espcas9.test_0
#BSUB -o espcas9.test_0.o%J
#BSUB -e espcas9.test_0.e%J
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/espcas9.model.testing.sh
# eSpCas9 trained model --> test with Cas9 and recACas9
# Work from the eSpCas9 model directory (the one holding the forest passed to
# --predict), as the Cas9 and recACas9 testing scripts do for their own models.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9
### eSpCas9/Cas9
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.cas9.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.cas9.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9/cut.score/foldRuns/fold0/Runs/Set0/dwt20bp.noncor2.eSpCas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix eSpCas9.Cas9 --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9 > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/eSpCas9.Cas9_test.o
# eSpCas9-trained forest applied to the Cas9 set: correlate observed cut.score
# with the ranger predictions and save a scatter plot.
library(ggplot2)  # load before opening the PDF device so a failed load cannot leave a device open
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
score <- read.delim("dwt20bp.cas9.score_overlap_noSampleIDs.txt", header=TRUE, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9/")
predict <- read.delim("eSpCas9.Cas9.prediction", header=TRUE, sep="\t")
# cbind() pairs rows purely by position, so both tables must have the same length
stopifnot(nrow(score) == nrow(predict))
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# 0.620554
pdf("eSpCas9.Cas9.prediction.scatter.pdf")
# print() explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing and yields no page when run via source() or inside a function
print(ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic())
dev.off()
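######## subset to only use 8,093 for test (NOTE: the input has a header line, so head -8093 keeps the header plus the first 8,092 data rows; use head -8094 if 8,093 samples are intended)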
head -8093 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.cas9.features_overlap_noSampleIDs.txt > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.cas9.subset.features_overlap_noSampleIDs.txt
head -8093 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.cas9.score_overlap_noSampleIDs.txt > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.cas9.subset.score_overlap_noSampleIDs.txt
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.cas9.subset.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.cas9.subset.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9/cut.score/foldRuns/fold0/Runs/Set0/dwt20bp.noncor2.eSpCas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix eSpCas9.Cas9.subset --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9 > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/eSpCas9.Cas9.subset_test.o
# eSpCas9-trained forest applied to the Cas9 subset: correlate observed
# cut.score with the ranger predictions and save a scatter plot.
library(ggplot2)  # load before opening the PDF device so a failed load cannot leave a device open
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
score <- read.delim("dwt20bp.cas9.subset.score_overlap_noSampleIDs.txt", header=TRUE, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9/")
predict <- read.delim("eSpCas9.Cas9.subset.prediction", header=TRUE, sep="\t")
# cbind() pairs rows purely by position, so both tables must have the same length
stopifnot(nrow(score) == nrow(predict))
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# 0.6263868
pdf("eSpCas9.Cas9.subset.prediction.scatter.pdf")
# print() explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing and yields no page when run via source() or inside a function
print(ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic())
dev.off()
### eSpCas9/recACas9
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.recACas9.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.recACas9.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.eSpCas9/cut.score/foldRuns/fold0/Runs/Set0/dwt20bp.noncor2.eSpCas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix eSpCas9.recACas9 --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/recACas9 > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/eSpCas9.recACas9_test.o
# eSpCas9-trained forest applied to the recACas9 set: correlate observed
# cut.score with the ranger predictions and save a scatter plot.
library(ggplot2)  # load before opening the PDF device so a failed load cannot leave a device open
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
score <- read.delim("dwt20bp.recACas9.score_overlap_noSampleIDs.txt", header=TRUE, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/recACas9/")
predict <- read.delim("eSpCas9.recACas9.prediction", header=TRUE, sep="\t")
# cbind() pairs rows purely by position, so both tables must have the same length
stopifnot(nrow(score) == nrow(predict))
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# 0.2857642
pdf("eSpCas9.recACas9.prediction.scatter.pdf")
# print() explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing and yields no page when run via source() or inside a function
print(ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic())
dev.off()
# Andes
# Create the run directory for the recACas9-only model and invoke the iRF setup
# script with the recACas9 feature table and the recACas9 score table.
module load python/3.7-anaconda3
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
  --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
  --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.noncor2.recACas9 --bypass --Prediction \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.recACas9.features.txt \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.recACas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9/Submits/submit_full_dwt20bp.noncor2.recACas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9/Submits/submit_train_dwt20bp.noncor2.recACas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9/Submits/submit_test_dwt20bp.noncor2.recACas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor2.recACas9
# 0.14344089890221737
sort -k3rg topVarEdges/cut.score_top95.txt | head
# gene.dwtd22.x cut.score 0.09457496117134985
# gc.dwtd1.x cut.score 0.015831622674623142
# structure.dwtd5.x cut.score 0.014832394734011819
# gc.dwtd5.x cut.score 0.014726523930610037
# ipd.dwtd6.x cut.score 0.014495672361577824
# ipd.dwtd5.x cut.score 0.01425200018497449
# structure.dwtd7.x cut.score 0.014209694867748186
# ipd.dwtd3.x cut.score 0.01416298162674481
# structure.dwtd3.x cut.score 0.014039559760986758
# structure.dwtd4.x cut.score 0.013915304690124499
awk '{ sum += $2; n++ } END { if (n > 0) print sum / n; }' cut.score/foldRuns/results/importanceScores.txt
# 0.0011534
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor2.recACas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.3705067
#!/bin/bash -l
#BSUB -P SYB105
#BSUB -W 04:15
#BSUB -nnodes 50
#BSUB -J recacas9.test_0
#BSUB -o recacas9.test_0.o%J
#BSUB -e recacas9.test_0.e%J
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/recacas9.model.testing.sh
# recACas9 trained model --> test with eSpCas9 and Cas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9
### recACas9/Cas9
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.cas9.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.cas9.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9/cut.score/foldRuns/fold0/Runs/Set0/dwt20bp.noncor2.recACas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix recACas9.Cas9 --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9 > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/recACas9.Cas9_test.o
# recACas9-trained forest applied to the Cas9 set: correlate observed cut.score
# with the ranger predictions and save a scatter plot.
library(ggplot2)  # load before opening the PDF device so a failed load cannot leave a device open
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
score <- read.delim("dwt20bp.cas9.score_overlap_noSampleIDs.txt", header=TRUE, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9/")
predict <- read.delim("recACas9.Cas9.prediction", header=TRUE, sep="\t")
# cbind() pairs rows purely by position, so both tables must have the same length
stopifnot(nrow(score) == nrow(predict))
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# 0.4455069
pdf("recACas9.Cas9.prediction.scatter.pdf")
# print() explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing and yields no page when run via source() or inside a function
print(ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic())
dev.off()
### recACas9/eSpCas9
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.eSpCas9.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.eSpCas9.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.recACas9/cut.score/foldRuns/fold0/Runs/Set0/dwt20bp.noncor2.recACas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix recACas9.eSpCas9 --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/eSpCas9 > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/recACas9.eSpCas9_test.o
# recACas9-trained forest applied to the eSpCas9 set: correlate observed
# cut.score with the ranger predictions and save a scatter plot.
library(ggplot2)  # load before opening the PDF device so a failed load cannot leave a device open
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
score <- read.delim("dwt20bp.eSpCas9.score_overlap_noSampleIDs.txt", header=TRUE, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/eSpCas9/")
predict <- read.delim("recACas9.eSpCas9.prediction", header=TRUE, sep="\t")
# cbind() pairs rows purely by position, so both tables must have the same length
stopifnot(nrow(score) == nrow(predict))
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# 0.288145
pdf("recACas9.eSpCas9.prediction.scatter.pdf")
# print() explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing and yields no page when run via source() or inside a function
print(ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic())
dev.off()
# Build the combined Cas9 + eSpCas9 tables: the Cas9 table (with its header)
# followed by the eSpCas9 table with its header line removed. For each combined
# table keep a full copy (*_overlap.txt) and a copy with the first
# tab-separated column removed (*_overlap_noSampleIDs.txt).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
mkdir Cas9.eSpCas9
# scores
tail -n +2 dwt20bp.eSpCas9.score.txt > dwt20bp.eSpCas9.score.noheader.txt
cat dwt20bp.cas9.score.txt dwt20bp.eSpCas9.score.noheader.txt > dwt20bp.Cas9.eSpCas9.score.txt
cp dwt20bp.Cas9.eSpCas9.score.txt dwt20bp.Cas9.eSpCas9.score_overlap.txt
cut -f 2- dwt20bp.Cas9.eSpCas9.score.txt > dwt20bp.Cas9.eSpCas9.score_overlap_noSampleIDs.txt
# features
tail -n +2 dwt20bp.noncor2.eSpCas9.features.txt > dwt20bp.noncor2.eSpCas9.features.noheader.txt
cat dwt20bp.noncor2.cas9.features.txt dwt20bp.noncor2.eSpCas9.features.noheader.txt > dwt20bp.noncor2.Cas9.eSpCas9.features.txt
cp dwt20bp.noncor2.Cas9.eSpCas9.features.txt dwt20bp.noncor2.Cas9.eSpCas9.features_overlap.txt
cut -f 2- dwt20bp.noncor2.Cas9.eSpCas9.features.txt > dwt20bp.noncor2.Cas9.eSpCas9.features_overlap_noSampleIDs.txt
# Andes
# Create the run directory for the combined Cas9 + eSpCas9 model and invoke the
# iRF setup script with the combined feature table and combined score table.
module load python/3.7-anaconda3
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
  --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
  --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.cas9.espcas9 --bypass --Prediction \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.Cas9.eSpCas9.features.txt \
  /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.Cas9.eSpCas9.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9/Submits/submit_full_dwt20bp.cas9.espcas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9/Submits/submit_train_dwt20bp.cas9.espcas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9/Submits/submit_test_dwt20bp.cas9.espcas9_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.cas9.espcas9
# 0.2096891459239172
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20xz_quadrupoleraw cut.score 0.041304635594810374
# gene.dwtd22.x cut.score 0.030389852823730617
# sgRNA.gcsgRNA.raw cut.score 0.02645566215786574
# p20homo_energyraw cut.score 0.02293567460721656
# p18xz_quadrupoleraw cut.score 0.02177824358626837
# p19xy_polarizabilityraw cut.score 0.016581022272935824
# p19rot_constants_zraw cut.score 0.011975803751327726
# gc.dwtd2.x cut.score 0.011902891642788134
# p19rot_constants_yraw cut.score 0.011692511928835404
# gc.dwtd3.x cut.score 0.01071425096122865
awk '{ sum += $2; n++ } END { if (n > 0) print sum / n; }' cut.score/foldRuns/results/importanceScores.txt
# 0.0011534
# pearson correlation
# fold9 / Set4 test predictions of the combined Cas9 + eSpCas9 run: Pearson
# correlation overall and per group, plus a scatter plot coloured by group.
library(tidyr)
library(ggplot2)  # load before opening the PDF device so a failed load cannot leave a device open
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.cas9.espcas9_Set4_test.prediction", header=TRUE, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=TRUE, sep="\t")
id <- read.delim("set4_test_SampleIDs.txt", header=FALSE, sep="\t")
colnames(id) <- "sgRNAID"
# cbind() pairs rows purely by position, so all three tables must have the same length
stopifnot(nrow(id) == nrow(pred), nrow(pred) == nrow(y))
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
# Split the sample ID on "_" into three pieces; the third piece is the group used below
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, into=c("sgRNA", "ID", "group"), sep="_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.4899664
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.6399442
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6913604
pdf("Cas9_eSpCas9.Cas9_eSpCas9.prediction.scatter.pdf")
# print() explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing and yields no page when run via source() or inside a function
print(ggplot(id.pred.y.group, aes(x=cut.score, y=Predictions., color=group)) + geom_point() + theme_classic())
dev.off()
### Cas9+eSpCas9/recACas9
#!/bin/bash -l
#BSUB -P SYB105
#BSUB -W 04:15
#BSUB -nnodes 50
#BSUB -J cas9.espcas9.test_0
#BSUB -o cas9.espcas9.test_0.o%J
#BSUB -e cas9.espcas9.test_0.e%J
# LSF batch script, submitted with the bsub command on the next line.
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/cas9.eSpCas9.model.testing.sh
# Runs ranger with --predict pointing at the forest saved by the dwt20bp.cas9.espcas9 run (fold0/Set0),
# using the recACas9 feature and score files as --file/--yfile. Outputs use the prefix Cas9eSpCas9.recACas9
# in the recACas9 output directory; stdout is redirected to Cas9eSpCas9.recACas9_test.o.
# /usr/bin/time -f "%e" reports elapsed seconds.
# NOTE(review): 50 nodes are requested (-nnodes 50) but jsrun starts a single resource set (-n 1) — confirm the allocation size.
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.recACas9.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.recACas9.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9/cut.score/foldRuns/fold0/Runs/Set0/dwt20bp.cas9.espcas9_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix Cas9eSpCas9.recACas9 --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/recACas9 > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/Cas9eSpCas9.recACas9_test.o
# Compare the recACas9 scores with the predictions file Cas9eSpCas9.recACas9.prediction read from the recACas9/ directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
score <- read.delim("dwt20bp.recACas9.score_overlap_noSampleIDs.txt", header=T, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/recACas9/")
predict <- read.delim("Cas9eSpCas9.recACas9.prediction", header=T, sep="\t")
# Rows are paired by position only (cbind); presumably both files keep the same sgRNA order — TODO confirm.
score.predict <- cbind(score, predict)
# No method argument is given, so cor() uses its default (Pearson).
cor(score.predict$cut.score, score.predict$Predictions.)
# 0.4375317
# Scatter plot of observed vs. predicted cut.score, written to a PDF in the current directory.
pdf("Cas9eSpCas9.recACas9.prediction.scatter.pdf")
library(ggplot2)
ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic()
dev.off()
# Build the combined Cas9 + eSpCas9 + recACas9 score and feature tables.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
mkdir Cas9.eSpCas9.recACas9
# Each entry is <file prefix>:<table kind>; scores are processed first, then features.
for pair in dwt20bp:score dwt20bp.noncor2:features; do
  prefix=${pair%%:*}
  kind=${pair##*:}
  combined="${prefix}.Cas9.eSpCas9.recACas9.${kind}"
  # Drop the header row of the recACas9 table so it can be appended below the others.
  tail -n +2 "${prefix}.recACas9.${kind}.txt" > "${prefix}.recACas9.${kind}.noheader.txt"
  # The cas9 table keeps its header; the eSpCas9 and recACas9 tables are the header-less copies.
  cat "${prefix}.cas9.${kind}.txt" "${prefix}.eSpCas9.${kind}.noheader.txt" "${prefix}.recACas9.${kind}.noheader.txt" > "${combined}.txt"
  cp "${combined}.txt" "${combined}_overlap.txt"
  # Same table without its first (sample ID) column.
  cut -f 2- "${combined}.txt" > "${combined}_overlap_noSampleIDs.txt"
done
unset pair prefix kind combined
# Andes
# Set up the iRF run for the combined Cas9 + eSpCas9 + recACas9 feature and score tables.
module load python/3.7-anaconda3
# -p: do not fail if the run directory already exists.
mkdir -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9.recacas9
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9.recacas9
# --RunName is dwt20bp.cas9.espcas9.recacas9 (previously misspelled "reacas9"): the submit script
# paths, the post-processing run name and the prediction file names used for this run all carry "recacas9".
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.cas9.espcas9.recacas9 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.Cas9.eSpCas9.recACas9.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.Cas9.eSpCas9.recACas9.score.txt
# Summit
# Submit the LSF scripts found in the run's Submits/ directory, in the order full, train, test.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9.recacas9
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9.recacas9/Submits/submit_full_dwt20bp.cas9.espcas9.recacas9_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9.recacas9/Submits/submit_train_dwt20bp.cas9.espcas9.recacas9_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9.recacas9/Submits/submit_test_dwt20bp.cas9.espcas9.recacas9_0.sh
# Andes
# Post-process the dwt20bp.cas9.espcas9.recacas9 run. The bare "#" lines are placeholders where no output was recorded.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
# The working directory is changed again here; the commands below run inside the run directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9.recacas9
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.cas9.espcas9.recacas9
#
# First ten rows after a descending general-numeric sort on the key that starts at field 3.
sort -k3rg topVarEdges/cut.score_top95.txt | head
# Mean of column 2 over every row of the importance score file; prints nothing if the file has no rows.
awk '{ sum += $2; n++ } END { if (n > 0) print sum / n; }' cut.score/foldRuns/results/importanceScores.txt
#
# pearson correlation
# Observed vs. predicted cut.score on the fold9/Set4 test split of the three-nuclease run,
# overall and per nuclease group. The bare "#" lines are placeholders where no result was recorded.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9.espcas9.recacas9/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.cas9.espcas9.recacas9_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
# IDs, predictions and observed scores are paired by row position.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
# Split the sample ID on "_" into sgRNA, ID and nuclease group.
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
#
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
#
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
#
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
#
# Scatter plot of observed vs. predicted cut.score coloured by group, written to a PDF in the Set4 directory.
pdf("Cas9_eSpCas9.Cas9_eSpCas9_recACas9.prediction.scatter.pdf")
library(ggplot2)
ggplot(id.pred.y.group, aes(x=cut.score, y=Predictions., color=group)) + geom_point() + theme_classic()
dev.off()
sp|Q99ZW2|CAS9_STRP1 CRISPR-associated endonuclease Cas9/Csn1 OS=Streptococcus pyogenes serotype M1 OX=301447 GN=cas9 PE=1 SV=1 MDKKYSIGLDIGTNSVGWAVITDEYKVPSKKFKVLGNTDRHSIKKNLIGALLFDSGETAE ATRLKRTARRRYTRRKNRICYLQEIFSNEMAKVDDSFFHRLEESFLVEEDKKHERHPIFG NIVDEVAYHEKYPTIYHLRKKLVDSTDKADLRLIYLALAHMIKFRGHFLIEGDLNPDNSD VDKLFIQLVQTYNQLFEENPINASGVDAKAILSARLSKSRRLENLIAQLPGEKKNGLFGN LIALSLGLTPNFKSNFDLAEDAKLQLSKDTYDDDLDNLLAQIGDQYADLFLAAKNLSDAI LLSDILRVNTEITKAPLSASMIKRYDEHHQDLTLLKALVRQQLPEKYKEIFFDQSKNGYA GYIDGGASQEEFYKFIKPILEKMDGTEELLVKLNREDLLRKQRTFDNGSIPHQIHLGELH AILRRQEDFYPFLKDNREKIEKILTFRIPYYVGPLARGNSRFAWMTRKSEETITPWNFEE VVDKGASAQSFIERMTNFDKNLPNEKVLPKHSLLYEYFTVYNELTKVKYVTEGMRKPAFL SGEQKKAIVDLLFKTNRKVTVKQLKEDYFKKIECFDSVEISGVEDRFNASLGTYHDLLKI IKDKDFLDNEENEDILEDIVLTLTLFEDREMIEERLKTYAHLFDDKVMKQLKRRRYTGWG RLSRKLINGIRDKQSGKTILDFLKSDGFANRNFMQLIHDDSLTFKEDIQKAQVSGQGDSL HEHIANLAGSPAIKKGILQTVKVVDELVKVMGRHKPENIVIEMARENQTTQKGQKNSRER MKRIEEGIKELGSQILKEHPVENTQLQNEKLYLYYLQNGRDMYVDQELDINRLSDYDVDH IVPQSFLKDDSIDNKVLTRSDKNRGKSDNVPSEEVVKKMKNYWRQLLNAKLITQRKFDNL TKAERGGLSELDKAGFIKRQLVETRQITKHVAQILDSRMNTKYDENDKLIREVKVITLKS KLVSDFRKDFQFYKVREINNYHHAHDAYLNAVVGTALIKKYPKLESEFVYGDYKVYDVRK MIAKSEQEIGKATAKYFFYSNIMNFFKTEITLANGEIRKRPLIETNGETGEIVWDKGRDF ATVRKVLSMPQVNIVKKTEVQTGGFSKESILPKRNSDKLIARKKDWDPKKYGGFDSPTVA YSVLVVAKVEKGKSKKLKSVKELLGITIMERSSFEKNPIDFLEAKGYKEVKKDLIIKLPK YSLFELENGRKRMLASAGELQKGNELALPSKYVNFLYLASHYEKLKGSPEDNEQKQLFVE QHKHYLDEIIEQISEFSKRVILADANLDKVLSAYNKHRDKPIREQAENIIHLFTLTNLGA PAAFKYFDTTIDRKRYTSTKEVLDATLIHQSITGLYETRIDLSQLGGD
# Add a numeric nuclease label (cas.class) to each per-nuclease feature table and stack them into one table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
features.cas9 <- read.delim("dwt20bp.noncor2.cas9.features.txt", header=T, sep="\t")
features.eSp <- read.delim("dwt20bp.noncor2.eSpCas9.features.txt", header=T, sep="\t")
features.recA <- read.delim("dwt20bp.noncor2.recACas9.features.txt", header=T, sep="\t")
# cas.class coding: 1 = cas9 table, 2 = eSpCas9 table, 3 = recACas9 table.
features.cas9$cas.class <- 1
features.eSp$cas.class <- 2
features.recA$cas.class <- 3
# Rows are stacked in the order cas9, eSpCas9, recACas9.
features.all <- rbind(features.cas9, features.eSp, features.recA)
write.table(features.all, "dwt20bp.noncor2.features.classification.txt", quote=F, row.names=F, sep="\t")
# Prepare the classification feature table for iRF (an _overlap copy and a copy without the first column),
# then set up the run named dwt20bp.noncor2.class.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
cp dwt20bp.noncor2.features.classification.txt dwt20bp.noncor2.features.classification_overlap.txt
cut --complement -f 1 dwt20bp.noncor2.features.classification.txt > dwt20bp.noncor2.features.classification_overlap_noSampleIDs.txt
# Andes
module load python/3.7-anaconda3
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.class
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.class
# NOTE(review): the classification feature table is stacked in the order cas9, eSpCas9, recACas9, while the
# Y file given here is dwt20bp.score.txt. If the tool pairs rows by position, confirm that dwt20bp.score.txt
# has the same row order; a mismatch would be consistent with the near-zero correlations recorded for this run.
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName dwt20bp.noncor2.class --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.noncor2.features.classification.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/dwt20bp.score.txt
# Summit
# Submit the LSF scripts found in the run's Submits/ directory, in the order full, train, test.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.class
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.class/Submits/submit_full_dwt20bp.noncor2.class_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.class/Submits/submit_train_dwt20bp.noncor2.class_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.class/Submits/submit_test_dwt20bp.noncor2.class_0.sh
# Andes
# Post-process the dwt20bp.noncor2.class run. Commented values below are outputs recorded in these notes.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
# The working directory is changed again here; the commands below run inside the run directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.class
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor2.class
# -0.2350857415730937
# First ten rows after a descending general-numeric sort on the key that starts at field 3.
sort -k3rg topVarEdges/cut.score_top95.txt | head
# cas.class cut.score 0.158153905992389
# gc.dwtd2.x cut.score 0.016737473255261155
# gc.dwtd1.x cut.score 0.016562120834685494
# structure.dwtd2.x cut.score 0.016401804672395912
# ipd.dwtd1.x cut.score 0.016077360338673793
# gc.dwtd4.x cut.score 0.015690547299807314
# structure.dwtd4.x cut.score 0.015665769240378144
# structure.dwtd3.x cut.score 0.015560938988947044
# structure.dwtd6.x cut.score 0.015540184716946542
# gc.dwtd5.x cut.score 0.015434507352372564
# pearson correlation
# Observed vs. predicted cut.score on the fold9/Set4 test split of the dwt20bp.noncor2.class run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.class/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor2.class_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
# Overall correlation straight from the two files (no method given, so the cor() default, Pearson, is used).
cor(y$cut.score, pred$Predictions.)
# 0.02165769
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
# IDs, predictions and observed scores are paired by row position.
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
# Split the sample ID on "_" into sgRNA, ID and nuclease group.
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.02165769
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.02208524
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.01672421
# NOTE(review): no per-group correlation is computed for recACas9 here, although cas.class 3 was built from the recACas9 table — confirm whether that is intentional.
# Scatter plot of observed vs. predicted cut.score coloured by group, written to a PDF in the Set4 directory.
pdf("dwt20bp.noncor2.class.prediction.scatter.pdf")
library(ggplot2)
ggplot(id.pred.y.group, aes(x=cut.score, y=Predictions., color=group)) + geom_point() + theme_classic()
dev.off()
ViennaRNA tutorial (https://www.tbi.univie.ac.at/RNA/tutorial/): minimum free energy (MFE) structure prediction
# Install ViennaRNA into its own conda environment, then fold each FASTA record with RNAfold and
# build a two-column table of record name and the number extracted from its structure line.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda create --name ViennaRNA python=3.8
conda activate ViennaRNA
conda install -c conda-forge -c bioconda viennarna
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# --noPS: only the text written to stdout is used below, so skip the PostScript structure drawing
# that RNAfold otherwise writes for every input record.
RNAfold --noPS < ecoli.gRNA.fasta > ecoli.gRNA.ViennaRNA.output.txt
# Lines containing "(" are selected and the signed decimal number on each is kept.
grep '(' ecoli.gRNA.ViennaRNA.output.txt | grep -Eo '[+-]?[0-9]+([.][0-9]+)?' > ecoli.gRNA.ViennaRNA.output.value.txt
# Header lines (containing ">") give the record names; every ">" is stripped.
grep '>' ecoli.gRNA.ViennaRNA.output.txt | sed 's/>//g' > ecoli.gRNA.names.txt
# Names and values are paired line by line.
paste ecoli.gRNA.names.txt ecoli.gRNA.ViennaRNA.output.value.txt > ecoli.gRNA.ViennaRNA.output.value.id.txt
# 20bp sliding fasta
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Same pipeline for the 20 bp sliding-window FASTA; --noPS avoids one drawing file per window.
RNAfold --noPS < ecoli.20sliding.fa > ecoli.20sliding.ViennaRNA.output.txt
grep '(' ecoli.20sliding.ViennaRNA.output.txt | grep -Eo '[+-]?[0-9]+([.][0-9]+)?' > ecoli.20sliding.ViennaRNA.output.value.txt
grep '>' ecoli.20sliding.ViennaRNA.output.txt | sed 's/>//g' > ecoli.20sliding.names.txt
paste ecoli.20sliding.names.txt ecoli.20sliding.ViennaRNA.output.value.txt > ecoli.20sliding.ViennaRNA.output.value.id.txt
--> Go back and use this ViennaRNA MFE output instead of the iFeature.py output when generating the data matrix.
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J viennaRNA
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH -t 10:00:00
#SBATCH --mem-per-cpu=0
# SLURM batch script: activate the conda environment and run viennaRNA.dwt.R non-interactively
# from the e.coli project directory. Submitted with the sbatch command shown below.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
R CMD BATCH viennaRNA.dwt.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/viennaRNA.dwt.sh
# Interactive alternative: request an allocation with salloc, then activate the same environment.
# salloc -A SYB105 -N 2 -p gpu -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Load the per-window input tables and reduce each one to a single vector (one value per window row)
# that is later passed to wavMODWT().
library(dplyr)
library(reshape2)
library(tidyr)
library(wmtsa)
library(data.table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
gatc <- read.table("ecoli.gatc.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
ipd <- read.table("ecoli.ipd.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
gene <- read.table("ecoli.gene.20sliding.bed", header=F, sep="\t", stringsAsFactors = F)
structure <- read.table("ecoli.20sliding.ViennaRNA.output.value.id.txt", header=F, sep="\t", stringsAsFactors = F)
nuc <- read.table("nucleotide_counts_20sliding_temp.txt", header=T, sep="\t", stringsAsFactors = F)
rnaseq <- read.table("ecoli.rnaseq.average.20sliding.bed", header=T, sep="\t", stringsAsFactors = F)
window <- read.table("ecoli.20bp.sliding.bed", header=F, sep="\t", stringsAsFactors = F)
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Keep chr, start, end, sgRNA and cut.score.
score.df <- score[,c(1:4,8)]
# First three window columns, named V1..V3 so they can serve as join keys.
window.v <- window[,1:3]
colnames(window.v) <- c("V1", "V2", "V3")
# Left-join each annotation onto the full window list; values missing after the join are set to 0.
gatc.win <- left_join(window.v, gatc, by=c("V1", "V2", "V3"))
gatc.win[is.na(gatc.win)] <- 0
ipd.win <- left_join(window.v, ipd, by=c("V1", "V2", "V3"))
ipd.win[is.na(ipd.win)] <- 0
gene.win <- left_join(window.v, gene, by=c("V1", "V2", "V3"))
gene.win[is.na(gene.win)] <- 0
rnaseq.win <- left_join(window.v, rnaseq, by=c("V1", "V2", "V3"))
rnaseq.win[is.na(rnaseq.win)] <- 0
# Number of joined rows per window.
# NOTE(review): n() counts rows after the left join, so a window with no match (a single row of
# filled-in zeros) gets count 1, the same as a window with exactly one match — confirm this is intended.
gatc.bin <- gatc.win %>% group_by(V1, V2, V3) %>% mutate(gatc.count = n())
# Columns 1-3 plus column 8 (presumably gatc.count — TODO confirm the column position), one row per window.
gatc.count <- unique(gatc.bin[,c(1:3,8)])
gatc.df <- gatc.count$gatc.count
ipd.df <- ipd.win[,4]
gene.bin <- gene.win %>% group_by(V1, V2, V3) %>% mutate(gene.count = n())
# Columns 1-3 plus column 14 (presumably gene.count — TODO confirm the column position), one row per window.
gene.count <- unique(gene.bin[,c(1:3,14)])
gene.df <- gene.count$gene.count
# Second column of the ViennaRNA name/value table.
structure.df <- structure[,2]
# Columns 7 and 8 of the nucleotide table (presumably GC content and melting temperature — TODO confirm).
gc.df <- nuc[,7]
temp.df <- nuc[,8]
rna.df <- rnaseq.win[,4]
# Run DWT instead of CWT
# The same six steps are repeated for each signal (temp, gc, structure, rna, gene, ipd, gatc):
#   1. wavMODWT() on the per-window vector with default arguments (signature kept below for reference);
#   2. as.matrix() on the result;
#   3. a data frame whose "label" column holds the matrix row names;
#   4. conversion to data.table;
#   5. the label split on runs of non-alphanumeric characters into "name" and "number";
#   6. columns renamed to label, <signal>.dwt, scale, window.
#wavMODWT(x, wavelet="s8", n.levels=ilogb(length(x), base=2),position=list(from=1,by=1,units=character()), units=character(),title.data=character(), documentation=character(), keep.series=FALSE)
# temp
temp.modwt <- wavMODWT(temp.df)
temp.modwt.df <- as.matrix(temp.modwt)
temp.modwt.label <- data.frame(label = row.names(temp.modwt.df), temp.modwt.df)
temp.modwt.dt <- as.data.table(temp.modwt.label)
temp.modwt.name <- temp.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(temp.modwt.name) <- c("label", "temp.dwt", "scale", "window")
# gc
gc.modwt <- wavMODWT(gc.df)
gc.modwt.df <- as.matrix(gc.modwt)
gc.modwt.label <- data.frame(label = row.names(gc.modwt.df), gc.modwt.df)
gc.modwt.dt <- as.data.table(gc.modwt.label)
gc.modwt.name <- gc.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gc.modwt.name) <- c("label", "gc.dwt", "scale", "window")
# structure
structure.modwt <- wavMODWT(structure.df)
structure.modwt.df <- as.matrix(structure.modwt)
structure.modwt.label <- data.frame(label = row.names(structure.modwt.df), structure.modwt.df)
structure.modwt.dt <- as.data.table(structure.modwt.label)
structure.modwt.name <- structure.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(structure.modwt.name) <- c("label", "structure.dwt", "scale", "window")
# rna
rna.modwt <- wavMODWT(rna.df)
rna.modwt.df <- as.matrix(rna.modwt)
rna.modwt.label <- data.frame(label = row.names(rna.modwt.df), rna.modwt.df)
rna.modwt.dt <- as.data.table(rna.modwt.label)
rna.modwt.name <- rna.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(rna.modwt.name) <- c("label", "rna.dwt", "scale", "window")
# gene
gene.modwt <- wavMODWT(gene.df)
gene.modwt.df <- as.matrix(gene.modwt)
gene.modwt.label <- data.frame(label = row.names(gene.modwt.df), gene.modwt.df)
gene.modwt.dt <- as.data.table(gene.modwt.label)
gene.modwt.name <- gene.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gene.modwt.name) <- c("label", "gene.dwt", "scale", "window")
# ipd
ipd.modwt <- wavMODWT(ipd.df)
ipd.modwt.df <- as.matrix(ipd.modwt)
ipd.modwt.label <- data.frame(label = row.names(ipd.modwt.df), ipd.modwt.df)
ipd.modwt.dt <- as.data.table(ipd.modwt.label)
ipd.modwt.name <- ipd.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(ipd.modwt.name) <- c("label", "ipd.dwt", "scale", "window")
# gatc
gatc.modwt <- wavMODWT(gatc.df)
gatc.modwt.df <- as.matrix(gatc.modwt)
gatc.modwt.label <- data.frame(label = row.names(gatc.modwt.df), gatc.modwt.df)
gatc.modwt.dt <- as.data.table(gatc.modwt.label)
gatc.modwt.name <- gatc.modwt.dt[, c("name", "number") := tstrsplit(label, "[^[:alnum:]]+")]
colnames(gatc.modwt.name) <- c("label", "gatc.dwt", "scale", "window")
# Attach the MODWT values of the window whose coordinates equal each sgRNA's coordinates, then
# reshape to one row per sgRNA with one column per signal/scale combination.
colnames(window) <- c("chr", "start", "end")
# Window index as a character string, zero-based (row number minus 1), used as the join key
# against the "window" column of the MODWT tables.
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
window$start <- as.numeric(window$start)
# Window end is reduced by 1 before joining on the sgRNA coordinates (presumably to match an inclusive end — TODO confirm).
window$end <- as.numeric(window$end - 1)
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
# Columns 3, 4, 2 of each MODWT table are scale, window and the signal value. The first join is on
# window only and brings in the scale column; the later joins are on window and scale.
window.score.temp <- left_join(window.score.df, temp.modwt.name[,c(3,4,2)], by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name[,c(3,4,2)], by=c("window", "scale"))
# 1293751
# Rows whose cut.score is NA are dropped: the comparison yields NA for them and subset() keeps only TRUE rows.
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, window.temp.gc.structure.rna.gene.gatc.ipd$cut.score != "NA")
# 931340
# Long format keyed by cut.score, scale and sgRNA (columns 4, 5 and 7-14 of the joined table).
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,c(4,5,7:14)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# Feature name = signal name immediately followed by the scale label (no separator).
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# Wide format: one row per sgRNA/cut.score pair; duplicate cells are averaged.
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
# 40467
write.table(df.dcast.na, "ecoli.20sliding.exact.dwt.dcast.21sep.txt", quote=F, row.names=F, sep="\t")
--> Also take the 20 bp bins immediately upstream and downstream of each sgRNA.
## upstream bin
# Same join and reshape as for the exact sgRNA window, but with each sgRNA interval replaced by
# [start - 19, start] before matching it to a window. Uses the `window` table and the *.modwt.name
# tables already present in the session.
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Keep chr, start, end, sgRNA and cut.score.
score.df <- score[,c(1:4,8)]
# No-op assignment (chr is unchanged).
score.df$chr <- score.df$chr
# Order matters: end takes the original start before start is shifted back by 19.
score.df$end <- score.df$start
score.df$start <- score.df$start - 19
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
# Columns 3, 4, 2 of each MODWT table are scale, window and the signal value.
window.score.temp <- left_join(window.score.df, temp.modwt.name[,c(3,4,2)], by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name[,c(3,4,2)], by=c("window", "scale"))
# Rows whose cut.score is NA are dropped: the comparison yields NA for them and subset() keeps only TRUE rows.
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, window.temp.gc.structure.rna.gene.gatc.ipd$cut.score != "NA")
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,c(4,5,7:14)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# Feature name = signal name immediately followed by the scale label (no separator).
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# Wide format: one row per sgRNA/cut.score pair; duplicate cells are averaged.
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
#
write.table(df.dcast.na, "ecoli.20sliding.up.dwt.dcast.21sep.txt", quote=F, row.names=F, sep="\t")
## downstream bin
# Same join and reshape as for the exact sgRNA window, but with each sgRNA interval replaced by
# [end, end + 19] before matching it to a window. Uses the `window` table and the *.modwt.name
# tables already present in the session.
score <- read.table("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
# Keep chr, start, end, sgRNA and cut.score.
score.df <- score[,c(1:4,8)]
# Shift the coordinates on score.df, the table that is joined below. The shift used to be applied
# to `score` after score.df had already been copied from it, so the join still ran on the unshifted
# sgRNA coordinates and reproduced the exact-window matrix instead of the downstream one.
# Order matters: start takes the original end, then end is moved 19 past it.
score.df$start <- score.df$end
score.df$end <- score.df$start + 19
window.score.df <- left_join(score.df, window, by=c("chr", "start", "end"))
# Columns 3, 4, 2 of each MODWT table are scale, window and the signal value. The first join is on
# window only and brings in the scale column; the later joins are on window and scale.
window.score.temp <- left_join(window.score.df, temp.modwt.name[,c(3,4,2)], by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna <- left_join(window.temp.gc.structure, rna.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene <- left_join(window.temp.gc.structure.rna, gene.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc <- left_join(window.temp.gc.structure.rna.gene, gatc.modwt.name[,c(3,4,2)], by=c("window", "scale"))
window.temp.gc.structure.rna.gene.gatc.ipd <- left_join(window.temp.gc.structure.rna.gene.gatc, ipd.modwt.name[,c(3,4,2)], by=c("window", "scale"))
# Drop rows without a cut.score (explicit NA test instead of comparing against the string "NA").
window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA <- subset(window.temp.gc.structure.rna.gene.gatc.ipd, !is.na(window.temp.gc.structure.rna.gene.gatc.ipd$cut.score))
# Long format keyed by cut.score, scale and sgRNA (columns 4, 5 and 7-14 of the joined table).
df.melt <- melt(window.temp.gc.structure.rna.gene.gatc.ipd.sgRNA[,c(4,5,7:14)], id=c("cut.score", "scale", "sgRNA"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNA", "variable", "value")
# Feature name = signal name immediately followed by the scale label (no separator).
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# Wide format: one row per sgRNA/cut.score pair; duplicate cells are averaged.
df.dcast <- df.id %>% dcast(sgRNA + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
#
write.table(df.dcast.na, "ecoli.20sliding.down.dwt.dcast.21sep.txt", quote=F, row.names=F, sep="\t")
--> Combine the sgRNA, upstream and downstream bins to generate the full feature matrix.
# combine regional DWT with other features
# Read the exact, upstream and downstream DWT matrices, split each sgRNA identifier into sgRNA and ID,
# and inner-join all three onto the existing raw/one-hot/tensor feature table.
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
# Exact sgRNA window
df.dcast.na <- read.delim("ecoli.20sliding.exact.dwt.dcast.21sep.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep <- df.dcast.na %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
# Columns 4 onward are the DWT features (columns 1-3 are sgRNA, ID, cut.score); prefix their names with sgRNA_.
df.dcast.dwt <- df.dcast.sep[,c(4:ncol(df.dcast.sep))]
colnames(df.dcast.dwt) <- paste0('sgRNA_', colnames(df.dcast.dwt))
df.dcast <- cbind(df.dcast.sep[,1:3], df.dcast.dwt)
# Upstream window (the same sgRNA_ prefix is applied)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast.na2 <- read.delim("ecoli.20sliding.up.dwt.dcast.21sep.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep2 <- df.dcast.na2 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt2 <- df.dcast.sep2[,c(4:ncol(df.dcast.sep2))]
colnames(df.dcast.dwt2) <- paste0('sgRNA_', colnames(df.dcast.dwt2))
df.dcast2 <- cbind(df.dcast.sep2[,1:3], df.dcast.dwt2)
# Downstream window (the same sgRNA_ prefix is applied)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast.na3 <- read.delim("ecoli.20sliding.down.dwt.dcast.21sep.txt", header=T, sep="\t", stringsAsFactors = F)
df.dcast.sep3 <- df.dcast.na3 %>% separate(sgRNA, c("sgRNA", "ID"), sep="_")
df.dcast.dwt3 <- df.dcast.sep3[,c(4:ncol(df.dcast.sep3))]
colnames(df.dcast.dwt3) <- paste0('sgRNA_', colnames(df.dcast.dwt3))
df.dcast3 <- cbind(df.dcast.sep3[,1:3], df.dcast.dwt3)
# Existing feature table; its sgRNAID is split into sgRNA, ID and type.
df <- read.delim("Ecoli.allCas9.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t", stringsAsFactors = F)
df.sep <- df %>% separate(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep)
# 126182
# Each join takes sgRNA, ID and the DWT columns (column 3, cut.score, is left out) and keeps only
# sgRNA/ID pairs present on both sides.
df.sep.region <- inner_join(df.sep, df.dcast[,c(1,2,4:ncol(df.dcast.sep))], by=c("sgRNA", "ID"))
df.sep.region2 <- inner_join(df.sep.region, df.dcast2[,c(1,2,4:ncol(df.dcast.sep2))], by=c("sgRNA", "ID"))
df.sep.region3 <- inner_join(df.sep.region2, df.dcast3[,c(1,2,4:ncol(df.dcast.sep3))], by=c("sgRNA", "ID"))
## note that dwt.x = sgRNA dwt, dwt.y = upstream dwt, dwt = downstream dwt
# (the three tables share column names, so the joins presumably add the .x/.y suffixes — TODO confirm)
df.sep.region.id <- df.sep.region3 %>% unite(sgRNAID, c("sgRNA", "ID", "type"), sep="_")
nrow(df.sep.region.id)
# 118140
write.table(df.sep.region.id, "ecoli.20sliding.all.features.dcast.21sept.txt", quote=F, row.names=F, sep="\t")
# Split the combined table into iRF feature and score files, written with and without the sample ID column.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df <- read.delim("ecoli.20sliding.all.features.dcast.21sept.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
# Features: column 1 plus columns 3 onward. Score: columns 1 and 2.
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
# Each table is written twice: once under the _overlap name and once under the plain name.
write.table(df.features, "dwt20bp.vienna.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt20bp.vienna.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.features, "dwt20bp.vienna.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt20bp.vienna.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
# Same data without column 1; the score column is named cut.score.
df.features <- as.matrix(df[,c(3:ncol(df))])
df.score <- as.matrix(data.frame(df[,2]))
colnames(df.score) <- "cut.score"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp")
write.table(df.features, "dwt20bp.vienna.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(df.score, "dwt20bp.vienna.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
--> Add p-value analysis to the feature-removal pre-processing step.
# Activate the conda environment; the lines after the "# python" marker are Python statements, not shell commands.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# python
# Correlation-based feature filter: for every pair of columns whose correlation is >= 0.9,
# the later column is dropped; the surviving columns are written to a CSV.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)
data = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.all.features.dcast.21sept.txt')
# Drops the first two columns and the last column.
# NOTE(review): the "-1" also removes the final column of the table, so it can never be retained — confirm this is intended.
data = data.iloc[:,2:-1]
# NOTE(review): the first remaining column is replaced by integer label codes before correlations
# are computed, which changes its correlation with the other columns — confirm this is intended.
label_encoder = LabelEncoder()
data.iloc[:,0] = label_encoder.fit_transform(data.iloc[:,0]).astype('float64')
corr = data.corr()
# keep-mask over the columns of the correlation matrix; everything is kept initially
columns = np.full((corr.shape[0],), True, dtype=bool)
# Walk the upper triangle. Only correlations >= 0.9 trigger removal (strong negative correlations
# do not), and it is always the later column j of the pair that is dropped.
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = data.columns[columns]
data = data[selected_columns]
data = pd.DataFrame(data = data, columns = selected_columns)
# Written with the default index column, so the CSV header starts with an empty field.
data.to_csv("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.pythoncorrelationpval.21sept.csv")
# R
# Restrict the feature table to the column names listed in the header of the filtered CSV and
# write the result with and without the sample ID column.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The commented shell command below creates the .header.txt file that read.delim() loads further down;
# it has to be run once (uncommented) before this section.
#head -n 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.pythoncorrelationpval.21sept.csv > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20overlap.all.features.dcast.pythoncorrelationpval.21sept.header.txt
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
# One-row table whose cells are the retained column names.
df.noncor <- read.delim("ecoli.20overlap.all.features.dcast.pythoncorrelationpval.21sept.header.txt", header=F, sep=",")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
df <- read.delim("dwt20bp.vienna.features.txt", header=T, sep="\t")
# Keep only the columns whose names appear in the header row.
df.subset <- df[ , which(names(df) %in% df.noncor[1,])]
# NOTE(review): this drops the first column of df.subset. If sgRNAID is not among the retained
# names, that first column is a feature rather than the ID — confirm.
df.mat <- as.matrix(df.subset[,2:ncol(df.subset)])
# Re-attach the sample IDs; the ID column header comes from the expression as.data.frame(df$sgRNAID),
# presumably "df$sgRNAID" — TODO confirm.
df.mat.id <- cbind(as.data.frame(df$sgRNAID), df.mat)
write.table(df.mat.id, "dwt20bp.sgRNA.noncorpval.vienna.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat.id, "dwt20bp.sgRNA.noncorpval.vienna.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.mat, "dwt20bp.sgRNA.noncorpval.vienna.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
awk 'NR==1 || /_Cas9/' dwt20bp.sgRNA.noncorpval.vienna.features.txt > dwt20bp.sgRNA.noncorpval.vienna.Cas9.features.txt
awk 'NR==1 || /_eSpCas9/' dwt20bp.sgRNA.noncorpval.vienna.features.txt > dwt20bp.sgRNA.noncorpval.vienna.eSpCas9.features.txt
awk 'NR==1 || /_recACas9/' dwt20bp.sgRNA.noncorpval.vienna.features.txt > dwt20bp.sgRNA.noncorpval.vienna.recACas9.features.txt
# R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
features.cas9 <- read.delim("dwt20bp.sgRNA.noncorpval.vienna.Cas9.features.txt", header=T, sep="\t")
features.eSp <- read.delim("dwt20bp.sgRNA.noncorpval.vienna.eSpCas9.features.txt", header=T, sep="\t")
features.recA <- read.delim("dwt20bp.sgRNA.noncorpval.vienna.recACas9.features.txt", header=T, sep="\t")
features.cas9$cas.class <- 1
features.eSp$cas.class <- 2
features.recA$cas.class <- 3
features.all <- rbind(features.cas9, features.eSp, features.recA)
write.table(features.all, "dwt20bp.sgRNA.noncorpval.vienna.class.features.txt", quote=F, row.names=F, sep="\t")
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
cp dwt20bp.sgRNA.noncorpval.vienna.class.features.txt dwt20bp.sgRNA.noncorpval.vienna.class.features_overlap.txt
cut --complement -f 1 dwt20bp.sgRNA.noncorpval.vienna.class.features.txt > dwt20bp.sgRNA.noncorpval.vienna.class.features_overlap_noSampleIDs.txt
# Andes
module load python/3.7-anaconda3
# Create the run directory for the class-annotated feature set and run the iRF
# set-up script from inside it. mkdir -p makes the step re-runnable, and the &&
# chain keeps the set-up script from running in the wrong directory if mkdir or
# cd fails.
# NOTE(review): the class feature table was rebuilt with rbind() per Cas9 class, so
# its row order may differ from dwt20bp.vienna.score.txt -- confirm that the
# set-up script pairs rows by sample ID rather than by position.
run_name=dwt20bp.noncor2.vienna.class
feat_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features
mkdir -p "${feat_root}/${run_name}" \
  && cd "${feat_root}/${run_name}" \
  && python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
    --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
    --NumTrees 1000 --NumIterations 5 --RunName "${run_name}" --bypass --Prediction \
    "${feat_root}/dwt20bp/dwt20bp.sgRNA.noncorpval.vienna.class.features.txt" \
    "${feat_root}/dwt20bp/dwt20bp.vienna.score.txt"
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna.class
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna.class/Submits/submit_full_dwt20bp.noncor2.vienna.class_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna.class/Submits/submit_train_dwt20bp.noncor2.vienna.class_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna.class/Submits/submit_test_dwt20bp.noncor2.vienna.class_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna.class
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor2.vienna.class
# -0.24274672172463108
sort -k3rg topVarEdges/cut.score_top95.txt | head
# cas.class cut.score 0.15782581731573447
# sgRNA_gc.dwtd1.x cut.score 0.011576221786724223
# sgRNA_structure.dwtd2.x cut.score 0.011377239433135664
# sgRNA_gc.dwtd2.x cut.score 0.011327282161383642
# sgRNA_gc.dwtd1.y cut.score 0.011270656863952856
# sgRNA_structure.dwtd4.y cut.score 0.01099176404389655
# sgRNA_gc.dwtd2.y cut.score 0.010962234215890588
# sgRNA_ipd.dwtd1.x cut.score 0.010908784168783024
# sgRNA_structure.dwtd2.y cut.score 0.010589142302912039
# sgRNA_gc.dwtd4.x cut.score 0.01047000691078476
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna.class/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor2.vienna.class_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.01724238
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.01724238
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.01764027
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.01300549
pdf("dwt20bp.noncor2.viennarna.class.prediction.scatter.pdf")
library(ggplot2)
ggplot(id.pred.y.group, aes(x=cut.score, y=Predictions., color=group)) + geom_point() + theme_classic()
dev.off()
### without class
# Create the run directory for the feature set without the cas.class column and
# run the iRF set-up script from inside it. mkdir -p makes the step re-runnable,
# and the && chain keeps the set-up script from running in the wrong directory
# if mkdir or cd fails.
run_name=dwt20bp.noncor2.vienna
feat_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features
mkdir -p "${feat_root}/${run_name}" \
  && cd "${feat_root}/${run_name}" \
  && python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py \
    --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 \
    --NumTrees 1000 --NumIterations 5 --RunName "${run_name}" --bypass --Prediction \
    "${feat_root}/dwt20bp/dwt20bp.sgRNA.noncorpval.vienna.features.txt" \
    "${feat_root}/dwt20bp/dwt20bp.vienna.score.txt"
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna/Submits/submit_full_dwt20bp.noncor2.vienna_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna/Submits/submit_train_dwt20bp.noncor2.vienna_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna/Submits/submit_test_dwt20bp.noncor2.vienna_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt dwt20bp.noncor2.vienna
# -0.09735005082699842
sort -k3rg topVarEdges/cut.score_top95.txt | head
# p20xz_quadrupoleraw cut.score 0.03628069214668665
# sgRNA.gcsgRNA.raw cut.score 0.021486858631428626
# p18xz_quadrupoleraw cut.score 0.018431390522543795
# p19rot_constants_yraw cut.score 0.016605916429516462
# p20homo_energyraw cut.score 0.014879878961815578
# p19rot_constants_zraw cut.score 0.014797964836367574
# sgRNA_gene.dwtd22.x cut.score 0.014427855448691204
# sgRNA_gc.dwtd2.y cut.score 0.010182438646841804
# sgRNA_gc.dwtd2.x cut.score 0.009902335075614213
# sgRNA_gc.dwtd1.x cut.score 0.009874583994505918
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("dwt20bp.noncor2.vienna_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.2639568
id <- read.delim("set4_test_SampleIDs.txt", header=F, sep="\t")
colnames(id) <- "sgRNAID"
id.pred <- cbind(id, pred)
id.pred.y <- cbind(id.pred, y)
library(tidyr)
id.pred.y.group <- id.pred.y %>% separate(sgRNAID, c("sgRNA", "ID", "group"), "_")
cor(id.pred.y.group$cut.score, id.pred.y.group$Predictions., method=c("pearson"))
# 0.2639568
pred.Cas9 <- subset(id.pred.y.group, id.pred.y.group$group == "Cas9")
cor(pred.Cas9$cut.score, pred.Cas9$Predictions., method=c("pearson"))
# 0.7196249
pred.eSpCas9 <- subset(id.pred.y.group, id.pred.y.group$group == "eSpCas9")
cor(pred.eSpCas9$cut.score, pred.eSpCas9$Predictions., method=c("pearson"))
# 0.6556264
pred.recACas9 <- subset(id.pred.y.group, id.pred.y.group$group == "recACas9")
cor(pred.recACas9$cut.score, pred.recACas9$Predictions., method=c("pearson"))
# 0.4571555
pdf("dwt20bp.noncor2.viennarna.prediction.scatter.pdf")
library(ggplot2)
ggplot(id.pred.y.group, aes(x=cut.score, y=Predictions., color=group)) + geom_point() + theme_classic()
dev.off()
# The path contains a space and parentheses, so it must be quoted for the shell.
cd "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli"
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli")
df <- read.delim("ecoli.gRNA.PAM_TGG.fasta.offtargets.scores", header=T, sep="\t")
df.spec <- df[,c(1,4,10)]
# Return the last `n` characters of each string in `x`.
substrRight <- function(x, n) {
  len <- nchar(x)
  first <- len - n + 1
  substr(x, first, len)
}
df.spec$PAM <- substrRight(df.spec$target, 3)
library(reshape2)
df.spec2 <- df.spec[,c(1,3,4)]
colnames(df.spec2) <- c("sgRNAID", "specificity.score", "PAM")
df.spec.dcast <- dcast(df.spec2, sgRNAID ~ PAM, value.var = "specificity.score", na.rm=T)
library(dplyr)
score <- read.delim("DataS4.txt", header=T, sep="\t")
df.spec.score <- inner_join(df.spec2, score, by="sgRNAID")
cor(df.spec.score$specificity.score, df.spec.score$score)
# 0.04188274
score <- read.delim("DataS4.eSp.txt", header=T, sep="\t")
df.spec.score <- inner_join(df.spec2, score, by="sgRNAID")
cor(df.spec.score$specificity.score, df.spec.score$score)
# 0.09265947
https://christophm.github.io/interpretable-ml-book/shap.html https://github.com/slundberg/shap https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d https://towardsdatascience.com/shap-shapley-additive-explanations-5a2a271ed9c3
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda create --name shap python=3.8 -c bioconda -c conda-forge
conda activate shap
conda install -c conda-forge shap
conda install -c conda-forge matplotlib
# python
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
df = pd.read_table('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.20sliding.all.features.dcast.21sept.txt') # Load the data
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
# The target variable is 'cut.score'.
Y = df['cut.score']
X = df[['ACsgRNA.raw,AGsgRNA.raw,AsgRNA.raw,ATsgRNA.raw,CAsgRNA.raw,CCsgRNA.raw,CGsgRNA.raw,CsgRNA.raw,CTsgRNA.raw,GAsgRNA.raw,GCsgRNA.raw,GGsgRNA.raw,GsgRNA.raw,GTsgRNA.raw,p1.AAsgRNA.raw,p1.ACsgRNA.raw,p1.AGsgRNA.raw,p1.AsgRNA.raw,p1.ATsgRNA.raw,p1.CAsgRNA.raw,p1.CCsgRNA.raw,p1.CGsgRNA.raw,p1.CsgRNA.raw,p1.CTsgRNA.raw,p1.GAsgRNA.raw,p1.GCsgRNA.raw,p1.GGsgRNA.raw,p1.GsgRNA.raw,p1.GTsgRNA.raw,p1.TAsgRNA.raw,p1.TCsgRNA.raw,p1.TGsgRNA.raw,p1.TsgRNA.raw,p1.TTsgRNA.raw,p10.AAsgRNA.raw,p10.ACsgRNA.raw,p10.AGsgRNA.raw,p10.AsgRNA.raw,p10.ATsgRNA.raw,p10.CAsgRNA.raw,p10.CCsgRNA.raw,p10.CGsgRNA.raw,p10.CsgRNA.raw,p10.CTsgRNA.raw,p10.GAsgRNA.raw,p10.GCsgRNA.raw,p10.GGsgRNA.raw,p10.GsgRNA.raw,p10.GTsgRNA.raw,p10.TAsgRNA.raw,p10.TCsgRNA.raw,p10.TGsgRNA.raw,p10.TsgRNA.raw,p10.TTsgRNA.raw,p10chargeraw,p10homo_energyraw,p10homo_lumo_energygapraw,p10molecular_volumeraw,p10num_aromaticbondsraw,p10num_atomsraw,p10num_Oatomsraw,p10num_Patomsraw,p10num_Satomsraw,p10num_Seatomsraw,p10relativenum_Hatomsraw,p10relativenum_Satomsraw,p10relativenum_Seatomsraw,p10rot_constants_yraw,p10rot_constants_zraw,p10xz_quadrupoleraw,p10yy_polarizabilityraw,p11.AAsgRNA.raw,p11.ACsgRNA.raw,p11.AGsgRNA.raw,p11.AsgRNA.raw,p11.ATsgRNA.raw,p11.CAsgRNA.raw,p11.CCsgRNA.raw,p11.CGsgRNA.raw,p11.CsgRNA.raw,p11.CTsgRNA.raw,p11.GAsgRNA.raw,p11.GCsgRNA.raw,p11.GGsgRNA.raw,p11.GsgRNA.raw,p11.GTsgRNA.raw,p11.TAsgRNA.raw,p11.TCsgRNA.raw,p11.TGsgRNA.raw,p11.TsgRNA.raw,p11.TTsgRNA.raw,p11chargeraw,p11homo_energyraw,p11homo_lumo_energygapraw,p11molecular_volumeraw,p11num_aromaticbondsraw,p11num_atomsraw,p11num_Oatomsraw,p11num_Patomsraw,p11num_Satomsraw,p11num_Seatomsraw,p11relativenum_Hatomsraw,p11relativenum_Satomsraw,p11relativenum_Seatomsraw,p11rot_constants_yraw,p11rot_constants_zraw,p11xz_quadrupoleraw,p11yy_polarizabilityraw,p12.AAsgRNA.raw,p12.ACsgRNA.raw,p12.AGsgRNA.raw,p12.AsgRNA.raw,p12.ATsgRNA.raw,p12.CAsgRNA.raw,p12.CCsgRNA.raw,p12.CGsgRNA.raw,p12.CsgRNA.raw,p12.CTsgRNA.raw,p12.GAsgRNA.raw,p12.GCsgRNA.raw,p1
2.GGsgRNA.raw,p12.GsgRNA.raw,p12.GTsgRNA.raw,p12.TAsgRNA.raw,p12.TCsgRNA.raw,p12.TGsgRNA.raw,p12.TsgRNA.raw,p12.TTsgRNA.raw,p12chargeraw,p12homo_energyraw,p12homo_lumo_energygapraw,p12molecular_volumeraw,p12num_aromaticbondsraw,p12num_atomsraw,p12num_Oatomsraw,p12num_Patomsraw,p12num_Satomsraw,p12num_Seatomsraw,p12relativenum_Hatomsraw,p12relativenum_Satomsraw,p12relativenum_Seatomsraw,p12rot_constants_yraw,p12rot_constants_zraw,p12xz_quadrupoleraw,p12yy_polarizabilityraw,p13.AAsgRNA.raw,p13.ACsgRNA.raw,p13.AGsgRNA.raw,p13.AsgRNA.raw,p13.ATsgRNA.raw,p13.CAsgRNA.raw,p13.CCsgRNA.raw,p13.CGsgRNA.raw,p13.CsgRNA.raw,p13.CTsgRNA.raw,p13.GAsgRNA.raw,p13.GCsgRNA.raw,p13.GGsgRNA.raw,p13.GsgRNA.raw,p13.GTsgRNA.raw,p13.TAsgRNA.raw,p13.TCsgRNA.raw,p13.TGsgRNA.raw,p13.TsgRNA.raw,p13.TTsgRNA.raw,p13chargeraw,p13homo_energyraw,p13homo_lumo_energygapraw,p13molecular_volumeraw,p13num_aromaticbondsraw,p13num_atomsraw,p13num_Oatomsraw,p13num_Patomsraw,p13num_Satomsraw,p13num_Seatomsraw,p13relativenum_Hatomsraw,p13relativenum_Satomsraw,p13relativenum_Seatomsraw,p13rot_constants_yraw,p13rot_constants_zraw,p13xz_quadrupoleraw,p13yy_polarizabilityraw,p14.AAsgRNA.raw,p14.ACsgRNA.raw,p14.AGsgRNA.raw,p14.AsgRNA.raw,p14.ATsgRNA.raw,p14.CAsgRNA.raw,p14.CCsgRNA.raw,p14.CGsgRNA.raw,p14.CsgRNA.raw,p14.CTsgRNA.raw,p14.GAsgRNA.raw,p14.GCsgRNA.raw,p14.GGsgRNA.raw,p14.GsgRNA.raw,p14.GTsgRNA.raw,p14.TAsgRNA.raw,p14.TCsgRNA.raw,p14.TGsgRNA.raw,p14.TsgRNA.raw,p14.TTsgRNA.raw,p14chargeraw,p14homo_energyraw,p14homo_lumo_energygapraw,p14molecular_volumeraw,p14num_aromaticbondsraw,p14num_atomsraw,p14num_Oatomsraw,p14num_Patomsraw,p14num_Satomsraw,p14num_Seatomsraw,p14relativenum_Hatomsraw,p14relativenum_Satomsraw,p14relativenum_Seatomsraw,p14rot_constants_yraw,p14rot_constants_zraw,p14xz_quadrupoleraw,p14yy_polarizabilityraw,p15.AAsgRNA.raw,p15.ACsgRNA.raw,p15.AGsgRNA.raw,p15.AsgRNA.raw,p15.ATsgRNA.raw,p15.CAsgRNA.raw,p15.CCsgRNA.raw,p15.CGsgRNA.raw,p15.CsgRNA.raw,p15.CTsgRNA.raw,p15.GAsgRNA.raw,p15.GCsgRNA
.raw,p15.GGsgRNA.raw,p15.GsgRNA.raw,p15.GTsgRNA.raw,p15.TAsgRNA.raw,p15.TCsgRNA.raw,p15.TGsgRNA.raw,p15.TsgRNA.raw,p15.TTsgRNA.raw,p15chargeraw,p15homo_energyraw,p15homo_lumo_energygapraw,p15molecular_volumeraw,p15num_aromaticbondsraw,p15num_atomsraw,p15num_Oatomsraw,p15num_Patomsraw,p15num_Satomsraw,p15num_Seatomsraw,p15relativenum_Hatomsraw,p15relativenum_Satomsraw,p15relativenum_Seatomsraw,p15rot_constants_yraw,p15rot_constants_zraw,p15xz_quadrupoleraw,p15yy_polarizabilityraw,p16.AAsgRNA.raw,p16.ACsgRNA.raw,p16.AGsgRNA.raw,p16.AsgRNA.raw,p16.ATsgRNA.raw,p16.CAsgRNA.raw,p16.CCsgRNA.raw,p16.CGsgRNA.raw,p16.CsgRNA.raw,p16.CTsgRNA.raw,p16.GAsgRNA.raw,p16.GCsgRNA.raw,p16.GGsgRNA.raw,p16.GsgRNA.raw,p16.GTsgRNA.raw,p16.TAsgRNA.raw,p16.TCsgRNA.raw,p16.TGsgRNA.raw,p16.TsgRNA.raw,p16.TTsgRNA.raw,p16chargeraw,p16homo_energyraw,p16homo_lumo_energygapraw,p16molecular_volumeraw,p16num_aromaticbondsraw,p16num_atomsraw,p16num_Oatomsraw,p16num_Patomsraw,p16num_Satomsraw,p16num_Seatomsraw,p16relativenum_Hatomsraw,p16relativenum_Satomsraw,p16relativenum_Seatomsraw,p16rot_constants_yraw,p16rot_constants_zraw,p16xz_quadrupoleraw,p16yy_polarizabilityraw,p17.AAsgRNA.raw,p17.ACsgRNA.raw,p17.AGsgRNA.raw,p17.AsgRNA.raw,p17.ATsgRNA.raw,p17.CAsgRNA.raw,p17.CCsgRNA.raw,p17.CGsgRNA.raw,p17.CsgRNA.raw,p17.CTsgRNA.raw,p17.GAsgRNA.raw,p17.GCsgRNA.raw,p17.GGsgRNA.raw,p17.GsgRNA.raw,p17.GTsgRNA.raw,p17.TAsgRNA.raw,p17.TCsgRNA.raw,p17.TGsgRNA.raw,p17.TsgRNA.raw,p17.TTsgRNA.raw,p17chargeraw,p17homo_energyraw,p17homo_lumo_energygapraw,p17molecular_volumeraw,p17num_aromaticbondsraw,p17num_atomsraw,p17num_Oatomsraw,p17num_Patomsraw,p17num_Satomsraw,p17num_Seatomsraw,p17relativenum_Hatomsraw,p17relativenum_Satomsraw,p17relativenum_Seatomsraw,p17rot_constants_yraw,p17rot_constants_zraw,p17xz_quadrupoleraw,p17yy_polarizabilityraw,p18.AAsgRNA.raw,p18.ACsgRNA.raw,p18.AGsgRNA.raw,p18.AsgRNA.raw,p18.ATsgRNA.raw,p18.CAsgRNA.raw,p18.CCsgRNA.raw,p18.CGsgRNA.raw,p18.CsgRNA.raw,p18.CTsgRNA.raw,p18.GAsgRNA.raw,p18.
GCsgRNA.raw,p18.GGsgRNA.raw,p18.GsgRNA.raw,p18.GTsgRNA.raw,p18.TAsgRNA.raw,p18.TCsgRNA.raw,p18.TGsgRNA.raw,p18.TsgRNA.raw,p18.TTsgRNA.raw,p18chargeraw,p18homo_energyraw,p18homo_lumo_energygapraw,p18molecular_volumeraw,p18num_aromaticbondsraw,p18num_atomsraw,p18num_Oatomsraw,p18num_Patomsraw,p18num_Satomsraw,p18num_Seatomsraw,p18relativenum_Hatomsraw,p18relativenum_Satomsraw,p18relativenum_Seatomsraw,p18rot_constants_yraw,p18rot_constants_zraw,p18xz_quadrupoleraw,p18yy_polarizabilityraw,p19.AAsgRNA.raw,p19.ACsgRNA.raw,p19.AGsgRNA.raw,p19.AsgRNA.raw,p19.ATsgRNA.raw,p19.CAsgRNA.raw,p19.CCsgRNA.raw,p19.CGsgRNA.raw,p19.CsgRNA.raw,p19.CTsgRNA.raw,p19.GAsgRNA.raw,p19.GCsgRNA.raw,p19.GGsgRNA.raw,p19.GsgRNA.raw,p19.GTsgRNA.raw,p19.TAsgRNA.raw,p19.TCsgRNA.raw,p19.TGsgRNA.raw,p19.TsgRNA.raw,p19.TTsgRNA.raw,p19chargeraw,p19homo_energyraw,p19homo_lumo_energygapraw,p19molecular_volumeraw,p19num_aromaticbondsraw,p19num_atomsraw,p19num_Oatomsraw,p19num_Patomsraw,p19num_Satomsraw,p19num_Seatomsraw,p19relativenum_Hatomsraw,p19relativenum_Satomsraw,p19relativenum_Seatomsraw,p19rot_constants_yraw,p19rot_constants_zraw,p19xz_quadrupoleraw,p19yy_polarizabilityraw,p1chargeraw,p1homo_energyraw,p1homo_lumo_energygapraw,p1molecular_volumeraw,p1num_aromaticbondsraw,p1num_atomsraw,p1num_Oatomsraw,p1num_Patomsraw,p1num_Satomsraw,p1num_Seatomsraw,p1relativenum_Hatomsraw,p1relativenum_Satomsraw,p1relativenum_Seatomsraw,p1rot_constants_yraw,p1rot_constants_zraw,p1xz_quadrupoleraw,p1yy_polarizabilityraw,p2.AAsgRNA.raw,p2.ACsgRNA.raw,p2.AGsgRNA.raw,p2.AsgRNA.raw,p2.ATsgRNA.raw,p2.CAsgRNA.raw,p2.CCsgRNA.raw,p2.CGsgRNA.raw,p2.CsgRNA.raw,p2.CTsgRNA.raw,p2.GAsgRNA.raw,p2.GCsgRNA.raw,p2.GGsgRNA.raw,p2.GsgRNA.raw,p2.GTsgRNA.raw,p2.TAsgRNA.raw,p2.TCsgRNA.raw,p2.TGsgRNA.raw,p2.TsgRNA.raw,p2.TTsgRNA.raw,p20.AsgRNA.raw,p20.CsgRNA.raw,p20.GsgRNA.raw,p20.TsgRNA.raw,p20chargeraw,p20homo_energyraw,p20homo_lumo_energygapraw,p20molecular_volumeraw,p20num_aromaticbondsraw,p20num_atomsraw,p20num_Oatomsraw,p20num_Pato
msraw,p20num_Satomsraw,p20num_Seatomsraw,p20relativenum_Hatomsraw,p20relativenum_Satomsraw,p20relativenum_Seatomsraw,p20rot_constants_yraw,p20rot_constants_zraw,p20xz_quadrupoleraw,p20yy_polarizabilityraw,p2chargeraw,p2homo_energyraw,p2homo_lumo_energygapraw,p2molecular_volumeraw,p2num_aromaticbondsraw,p2num_atomsraw,p2num_Oatomsraw,p2num_Patomsraw,p2num_Satomsraw,p2num_Seatomsraw,p2relativenum_Hatomsraw,p2relativenum_Satomsraw,p2relativenum_Seatomsraw,p2rot_constants_yraw,p2rot_constants_zraw,p2xz_quadrupoleraw,p2yy_polarizabilityraw,p3.AAsgRNA.raw,p3.ACsgRNA.raw,p3.AGsgRNA.raw,p3.AsgRNA.raw,p3.ATsgRNA.raw,p3.CAsgRNA.raw,p3.CCsgRNA.raw,p3.CGsgRNA.raw,p3.CsgRNA.raw,p3.CTsgRNA.raw,p3.GAsgRNA.raw,p3.GCsgRNA.raw,p3.GGsgRNA.raw,p3.GsgRNA.raw,p3.GTsgRNA.raw,p3.TAsgRNA.raw,p3.TCsgRNA.raw,p3.TGsgRNA.raw,p3.TsgRNA.raw,p3.TTsgRNA.raw,p3chargeraw,p3homo_energyraw,p3homo_lumo_energygapraw,p3molecular_volumeraw,p3num_aromaticbondsraw,p3num_atomsraw,p3num_Oatomsraw,p3num_Patomsraw,p3num_Satomsraw,p3num_Seatomsraw,p3relativenum_Hatomsraw,p3relativenum_Satomsraw,p3relativenum_Seatomsraw,p3rot_constants_yraw,p3rot_constants_zraw,p3xz_quadrupoleraw,p3yy_polarizabilityraw,p4.AAsgRNA.raw,p4.ACsgRNA.raw,p4.AGsgRNA.raw,p4.AsgRNA.raw,p4.ATsgRNA.raw,p4.CAsgRNA.raw,p4.CCsgRNA.raw,p4.CGsgRNA.raw,p4.CsgRNA.raw,p4.CTsgRNA.raw,p4.GAsgRNA.raw,p4.GCsgRNA.raw,p4.GGsgRNA.raw,p4.GsgRNA.raw,p4.GTsgRNA.raw,p4.TAsgRNA.raw,p4.TCsgRNA.raw,p4.TGsgRNA.raw,p4.TsgRNA.raw,p4.TTsgRNA.raw,p4chargeraw,p4homo_energyraw,p4homo_lumo_energygapraw,p4molecular_volumeraw,p4num_aromaticbondsraw,p4num_atomsraw,p4num_Oatomsraw,p4num_Patomsraw,p4num_Satomsraw,p4num_Seatomsraw,p4relativenum_Hatomsraw,p4relativenum_Satomsraw,p4relativenum_Seatomsraw,p4rot_constants_yraw,p4rot_constants_zraw,p4xz_quadrupoleraw,p4yy_polarizabilityraw,p5.AAsgRNA.raw,p5.ACsgRNA.raw,p5.AGsgRNA.raw,p5.AsgRNA.raw,p5.ATsgRNA.raw,p5.CAsgRNA.raw,p5.CCsgRNA.raw,p5.CGsgRNA.raw,p5.CsgRNA.raw,p5.CTsgRNA.raw,p5.GAsgRNA.raw,p5.GCsgRNA.raw,p5.GGsgRNA.raw,p5
.GsgRNA.raw,p5.GTsgRNA.raw,p5.TAsgRNA.raw,p5.TCsgRNA.raw,p5.TGsgRNA.raw,p5.TsgRNA.raw,p5.TTsgRNA.raw,p5chargeraw,p5homo_energyraw,p5homo_lumo_energygapraw,p5molecular_volumeraw,p5num_aromaticbondsraw,p5num_atomsraw,p5num_Oatomsraw,p5num_Patomsraw,p5num_Satomsraw,p5num_Seatomsraw,p5relativenum_Hatomsraw,p5relativenum_Satomsraw,p5relativenum_Seatomsraw,p5rot_constants_yraw,p5rot_constants_zraw,p5xz_quadrupoleraw,p5yy_polarizabilityraw,p6.AAsgRNA.raw,p6.ACsgRNA.raw,p6.AGsgRNA.raw,p6.AsgRNA.raw,p6.ATsgRNA.raw,p6.CAsgRNA.raw,p6.CCsgRNA.raw,p6.CGsgRNA.raw,p6.CsgRNA.raw,p6.CTsgRNA.raw,p6.GAsgRNA.raw,p6.GCsgRNA.raw,p6.GGsgRNA.raw,p6.GsgRNA.raw,p6.GTsgRNA.raw,p6.TAsgRNA.raw,p6.TCsgRNA.raw,p6.TGsgRNA.raw,p6.TsgRNA.raw,p6.TTsgRNA.raw,p6chargeraw,p6homo_energyraw,p6homo_lumo_energygapraw,p6molecular_volumeraw,p6num_aromaticbondsraw,p6num_atomsraw,p6num_Oatomsraw,p6num_Patomsraw,p6num_Satomsraw,p6num_Seatomsraw,p6relativenum_Hatomsraw,p6relativenum_Satomsraw,p6relativenum_Seatomsraw,p6rot_constants_yraw,p6rot_constants_zraw,p6xz_quadrupoleraw,p6yy_polarizabilityraw,p7.AAsgRNA.raw,p7.ACsgRNA.raw,p7.AGsgRNA.raw,p7.AsgRNA.raw,p7.ATsgRNA.raw,p7.CAsgRNA.raw,p7.CCsgRNA.raw,p7.CGsgRNA.raw,p7.CsgRNA.raw,p7.CTsgRNA.raw,p7.GAsgRNA.raw,p7.GCsgRNA.raw,p7.GGsgRNA.raw,p7.GsgRNA.raw,p7.GTsgRNA.raw,p7.TAsgRNA.raw,p7.TCsgRNA.raw,p7.TGsgRNA.raw,p7.TsgRNA.raw,p7.TTsgRNA.raw,p7chargeraw,p7homo_energyraw,p7homo_lumo_energygapraw,p7molecular_volumeraw,p7num_aromaticbondsraw,p7num_atomsraw,p7num_Oatomsraw,p7num_Patomsraw,p7num_Satomsraw,p7num_Seatomsraw,p7relativenum_Hatomsraw,p7relativenum_Satomsraw,p7relativenum_Seatomsraw,p7rot_constants_yraw,p7rot_constants_zraw,p7xz_quadrupoleraw,p7yy_polarizabilityraw,p8.AAsgRNA.raw,p8.ACsgRNA.raw,p8.AGsgRNA.raw,p8.AsgRNA.raw,p8.ATsgRNA.raw,p8.CAsgRNA.raw,p8.CCsgRNA.raw,p8.CGsgRNA.raw,p8.CsgRNA.raw,p8.CTsgRNA.raw,p8.GAsgRNA.raw,p8.GCsgRNA.raw,p8.GGsgRNA.raw,p8.GsgRNA.raw,p8.GTsgRNA.raw,p8.TAsgRNA.raw,p8.TCsgRNA.raw,p8.TGsgRNA.raw,p8.TsgRNA.raw,p8.TTsgRNA.raw,p8c
hargeraw,p8homo_energyraw,p8homo_lumo_energygapraw,p8molecular_volumeraw,p8num_aromaticbondsraw,p8num_atomsraw,p8num_Oatomsraw,p8num_Patomsraw,p8num_Satomsraw,p8num_Seatomsraw,p8relativenum_Hatomsraw,p8relativenum_Satomsraw,p8relativenum_Seatomsraw,p8rot_constants_yraw,p8rot_constants_zraw,p8xz_quadrupoleraw,p8yy_polarizabilityraw,p9.AAsgRNA.raw,p9.ACsgRNA.raw,p9.AGsgRNA.raw,p9.AsgRNA.raw,p9.ATsgRNA.raw,p9.CAsgRNA.raw,p9.CCsgRNA.raw,p9.CGsgRNA.raw,p9.CsgRNA.raw,p9.CTsgRNA.raw,p9.GAsgRNA.raw,p9.GCsgRNA.raw,p9.GGsgRNA.raw,p9.GsgRNA.raw,p9.GTsgRNA.raw,p9.TAsgRNA.raw,p9.TCsgRNA.raw,p9.TGsgRNA.raw,p9.TsgRNA.raw,p9.TTsgRNA.raw,p9chargeraw,p9homo_energyraw,p9homo_lumo_energygapraw,p9molecular_volumeraw,p9num_aromaticbondsraw,p9num_atomsraw,p9num_Oatomsraw,p9num_Patomsraw,p9num_Satomsraw,p9num_Seatomsraw,p9relativenum_Hatomsraw,p9relativenum_Satomsraw,p9relativenum_Seatomsraw,p9rot_constants_yraw,p9rot_constants_zraw,p9xz_quadrupoleraw,p9yy_polarizabilityraw,sgRNA.gcsgRNA.raw,sgRNA.structuresgRNA.raw,TAsgRNA.raw,TCsgRNA.raw,TGsgRNA.raw,TsgRNA.raw,TTsgRNA.raw,sgRNA_gatc.dwtd1.x,sgRNA_gatc.dwtd10.x,sgRNA_gatc.dwtd11.x,sgRNA_gatc.dwtd12.x,sgRNA_gatc.dwtd13.x,sgRNA_gatc.dwtd14.x,sgRNA_gatc.dwtd15.x,sgRNA_gatc.dwtd16.x,sgRNA_gatc.dwtd17.x,sgRNA_gatc.dwtd18.x,sgRNA_gatc.dwtd19.x,sgRNA_gatc.dwtd2.x,sgRNA_gatc.dwtd20.x,sgRNA_gatc.dwtd21.x,sgRNA_gatc.dwtd22.x,sgRNA_gatc.dwtd4.x,sgRNA_gatc.dwtd5.x,sgRNA_gatc.dwtd6.x,sgRNA_gatc.dwtd7.x,sgRNA_gatc.dwtd8.x,sgRNA_gatc.dwtd9.x,sgRNA_gatc.dwts22.x,sgRNA_gc.dwtd1.x,sgRNA_gc.dwtd10.x,sgRNA_gc.dwtd11.x,sgRNA_gc.dwtd12.x,sgRNA_gc.dwtd13.x,sgRNA_gc.dwtd14.x,sgRNA_gc.dwtd15.x,sgRNA_gc.dwtd16.x,sgRNA_gc.dwtd17.x,sgRNA_gc.dwtd18.x,sgRNA_gc.dwtd19.x,sgRNA_gc.dwtd2.x,sgRNA_gc.dwtd20.x,sgRNA_gc.dwtd21.x,sgRNA_gc.dwtd22.x,sgRNA_gc.dwtd3.x,sgRNA_gc.dwtd4.x,sgRNA_gc.dwtd5.x,sgRNA_gc.dwtd6.x,sgRNA_gc.dwtd7.x,sgRNA_gc.dwtd8.x,sgRNA_gc.dwtd9.x,sgRNA_gc.dwts22.x,sgRNA_gene.dwtd1.x,sgRNA_gene.dwtd10.x,sgRNA_gene.dwtd11.x,sgRNA_gene.dwtd12.x,sgRNA_gene.dwtd13
.x,sgRNA_gene.dwtd14.x,sgRNA_gene.dwtd15.x,sgRNA_gene.dwtd16.x,sgRNA_gene.dwtd17.x,sgRNA_gene.dwtd18.x,sgRNA_gene.dwtd19.x,sgRNA_gene.dwtd2.x,sgRNA_gene.dwtd20.x,sgRNA_gene.dwtd21.x,sgRNA_gene.dwtd22.x,sgRNA_gene.dwtd3.x,sgRNA_gene.dwtd4.x,sgRNA_gene.dwtd5.x,sgRNA_gene.dwtd6.x,sgRNA_gene.dwtd7.x,sgRNA_gene.dwtd8.x,sgRNA_gene.dwtd9.x,sgRNA_gene.dwts22.x,sgRNA_ipd.dwtd1.x,sgRNA_ipd.dwtd10.x,sgRNA_ipd.dwtd11.x,sgRNA_ipd.dwtd12.x,sgRNA_ipd.dwtd13.x,sgRNA_ipd.dwtd14.x,sgRNA_ipd.dwtd15.x,sgRNA_ipd.dwtd16.x,sgRNA_ipd.dwtd17.x,sgRNA_ipd.dwtd18.x,sgRNA_ipd.dwtd19.x,sgRNA_ipd.dwtd2.x,sgRNA_ipd.dwtd20.x,sgRNA_ipd.dwtd21.x,sgRNA_ipd.dwtd22.x,sgRNA_ipd.dwtd3.x,sgRNA_ipd.dwtd4.x,sgRNA_ipd.dwtd5.x,sgRNA_ipd.dwtd6.x,sgRNA_ipd.dwtd7.x,sgRNA_ipd.dwtd8.x,sgRNA_ipd.dwtd9.x,sgRNA_ipd.dwts22.x,sgRNA_rna.dwtd1.x,sgRNA_rna.dwtd10.x,sgRNA_rna.dwtd11.x,sgRNA_rna.dwtd12.x,sgRNA_rna.dwtd13.x,sgRNA_rna.dwtd14.x,sgRNA_rna.dwtd15.x,sgRNA_rna.dwtd16.x,sgRNA_rna.dwtd17.x,sgRNA_rna.dwtd18.x,sgRNA_rna.dwtd19.x,sgRNA_rna.dwtd2.x,sgRNA_rna.dwtd20.x,sgRNA_rna.dwtd21.x,sgRNA_rna.dwtd3.x,sgRNA_rna.dwtd4.x,sgRNA_rna.dwtd5.x,sgRNA_rna.dwtd6.x,sgRNA_rna.dwtd7.x,sgRNA_rna.dwtd8.x,sgRNA_rna.dwtd9.x,sgRNA_rna.dwts22.x,sgRNA_structure.dwtd1.x,sgRNA_structure.dwtd10.x,sgRNA_structure.dwtd11.x,sgRNA_structure.dwtd12.x,sgRNA_structure.dwtd13.x,sgRNA_structure.dwtd14.x,sgRNA_structure.dwtd15.x,sgRNA_structure.dwtd16.x,sgRNA_structure.dwtd17.x,sgRNA_structure.dwtd18.x,sgRNA_structure.dwtd19.x,sgRNA_structure.dwtd2.x,sgRNA_structure.dwtd20.x,sgRNA_structure.dwtd21.x,sgRNA_structure.dwtd3.x,sgRNA_structure.dwtd4.x,sgRNA_structure.dwtd5.x,sgRNA_structure.dwtd6.x,sgRNA_structure.dwtd7.x,sgRNA_structure.dwtd8.x,sgRNA_structure.dwtd9.x,sgRNA_structure.dwts22.x,sgRNA_gatc.dwtd3.y,sgRNA_gatc.dwtd4.y,sgRNA_gatc.dwtd5.y,sgRNA_gatc.dwtd6.y,sgRNA_gc.dwtd1.y,sgRNA_gc.dwtd2.y,sgRNA_gc.dwtd3.y,sgRNA_gc.dwtd4.y,sgRNA_gc.dwtd5.y,sgRNA_gc.dwtd6.y,sgRNA_gc.dwtd7.y,sgRNA_gene.dwtd1.y,sgRNA_gene.dwtd2.y,sgRNA_gene.dwtd3.y,sgRNA_gene.dwtd
4.y,sgRNA_gene.dwtd5.y,sgRNA_gene.dwtd6.y,sgRNA_gene.dwtd7.y,sgRNA_ipd.dwtd1.y,sgRNA_ipd.dwtd2.y,sgRNA_ipd.dwtd3.y,sgRNA_ipd.dwtd4.y,sgRNA_ipd.dwtd5.y,sgRNA_ipd.dwtd6.y,sgRNA_ipd.dwtd7.y,sgRNA_rna.dwtd1.y,sgRNA_rna.dwtd2.y,sgRNA_rna.dwtd3.y,sgRNA_rna.dwtd4.y,sgRNA_rna.dwtd5.y,sgRNA_rna.dwtd6.y,sgRNA_rna.dwtd7.y,sgRNA_structure.dwtd1.y,sgRNA_structure.dwtd2.y,sgRNA_structure.dwtd3.y,sgRNA_structure.dwtd4.y,sgRNA_structure.dwtd5.y,sgRNA_structure.dwtd6.y,sgRNA_structure.dwtd7.y']]
# Split the data into train and test data:
X_train,X_test,Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
# Build the model with the random forest regression algorithm:
model = RandomForestRegressor(max_depth=6,random_state=0,n_estimators=10)
model.fit(X_train, Y_train)
import shap
shap_values = shap.TreeExplainer(model).shap_values(X_train)
f = plt.figure()
shap.summary_plot(shap_values, X_train, plot_type="bar")
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_summary_plot_bar.png", bbox_inches='tight', dpi=600)
import matplotlib.pyplot as plt
f = plt.figure()
shap.summary_plot(shap_values, X_train)
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_summary_plot_varimp.png", bbox_inches='tight', dpi=600)
# directionality of feature importance
def shap_direction_table(df_shap, df):
    """Build the per-feature table that ABS_SHAP plots.

    Parameters
    ----------
    df_shap : array-like accepted by ``pd.DataFrame``
        SHAP values with one column per column of ``df``, rows in the same
        order as the rows of ``df``.
    df : pandas.DataFrame
        Feature values; its column names label the features.

    Returns
    -------
    pandas.DataFrame
        Columns ``Variable``, ``SHAP_abs`` (mean absolute SHAP value),
        ``Corr`` (correlation between a feature's values and its SHAP values,
        NaN replaced by 0) and ``Sign`` ('red' when ``Corr`` > 0, else 'blue'),
        sorted by ``SHAP_abs`` in ascending order.
    """
    feature_list = df.columns
    shap_v = pd.DataFrame(df_shap)
    shap_v.columns = feature_list
    # Positional row index so feature values line up with the SHAP rows.
    df_v = df.reset_index(drop=True)

    # Sign of the feature-value / SHAP-value correlation decides the bar colour.
    corr_list = [np.corrcoef(shap_v[i], df_v[i])[1][0] for i in feature_list]
    corr_df = pd.concat([pd.Series(feature_list), pd.Series(corr_list)], axis=1).fillna(0)
    corr_df.columns = ['Variable', 'Corr']
    corr_df['Sign'] = np.where(corr_df['Corr'] > 0, 'red', 'blue')

    # Mean absolute SHAP value per feature is the bar length.
    k = pd.DataFrame(np.abs(shap_v).mean()).reset_index()
    k.columns = ['Variable', 'SHAP_abs']
    k2 = k.merge(corr_df, left_on='Variable', right_on='Variable', how='inner')
    return k2.sort_values(by='SHAP_abs', ascending=True)


def ABS_SHAP(df_shap, df):
    """Plot mean |SHAP| per feature as horizontal bars coloured by direction.

    Bars are red when a feature's values correlate positively with its SHAP
    values and blue otherwise. The chart is drawn on a new 5x6 inch figure.

    Parameters
    ----------
    df_shap, df : see ``shap_direction_table``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the bar chart (use ``ax.figure`` to save it).
    """
    # Imported here so the function is self-contained.
    import matplotlib.pyplot as plt

    k2 = shap_direction_table(df_shap, df)
    fig, ax = plt.subplots(figsize=(5, 6))
    # Explicit per-bar colour list, one entry per feature.
    ax.barh(k2['Variable'].tolist(), k2['SHAP_abs'].tolist(), color=k2['Sign'].tolist())
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")
    return ax
# The bar chart is drawn on a figure created during the ABS_SHAP call, so a figure
# opened beforehand with plt.figure() would be saved empty. Grab the current
# figure after the call and save that one instead.
ABS_SHAP(shap_values,X_train)
f = plt.gcf()
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_summary_plot_abs.png", bbox_inches='tight', dpi=600)
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate shap
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
paste dwt20bp.cas9.score.txt dwt20bp.noncor2.cas9.features_overlap_noSampleIDs.txt > ecoli.20sliding.cas9.txt
# python
# Libraries for loading the table, modelling and plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator.
np.random.seed(0)

# Read the tab-separated score + feature table.
df = pd.read_csv('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/ecoli.20sliding.cas9.txt', sep='\t')

# Regression target: the 'cut.score' column.
Y = df['cut.score']
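# Feature columns to model; the list was generated in R with dput(colnames(df)).
# NOTE: the list literal below is hard-wrapped in this file and some breaks fall inside quoted names; rejoin it into one line before running it in Python.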
X = df[['ACsgRNA.raw','AGsgRNA.raw','AsgRNA.raw','ATsgRNA.raw','CAsgRNA.raw','CCsgRNA.raw','CGsgRNA.raw','CsgRNA.raw','GTsgRNA.raw','p1.AAsgRNA.raw','p1.ACsgRNA.raw','p1.AGsgRNA.raw','p1.AsgRNA.raw','p1.ATsgRNA.raw','p1.CAsgRNA.raw','p1.CCsgRNA.raw','p1.CGsgRNA.raw','p1.CsgRNA.raw','p1.CTsgRNA.raw','p1.GAsgRNA.raw','p1.GCsgRNA.raw','p1.GGsgRNA.raw','p1.GsgRNA.raw','p1.GTsgRNA.raw','p1.TAsgRNA.raw','p1.TCsgRNA.raw','p1.TGsgRNA.raw','p1.TsgRNA.raw','p1.TTsgRNA.raw','p10.AAsgRNA.raw','p10.ACsgRNA.raw','p10.AGsgRNA.raw','p10.ATsgRNA.raw','p10.CAsgRNA.raw','p10.CCsgRNA.raw','p10.CGsgRNA.raw','p10.CTsgRNA.raw','p10.GAsgRNA.raw','p10.GCsgRNA.raw','p10.GGsgRNA.raw','p10.GTsgRNA.raw','p10.TAsgRNA.raw','p10.TCsgRNA.raw','p10.TGsgRNA.raw','p10.TTsgRNA.raw','p10chargeraw','p10homo_energyraw','p10homo_lumo_energygapraw','p10molecular_volumeraw','p10num_aromaticbondsraw','p10num_atomsraw','p10num_Hatomsraw','p10num_Oatomsraw','p10num_Patomsraw','p10num_Satomsraw','p10num_Seatomsraw','p10relativenum_Hatomsraw','p10relativenum_Patomsraw','p10relativenum_Satomsraw','p10relativenum_Seatomsraw','p10rot_constants_yraw','p10rot_constants_zraw','p10xy_polarizabilityraw','p10xz_quadrupoleraw','p10yy_polarizabilityraw','p11.AAsgRNA.raw','p11.ACsgRNA.raw','p11.AGsgRNA.raw','p11.ATsgRNA.raw','p11.CAsgRNA.raw','p11.CCsgRNA.raw','p11.CGsgRNA.raw','p11.CTsgRNA.raw','p11.GAsgRNA.raw','p11.GCsgRNA.raw','p11.GGsgRNA.raw','p11.GTsgRNA.raw','p11.TAsgRNA.raw','p11.TCsgRNA.raw','p11.TGsgRNA.raw','p11.TTsgRNA.raw','p11chargeraw','p11homo_energyraw','p11homo_lumo_energygapraw','p11molecular_volumeraw','p11num_aromaticbondsraw','p11num_atomsraw','p11num_Hatomsraw','p11num_Oatomsraw','p11num_Patomsraw','p11num_Satomsraw','p11num_Seatomsraw','p11relativenum_Hatomsraw','p11relativenum_Patomsraw','p11relativenum_Satomsraw','p11relativenum_Seatomsraw','p11rot_constants_yraw','p11rot_constants_zraw','p11xy_polarizabilityraw','p11xz_quadrupoleraw','p11yy_polarizabilityraw','p12.AAsgRNA.raw','p12.ACsgRNA.raw','p
12.AGsgRNA.raw','p12.ATsgRNA.raw','p12.CAsgRNA.raw','p12.CCsgRNA.raw','p12.CGsgRNA.raw','p12.CTsgRNA.raw','p12.GAsgRNA.raw','p12.GCsgRNA.raw','p12.GGsgRNA.raw','p12.GTsgRNA.raw','p12.TAsgRNA.raw','p12.TCsgRNA.raw','p12.TGsgRNA.raw','p12.TTsgRNA.raw','p12chargeraw','p12homo_energyraw','p12homo_lumo_energygapraw','p12molecular_volumeraw','p12num_aromaticbondsraw','p12num_atomsraw','p12num_Hatomsraw','p12num_Oatomsraw','p12num_Patomsraw','p12num_Satomsraw','p12num_Seatomsraw','p12relativenum_Hatomsraw','p12relativenum_Patomsraw','p12relativenum_Satomsraw','p12relativenum_Seatomsraw','p12rot_constants_yraw','p12rot_constants_zraw','p12xy_polarizabilityraw','p12xz_quadrupoleraw','p12yy_polarizabilityraw','p13.AAsgRNA.raw','p13.ACsgRNA.raw','p13.AGsgRNA.raw','p13.ATsgRNA.raw','p13.CAsgRNA.raw','p13.CCsgRNA.raw','p13.CGsgRNA.raw','p13.CTsgRNA.raw','p13.GAsgRNA.raw','p13.GCsgRNA.raw','p13.GGsgRNA.raw','p13.GTsgRNA.raw','p13.TAsgRNA.raw','p13.TCsgRNA.raw','p13.TGsgRNA.raw','p13.TTsgRNA.raw','p13chargeraw','p13homo_energyraw','p13homo_lumo_energygapraw','p13molecular_volumeraw','p13num_aromaticbondsraw','p13num_atomsraw','p13num_Hatomsraw','p13num_Oatomsraw','p13num_Patomsraw','p13num_Satomsraw','p13num_Seatomsraw','p13relativenum_Hatomsraw','p13relativenum_Patomsraw','p13relativenum_Satomsraw','p13relativenum_Seatomsraw','p13rot_constants_yraw','p13rot_constants_zraw','p13xy_polarizabilityraw','p13xz_quadrupoleraw','p13yy_polarizabilityraw','p14.AAsgRNA.raw','p14.ACsgRNA.raw','p14.AGsgRNA.raw','p14.ATsgRNA.raw','p14.CAsgRNA.raw','p14.CCsgRNA.raw','p14.CGsgRNA.raw','p14.CTsgRNA.raw','p14.GAsgRNA.raw','p14.GCsgRNA.raw','p14.GGsgRNA.raw','p14.GTsgRNA.raw','p14.TAsgRNA.raw','p14.TCsgRNA.raw','p14.TGsgRNA.raw','p14.TTsgRNA.raw','p14chargeraw','p14homo_energyraw','p14homo_lumo_energygapraw','p14molecular_volumeraw','p14num_aromaticbondsraw','p14num_atomsraw','p14num_Hatomsraw','p14num_Oatomsraw','p14num_Patomsraw','p14num_Satomsraw','p14num_Seatomsraw','p14relativenum_Hatomsraw','
p14relativenum_Patomsraw','p14relativenum_Satomsraw','p14relativenum_Seatomsraw','p14rot_constants_yraw','p14rot_constants_zraw','p14xy_polarizabilityraw','p14xz_quadrupoleraw','p14yy_polarizabilityraw','p15.AAsgRNA.raw','p15.ACsgRNA.raw','p15.AGsgRNA.raw','p15.ATsgRNA.raw','p15.CAsgRNA.raw','p15.CCsgRNA.raw','p15.CGsgRNA.raw','p15.CTsgRNA.raw','p15.GAsgRNA.raw','p15.GCsgRNA.raw','p15.GGsgRNA.raw','p15.GTsgRNA.raw','p15.TAsgRNA.raw','p15.TCsgRNA.raw','p15.TGsgRNA.raw','p15.TTsgRNA.raw','p15chargeraw','p15homo_energyraw','p15homo_lumo_energygapraw','p15molecular_volumeraw','p15num_aromaticbondsraw','p15num_atomsraw','p15num_Hatomsraw','p15num_Oatomsraw','p15num_Patomsraw','p15num_Satomsraw','p15num_Seatomsraw','p15relativenum_Hatomsraw','p15relativenum_Patomsraw','p15relativenum_Satomsraw','p15relativenum_Seatomsraw','p15rot_constants_yraw','p15rot_constants_zraw','p15xy_polarizabilityraw','p15xz_quadrupoleraw','p15yy_polarizabilityraw','p16.AAsgRNA.raw','p16.ACsgRNA.raw','p16.AGsgRNA.raw','p16.ATsgRNA.raw','p16.CAsgRNA.raw','p16.CCsgRNA.raw','p16.CGsgRNA.raw','p16.CTsgRNA.raw','p16.GAsgRNA.raw','p16.GCsgRNA.raw','p16.GGsgRNA.raw','p16.GTsgRNA.raw','p16.TAsgRNA.raw','p16.TCsgRNA.raw','p16.TGsgRNA.raw','p16.TTsgRNA.raw','p16chargeraw','p16homo_energyraw','p16homo_lumo_energygapraw','p16molecular_volumeraw','p16num_aromaticbondsraw','p16num_atomsraw','p16num_Hatomsraw','p16num_Oatomsraw','p16num_Patomsraw','p16num_Satomsraw','p16num_Seatomsraw','p16relativenum_Hatomsraw','p16relativenum_Patomsraw','p16relativenum_Satomsraw','p16relativenum_Seatomsraw','p16rot_constants_yraw','p16rot_constants_zraw','p16xy_polarizabilityraw','p16xz_quadrupoleraw','p16yy_polarizabilityraw','p17.AAsgRNA.raw','p17.ACsgRNA.raw','p17.AGsgRNA.raw','p17.ATsgRNA.raw','p17.CAsgRNA.raw','p17.CCsgRNA.raw','p17.CGsgRNA.raw','p17.CTsgRNA.raw','p17.GAsgRNA.raw','p17.GCsgRNA.raw','p17.GGsgRNA.raw','p17.GTsgRNA.raw','p17.TAsgRNA.raw','p17.TCsgRNA.raw','p17.TGsgRNA.raw','p17.TTsgRNA.raw','p17chargeraw',
'p17homo_energyraw','p17homo_lumo_energygapraw','p17molecular_volumeraw','p17num_aromaticbondsraw','p17num_atomsraw','p17num_Hatomsraw','p17num_Oatomsraw','p17num_Patomsraw','p17num_Satomsraw','p17num_Seatomsraw','p17relativenum_Hatomsraw','p17relativenum_Patomsraw','p17relativenum_Satomsraw','p17relativenum_Seatomsraw','p17rot_constants_yraw','p17rot_constants_zraw','p17xy_polarizabilityraw','p17xz_quadrupoleraw','p17yy_polarizabilityraw','p18.AAsgRNA.raw','p18.ACsgRNA.raw','p18.AGsgRNA.raw','p18.ATsgRNA.raw','p18.CAsgRNA.raw','p18.CCsgRNA.raw','p18.CGsgRNA.raw','p18.CTsgRNA.raw','p18.GAsgRNA.raw','p18.GCsgRNA.raw','p18.GGsgRNA.raw','p18.GTsgRNA.raw','p18.TAsgRNA.raw','p18.TCsgRNA.raw','p18.TGsgRNA.raw','p18.TTsgRNA.raw','p18chargeraw','p18homo_energyraw','p18homo_lumo_energygapraw','p18molecular_volumeraw','p18num_aromaticbondsraw','p18num_atomsraw','p18num_Hatomsraw','p18num_Oatomsraw','p18num_Patomsraw','p18num_Satomsraw','p18num_Seatomsraw','p18relativenum_Hatomsraw','p18relativenum_Patomsraw','p18relativenum_Satomsraw','p18relativenum_Seatomsraw','p18rot_constants_yraw','p18rot_constants_zraw','p18xy_polarizabilityraw','p18xz_quadrupoleraw','p18yy_polarizabilityraw','p19.AAsgRNA.raw','p19.ACsgRNA.raw','p19.AGsgRNA.raw','p19.ATsgRNA.raw','p19.CAsgRNA.raw','p19.CCsgRNA.raw','p19.CGsgRNA.raw','p19.CTsgRNA.raw','p19.GAsgRNA.raw','p19.GCsgRNA.raw','p19.GGsgRNA.raw','p19.GTsgRNA.raw','p19.TAsgRNA.raw','p19.TCsgRNA.raw','p19.TGsgRNA.raw','p19.TTsgRNA.raw','p19chargeraw','p19homo_energyraw','p19homo_lumo_energygapraw','p19molecular_volumeraw','p19num_aromaticbondsraw','p19num_atomsraw','p19num_Hatomsraw','p19num_Oatomsraw','p19num_Patomsraw','p19num_Satomsraw','p19num_Seatomsraw','p19relativenum_Hatomsraw','p19relativenum_Patomsraw','p19relativenum_Satomsraw','p19relativenum_Seatomsraw','p19rot_constants_yraw','p19rot_constants_zraw','p19xy_polarizabilityraw','p19xz_quadrupoleraw','p19yy_polarizabilityraw','p1chargeraw','p1homo_energyraw','p1homo_lumo_energygapraw','p
1molecular_volumeraw','p1num_aromaticbondsraw','p1num_atomsraw','p1num_Oatomsraw','p1num_Patomsraw','p1num_Satomsraw','p1num_Seatomsraw','p1relativenum_Hatomsraw','p1relativenum_Satomsraw','p1relativenum_Seatomsraw','p1rot_constants_yraw','p1rot_constants_zraw','p1xz_quadrupoleraw','p1yy_polarizabilityraw','p2.AAsgRNA.raw','p2.ACsgRNA.raw','p2.AGsgRNA.raw','p2.AsgRNA.raw','p2.ATsgRNA.raw','p2.CAsgRNA.raw','p2.CCsgRNA.raw','p2.CGsgRNA.raw','p2.CsgRNA.raw','p2.CTsgRNA.raw','p2.GAsgRNA.raw','p2.GCsgRNA.raw','p2.GGsgRNA.raw','p2.GsgRNA.raw','p2.GTsgRNA.raw','p2.TAsgRNA.raw','p2.TCsgRNA.raw','p2.TGsgRNA.raw','p2.TsgRNA.raw','p2.TTsgRNA.raw','p20chargeraw','p20homo_energyraw','p20homo_lumo_energygapraw','p20molecular_volumeraw','p20num_aromaticbondsraw','p20num_atomsraw','p20num_electroniclevels_div_num_atomsraw','p20num_Hatomsraw','p20num_Oatomsraw','p20num_Patomsraw','p20num_Satomsraw','p20num_Seatomsraw','p20relativenum_Hatomsraw','p20relativenum_Patomsraw','p20relativenum_Satomsraw','p20relativenum_Seatomsraw','p20rot_constants_yraw','p20rot_constants_zraw','p20xy_polarizabilityraw','p20xz_quadrupoleraw','p20yy_polarizabilityraw','p2chargeraw','p2homo_energyraw','p2homo_lumo_energygapraw','p2molecular_volumeraw','p2num_aromaticbondsraw','p2num_atomsraw','p2num_Oatomsraw','p2num_Patomsraw','p2num_Satomsraw','p2num_Seatomsraw','p2relativenum_Hatomsraw','p2relativenum_Satomsraw','p2relativenum_Seatomsraw','p2rot_constants_yraw','p2rot_constants_zraw','p2xz_quadrupoleraw','p2yy_polarizabilityraw','p3.AAsgRNA.raw','p3.ACsgRNA.raw','p3.AGsgRNA.raw','p3.AsgRNA.raw','p3.ATsgRNA.raw','p3.CAsgRNA.raw','p3.CCsgRNA.raw','p3.CGsgRNA.raw','p3.CsgRNA.raw','p3.CTsgRNA.raw','p3.GAsgRNA.raw','p3.GCsgRNA.raw','p3.GGsgRNA.raw','p3.GsgRNA.raw','p3.GTsgRNA.raw','p3.TAsgRNA.raw','p3.TCsgRNA.raw','p3.TGsgRNA.raw','p3.TsgRNA.raw','p3.TTsgRNA.raw','p3chargeraw','p3homo_energyraw','p3homo_lumo_energygapraw','p3molecular_volumeraw','p3num_aromaticbondsraw','p3num_atomsraw','p3num_Oatomsraw','p3n
um_Patomsraw','p3num_Satomsraw','p3num_Seatomsraw','p3relativenum_Hatomsraw','p3relativenum_Satomsraw','p3relativenum_Seatomsraw','p3rot_constants_yraw','p3rot_constants_zraw','p3xz_quadrupoleraw','p3yy_polarizabilityraw','p4.AAsgRNA.raw','p4.ACsgRNA.raw','p4.AGsgRNA.raw','p4.AsgRNA.raw','p4.ATsgRNA.raw','p4.CAsgRNA.raw','p4.CCsgRNA.raw','p4.CGsgRNA.raw','p4.CsgRNA.raw','p4.CTsgRNA.raw','p4.GAsgRNA.raw','p4.GCsgRNA.raw','p4.GGsgRNA.raw','p4.GsgRNA.raw','p4.GTsgRNA.raw','p4.TAsgRNA.raw','p4.TCsgRNA.raw','p4.TGsgRNA.raw','p4.TsgRNA.raw','p4.TTsgRNA.raw','p4chargeraw','p4homo_energyraw','p4homo_lumo_energygapraw','p4molecular_volumeraw','p4num_aromaticbondsraw','p4num_atomsraw','p4num_Oatomsraw','p4num_Patomsraw','p4num_Satomsraw','p4num_Seatomsraw','p4relativenum_Hatomsraw','p4relativenum_Satomsraw','p4relativenum_Seatomsraw','p4rot_constants_yraw','p4rot_constants_zraw','p4xz_quadrupoleraw','p4yy_polarizabilityraw','p5.AAsgRNA.raw','p5.ACsgRNA.raw','p5.AGsgRNA.raw','p5.AsgRNA.raw','p5.ATsgRNA.raw','p5.CAsgRNA.raw','p5.CCsgRNA.raw','p5.CGsgRNA.raw','p5.CsgRNA.raw','p5.CTsgRNA.raw','p5.GAsgRNA.raw','p5.GCsgRNA.raw','p5.GGsgRNA.raw','p5.GsgRNA.raw','p5.GTsgRNA.raw','p5.TAsgRNA.raw','p5.TCsgRNA.raw','p5.TGsgRNA.raw','p5.TsgRNA.raw','p5.TTsgRNA.raw','p5chargeraw','p5homo_energyraw','p5homo_lumo_energygapraw','p5molecular_volumeraw','p5num_aromaticbondsraw','p5num_atomsraw','p5num_Oatomsraw','p5num_Patomsraw','p5num_Satomsraw','p5num_Seatomsraw','p5relativenum_Hatomsraw','p5relativenum_Satomsraw','p5relativenum_Seatomsraw','p5rot_constants_yraw','p5rot_constants_zraw','p5xz_quadrupoleraw','p5yy_polarizabilityraw','p6.AAsgRNA.raw','p6.ACsgRNA.raw','p6.AGsgRNA.raw','p6.ATsgRNA.raw','p6.CAsgRNA.raw','p6.CCsgRNA.raw','p6.CGsgRNA.raw','p6.CTsgRNA.raw','p6.GAsgRNA.raw','p6.GCsgRNA.raw','p6.GGsgRNA.raw','p6.GTsgRNA.raw','p6.TAsgRNA.raw','p6.TCsgRNA.raw','p6.TGsgRNA.raw','p6.TTsgRNA.raw','p6chargeraw','p6homo_energyraw','p6homo_lumo_energygapraw','p6molecular_volumeraw','p6num_aro
maticbondsraw','p6num_atomsraw','p6num_Hatomsraw','p6num_Oatomsraw','p6num_Patomsraw','p6num_Satomsraw','p6num_Seatomsraw','p6relativenum_Hatomsraw','p6relativenum_Patomsraw','p6relativenum_Satomsraw','p6relativenum_Seatomsraw','p6rot_constants_yraw','p6rot_constants_zraw','p6xy_polarizabilityraw','p6xz_quadrupoleraw','p6yy_polarizabilityraw','p7.AAsgRNA.raw','p7.ACsgRNA.raw','p7.AGsgRNA.raw','p7.ATsgRNA.raw','p7.CAsgRNA.raw','p7.CCsgRNA.raw','p7.CGsgRNA.raw','p7.CTsgRNA.raw','p7.GAsgRNA.raw','p7.GCsgRNA.raw','p7.GGsgRNA.raw','p7.GTsgRNA.raw','p7.TAsgRNA.raw','p7.TCsgRNA.raw','p7.TGsgRNA.raw','p7.TTsgRNA.raw','p7chargeraw','p7homo_energyraw','p7homo_lumo_energygapraw','p7molecular_volumeraw','p7num_aromaticbondsraw','p7num_atomsraw','p7num_Hatomsraw','p7num_Oatomsraw','p7num_Patomsraw','p7num_Satomsraw','p7num_Seatomsraw','p7relativenum_Hatomsraw','p7relativenum_Patomsraw','p7relativenum_Satomsraw','p7relativenum_Seatomsraw','p7rot_constants_yraw','p7rot_constants_zraw','p7xy_polarizabilityraw','p7xz_quadrupoleraw','p7yy_polarizabilityraw','p8.AAsgRNA.raw','p8.ACsgRNA.raw','p8.AGsgRNA.raw','p8.ATsgRNA.raw','p8.CAsgRNA.raw','p8.CCsgRNA.raw','p8.CGsgRNA.raw','p8.CTsgRNA.raw','p8.GAsgRNA.raw','p8.GCsgRNA.raw','p8.GGsgRNA.raw','p8.GTsgRNA.raw','p8.TAsgRNA.raw','p8.TCsgRNA.raw','p8.TGsgRNA.raw','p8.TTsgRNA.raw','p8chargeraw','p8homo_energyraw','p8homo_lumo_energygapraw','p8molecular_volumeraw','p8num_aromaticbondsraw','p8num_atomsraw','p8num_Hatomsraw','p8num_Oatomsraw','p8num_Patomsraw','p8num_Satomsraw','p8num_Seatomsraw','p8relativenum_Hatomsraw','p8relativenum_Patomsraw','p8relativenum_Satomsraw','p8relativenum_Seatomsraw','p8rot_constants_yraw','p8rot_constants_zraw','p8xy_polarizabilityraw','p8xz_quadrupoleraw','p8yy_polarizabilityraw','p9.AAsgRNA.raw','p9.ACsgRNA.raw','p9.AGsgRNA.raw','p9.ATsgRNA.raw','p9.CAsgRNA.raw','p9.CCsgRNA.raw','p9.CGsgRNA.raw','p9.CTsgRNA.raw','p9.GAsgRNA.raw','p9.GCsgRNA.raw','p9.GGsgRNA.raw','p9.GTsgRNA.raw','p9.TAsgRNA.raw','p9.TCsgRNA.
raw','p9.TGsgRNA.raw','p9.TTsgRNA.raw','p9chargeraw','p9homo_energyraw','p9homo_lumo_energygapraw','p9molecular_volumeraw','p9num_aromaticbondsraw','p9num_atomsraw','p9num_Hatomsraw','p9num_Oatomsraw','p9num_Patomsraw','p9num_Satomsraw','p9num_Seatomsraw','p9relativenum_Hatomsraw','p9relativenum_Patomsraw','p9relativenum_Satomsraw','p9relativenum_Seatomsraw','p9rot_constants_yraw','p9rot_constants_zraw','p9xy_polarizabilityraw','p9xz_quadrupoleraw','p9yy_polarizabilityraw','sgRNA.gcsgRNA.raw','sgRNA.structuresgRNA.raw','TAsgRNA.raw','TCsgRNA.raw','TGsgRNA.raw','TsgRNA.raw','TTsgRNA.raw','gatc.dwtd1.x','gatc.dwtd10.x','gatc.dwtd11.x','gatc.dwtd12.x','gatc.dwtd13.x','gatc.dwtd14.x','gatc.dwtd15.x','gatc.dwtd16.x','gatc.dwtd17.x','gatc.dwtd18.x','gatc.dwtd19.x','gatc.dwtd2.x','gatc.dwtd20.x','gatc.dwtd21.x','gatc.dwtd22.x','gatc.dwtd4.x','gatc.dwtd5.x','gatc.dwtd6.x','gatc.dwtd7.x','gatc.dwtd8.x','gatc.dwtd9.x','gatc.dwts22.x','gc.dwtd1.x','gc.dwtd10.x','gc.dwtd11.x','gc.dwtd12.x','gc.dwtd13.x','gc.dwtd14.x','gc.dwtd15.x','gc.dwtd16.x','gc.dwtd17.x','gc.dwtd18.x','gc.dwtd19.x','gc.dwtd2.x','gc.dwtd20.x','gc.dwtd21.x','gc.dwtd22.x','gc.dwtd3.x','gc.dwtd4.x','gc.dwtd5.x','gc.dwtd6.x','gc.dwtd7.x','gc.dwtd8.x','gc.dwtd9.x','gc.dwts22.x','gene.dwtd1.x','gene.dwtd10.x','gene.dwtd11.x','gene.dwtd12.x','gene.dwtd13.x','gene.dwtd14.x','gene.dwtd15.x','gene.dwtd16.x','gene.dwtd17.x','gene.dwtd18.x','gene.dwtd19.x','gene.dwtd2.x','gene.dwtd20.x','gene.dwtd21.x','gene.dwtd22.x','gene.dwtd3.x','gene.dwtd4.x','gene.dwtd5.x','gene.dwtd6.x','gene.dwtd7.x','gene.dwtd8.x','gene.dwtd9.x','gene.dwts22.x','ipd.dwtd1.x','ipd.dwtd10.x','ipd.dwtd11.x','ipd.dwtd12.x','ipd.dwtd13.x','ipd.dwtd14.x','ipd.dwtd15.x','ipd.dwtd16.x','ipd.dwtd17.x','ipd.dwtd18.x','ipd.dwtd19.x','ipd.dwtd2.x','ipd.dwtd20.x','ipd.dwtd21.x','ipd.dwtd22.x','ipd.dwtd3.x','ipd.dwtd4.x','ipd.dwtd5.x','ipd.dwtd6.x','ipd.dwtd7.x','ipd.dwtd8.x','ipd.dwtd9.x','ipd.dwts22.x','rna.dwtd1.x','rna.dwtd10.x','rna.dwtd11.x','rna.dwtd1
2.x','rna.dwtd13.x','rna.dwtd14.x','rna.dwtd15.x','rna.dwtd16.x','rna.dwtd17.x','rna.dwtd18.x','rna.dwtd19.x','structure.dwtd14.x','structure.dwtd15.x','structure.dwtd16.x','structure.dwtd17.x','structure.dwtd18.x','structure.dwtd19.x','structure.dwtd2.x','structure.dwtd20.x','structure.dwtd21.x','structure.dwtd3.x','structure.dwtd4.x','structure.dwtd5.x','structure.dwtd6.x','structure.dwtd7.x','structure.dwtd8.x','structure.dwtd9.x','structure.dwts22.x']]
import shap
# Split the data into train (80%) and test (20%) sets.
# random_state=0 pins the shuffle to this call, so the split no longer depends
# on np.random.seed(0) having been run immediately beforehand with no other
# random draws in between. It should reproduce the split that sequence gives,
# since both start the same generator from seed 0 -- NOTE(review): confirm.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state=0)
# Build the model with the random forest regression algorithm:
# 10 trees, depth capped at 6, fixed seed.
model = RandomForestRegressor(max_depth=6,random_state=0,n_estimators=10)
model.fit(X_train, Y_train)
# SHAP values of the fitted forest for the training rows.
shap_values = shap.TreeExplainer(model).shap_values(X_train)
import matplotlib.pyplot as plt
# show=False in both calls keeps summary_plot from calling plt.show() itself;
# on interactive backends the figure can be emptied once show() returns, which
# would make the savefig that follows write a blank image.

# Bar chart of the features' SHAP importance.
f = plt.figure()
shap.summary_plot(shap_values, X_train, plot_type="bar", show=False)
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_cas9_summary_plot_bar.png", bbox_inches='tight', dpi=600)

# Default summary plot of the same SHAP values.
f = plt.figure()
shap.summary_plot(shap_values, X_train, show=False)
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_cas9_summary_plot_varimp.png", bbox_inches='tight', dpi=600)
# directionality of feature importance
def ABS_SHAP(df_shap, df):
    """Plot mean absolute SHAP value per feature, coloured by direction of effect.

    Parameters
    ----------
    df_shap : array-like
        SHAP values with one column per column of ``df`` (same order) and one
        row per row of ``df``.
    df : pandas.DataFrame
        Feature values the SHAP values were computed for.

    Returns
    -------
    None
        The bars are drawn with ``DataFrame.plot.barh``. No ``ax`` is passed,
        so pandas opens its own 5x6 figure rather than drawing on a figure the
        caller created beforehand.
    """
    # Label the SHAP matrix with the feature names of `df`.
    shap_v = pd.DataFrame(df_shap)
    feature_list = df.columns
    shap_v.columns = feature_list
    # Positional index; drop=True also copes with a named index, where
    # reset_index() would not create a column called 'index' to drop.
    df_v = df.reset_index(drop=True)
    # Correlation between each feature and its SHAP values decides the colour.
    corr_list = [np.corrcoef(shap_v[i], df_v[i])[1][0] for i in feature_list]
    # Constant columns give NaN correlations; treat them as 0 (-> 'blue').
    corr_df = pd.concat([pd.Series(feature_list), pd.Series(corr_list)], axis=1).fillna(0)
    # Column 1 is the feature, column 2 is the correlation coefficient.
    corr_df.columns = ['Variable', 'Corr']
    corr_df['Sign'] = np.where(corr_df['Corr'] > 0, 'red', 'blue')
    # Mean |SHAP| per feature, joined with its colour and sorted so the
    # largest bar ends up last.
    shap_abs = np.abs(shap_v)
    k = pd.DataFrame(shap_abs.mean()).reset_index()
    k.columns = ['Variable', 'SHAP_abs']
    k2 = k.merge(corr_df, left_on='Variable', right_on='Variable', how='inner')
    k2 = k2.sort_values(by='SHAP_abs', ascending=True)
    colorlist = k2['Sign'].tolist()
    ax = k2.plot.barh(x='Variable', y='SHAP_abs', color=colorlist, figsize=(5, 6), legend=False)
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")
# ABS_SHAP passes no `ax` to pandas, so the bars are drawn on a figure pandas
# opens itself. Fetch that figure after the call; a figure created beforehand
# would stay empty and be saved blank.
ABS_SHAP(shap_values,X_train)
f = plt.gcf()
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_cas9_summary_plot_abs.png", bbox_inches='tight', dpi=600)
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_cas9_summary_plot_abs.png /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/e.coli/SHAP/.
# Load conda and switch to the environment used for the SHAP analysis.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate shap
# Join the eSpCas9 score table and feature table side by side (paste separates
# them with a tab). The paste is chained to the cd with && so it cannot run,
# and write its output, in the wrong directory when the cd fails.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/ &&
  paste dwt20bp.eSpCas9.score.txt dwt20bp.noncor2.eSpCas9.features_overlap_noSampleIDs.txt > ecoli.20sliding.espcas9.txt
# python
# Libraries for loading the table, modelling and plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator.
np.random.seed(0)

# Read the tab-separated eSpCas9 score + feature table.
df = pd.read_csv('/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/ecoli.20sliding.espcas9.txt', sep='\t')

# Regression target: the 'cut.score' column.
Y = df['cut.score']
X = df[['ACsgRNA.raw','AGsgRNA.raw','AsgRNA.raw','ATsgRNA.raw','CAsgRNA.raw','CCsgRNA.raw','CGsgRNA.raw','CsgRNA.raw','GTsgRNA.raw','p1.AAsgRNA.raw','p1.ACsgRNA.raw','p1.AGsgRNA.raw','p1.AsgRNA.raw','p1.ATsgRNA.raw','p1.CAsgRNA.raw','p1.CCsgRNA.raw','p1.CGsgRNA.raw','p1.CsgRNA.raw','p1.CTsgRNA.raw','p1.GAsgRNA.raw','p1.GCsgRNA.raw','p1.GGsgRNA.raw','p1.GsgRNA.raw','p1.GTsgRNA.raw','p1.TAsgRNA.raw','p1.TCsgRNA.raw','p1.TGsgRNA.raw','p1.TsgRNA.raw','p1.TTsgRNA.raw','p10.AAsgRNA.raw','p10.ACsgRNA.raw','p10.AGsgRNA.raw','p10.ATsgRNA.raw','p10.CAsgRNA.raw','p10.CCsgRNA.raw','p10.CGsgRNA.raw','p10.CTsgRNA.raw','p10.GAsgRNA.raw','p10.GCsgRNA.raw','p10.GGsgRNA.raw','p10.GTsgRNA.raw','p10.TAsgRNA.raw','p10.TCsgRNA.raw','p10.TGsgRNA.raw','p10.TTsgRNA.raw','p10chargeraw','p10homo_energyraw','p10homo_lumo_energygapraw','p10molecular_volumeraw','p10num_aromaticbondsraw','p10num_atomsraw','p10num_Hatomsraw','p10num_Oatomsraw','p10num_Patomsraw','p10num_Satomsraw','p10num_Seatomsraw','p10relativenum_Hatomsraw','p10relativenum_Patomsraw','p10relativenum_Satomsraw','p10relativenum_Seatomsraw','p10rot_constants_yraw','p10rot_constants_zraw','p10xy_polarizabilityraw','p10xz_quadrupoleraw','p10yy_polarizabilityraw','p11.AAsgRNA.raw','p11.ACsgRNA.raw','p11.AGsgRNA.raw','p11.ATsgRNA.raw','p11.CAsgRNA.raw','p11.CCsgRNA.raw','p11.CGsgRNA.raw','p11.CTsgRNA.raw','p11.GAsgRNA.raw','p11.GCsgRNA.raw','p11.GGsgRNA.raw','p11.GTsgRNA.raw','p11.TAsgRNA.raw','p11.TCsgRNA.raw','p11.TGsgRNA.raw','p11.TTsgRNA.raw','p11chargeraw','p11homo_energyraw','p11homo_lumo_energygapraw','p11molecular_volumeraw','p11num_aromaticbondsraw','p11num_atomsraw','p11num_Hatomsraw','p11num_Oatomsraw','p11num_Patomsraw','p11num_Satomsraw','p11num_Seatomsraw','p11relativenum_Hatomsraw','p11relativenum_Patomsraw','p11relativenum_Satomsraw','p11relativenum_Seatomsraw','p11rot_constants_yraw','p11rot_constants_zraw','p11xy_polarizabilityraw','p11xz_quadrupoleraw','p11yy_polarizabilityraw','p12.AAsgRNA.raw','p12.ACsgRNA.raw','p
12.AGsgRNA.raw','p12.ATsgRNA.raw','p12.CAsgRNA.raw','p12.CCsgRNA.raw','p12.CGsgRNA.raw','p12.CTsgRNA.raw','p12.GAsgRNA.raw','p12.GCsgRNA.raw','p12.GGsgRNA.raw','p12.GTsgRNA.raw','p12.TAsgRNA.raw','p12.TCsgRNA.raw','p12.TGsgRNA.raw','p12.TTsgRNA.raw','p12chargeraw','p12homo_energyraw','p12homo_lumo_energygapraw','p12molecular_volumeraw','p12num_aromaticbondsraw','p12num_atomsraw','p12num_Hatomsraw','p12num_Oatomsraw','p12num_Patomsraw','p12num_Satomsraw','p12num_Seatomsraw','p12relativenum_Hatomsraw','p12relativenum_Patomsraw','p12relativenum_Satomsraw','p12relativenum_Seatomsraw','p12rot_constants_yraw','p12rot_constants_zraw','p12xy_polarizabilityraw','p12xz_quadrupoleraw','p12yy_polarizabilityraw','p13.AAsgRNA.raw','p13.ACsgRNA.raw','p13.AGsgRNA.raw','p13.ATsgRNA.raw','p13.CAsgRNA.raw','p13.CCsgRNA.raw','p13.CGsgRNA.raw','p13.CTsgRNA.raw','p13.GAsgRNA.raw','p13.GCsgRNA.raw','p13.GGsgRNA.raw','p13.GTsgRNA.raw','p13.TAsgRNA.raw','p13.TCsgRNA.raw','p13.TGsgRNA.raw','p13.TTsgRNA.raw','p13chargeraw','p13homo_energyraw','p13homo_lumo_energygapraw','p13molecular_volumeraw','p13num_aromaticbondsraw','p13num_atomsraw','p13num_Hatomsraw','p13num_Oatomsraw','p13num_Patomsraw','p13num_Satomsraw','p13num_Seatomsraw','p13relativenum_Hatomsraw','p13relativenum_Patomsraw','p13relativenum_Satomsraw','p13relativenum_Seatomsraw','p13rot_constants_yraw','p13rot_constants_zraw','p13xy_polarizabilityraw','p13xz_quadrupoleraw','p13yy_polarizabilityraw','p14.AAsgRNA.raw','p14.ACsgRNA.raw','p14.AGsgRNA.raw','p14.ATsgRNA.raw','p14.CAsgRNA.raw','p14.CCsgRNA.raw','p14.CGsgRNA.raw','p14.CTsgRNA.raw','p14.GAsgRNA.raw','p14.GCsgRNA.raw','p14.GGsgRNA.raw','p14.GTsgRNA.raw','p14.TAsgRNA.raw','p14.TCsgRNA.raw','p14.TGsgRNA.raw','p14.TTsgRNA.raw','p14chargeraw','p14homo_energyraw','p14homo_lumo_energygapraw','p14molecular_volumeraw','p14num_aromaticbondsraw','p14num_atomsraw','p14num_Hatomsraw','p14num_Oatomsraw','p14num_Patomsraw','p14num_Satomsraw','p14num_Seatomsraw','p14relativenum_Hatomsraw','
p14relativenum_Patomsraw','p14relativenum_Satomsraw','p14relativenum_Seatomsraw','p14rot_constants_yraw','p14rot_constants_zraw','p14xy_polarizabilityraw','p14xz_quadrupoleraw','p14yy_polarizabilityraw','p15.AAsgRNA.raw','p15.ACsgRNA.raw','p15.AGsgRNA.raw','p15.ATsgRNA.raw','p15.CAsgRNA.raw','p15.CCsgRNA.raw','p15.CGsgRNA.raw','p15.CTsgRNA.raw','p15.GAsgRNA.raw','p15.GCsgRNA.raw','p15.GGsgRNA.raw','p15.GTsgRNA.raw','p15.TAsgRNA.raw','p15.TCsgRNA.raw','p15.TGsgRNA.raw','p15.TTsgRNA.raw','p15chargeraw','p15homo_energyraw','p15homo_lumo_energygapraw','p15molecular_volumeraw','p15num_aromaticbondsraw','p15num_atomsraw','p15num_Hatomsraw','p15num_Oatomsraw','p15num_Patomsraw','p15num_Satomsraw','p15num_Seatomsraw','p15relativenum_Hatomsraw','p15relativenum_Patomsraw','p15relativenum_Satomsraw','p15relativenum_Seatomsraw','p15rot_constants_yraw','p15rot_constants_zraw','p15xy_polarizabilityraw','p15xz_quadrupoleraw','p15yy_polarizabilityraw','p16.AAsgRNA.raw','p16.ACsgRNA.raw','p16.AGsgRNA.raw','p16.ATsgRNA.raw','p16.CAsgRNA.raw','p16.CCsgRNA.raw','p16.CGsgRNA.raw','p16.CTsgRNA.raw','p16.GAsgRNA.raw','p16.GCsgRNA.raw','p16.GGsgRNA.raw','p16.GTsgRNA.raw','p16.TAsgRNA.raw','p16.TCsgRNA.raw','p16.TGsgRNA.raw','p16.TTsgRNA.raw','p16chargeraw','p16homo_energyraw','p16homo_lumo_energygapraw','p16molecular_volumeraw','p16num_aromaticbondsraw','p16num_atomsraw','p16num_Hatomsraw','p16num_Oatomsraw','p16num_Patomsraw','p16num_Satomsraw','p16num_Seatomsraw','p16relativenum_Hatomsraw','p16relativenum_Patomsraw','p16relativenum_Satomsraw','p16relativenum_Seatomsraw','p16rot_constants_yraw','p16rot_constants_zraw','p16xy_polarizabilityraw','p16xz_quadrupoleraw','p16yy_polarizabilityraw','p17.AAsgRNA.raw','p17.ACsgRNA.raw','p17.AGsgRNA.raw','p17.ATsgRNA.raw','p17.CAsgRNA.raw','p17.CCsgRNA.raw','p17.CGsgRNA.raw','p17.CTsgRNA.raw','p17.GAsgRNA.raw','p17.GCsgRNA.raw','p17.GGsgRNA.raw','p17.GTsgRNA.raw','p17.TAsgRNA.raw','p17.TCsgRNA.raw','p17.TGsgRNA.raw','p17.TTsgRNA.raw','p17chargeraw',
'p17homo_energyraw','p17homo_lumo_energygapraw','p17molecular_volumeraw','p17num_aromaticbondsraw','p17num_atomsraw','p17num_Hatomsraw','p17num_Oatomsraw','p17num_Patomsraw','p17num_Satomsraw','p17num_Seatomsraw','p17relativenum_Hatomsraw','p17relativenum_Patomsraw','p17relativenum_Satomsraw','p17relativenum_Seatomsraw','p17rot_constants_yraw','p17rot_constants_zraw','p17xy_polarizabilityraw','p17xz_quadrupoleraw','p17yy_polarizabilityraw','p18.AAsgRNA.raw','p18.ACsgRNA.raw','p18.AGsgRNA.raw','p18.ATsgRNA.raw','p18.CAsgRNA.raw','p18.CCsgRNA.raw','p18.CGsgRNA.raw','p18.CTsgRNA.raw','p18.GAsgRNA.raw','p18.GCsgRNA.raw','p18.GGsgRNA.raw','p18.GTsgRNA.raw','p18.TAsgRNA.raw','p18.TCsgRNA.raw','p18.TGsgRNA.raw','p18.TTsgRNA.raw','p18chargeraw','p18homo_energyraw','p18homo_lumo_energygapraw','p18molecular_volumeraw','p18num_aromaticbondsraw','p18num_atomsraw','p18num_Hatomsraw','p18num_Oatomsraw','p18num_Patomsraw','p18num_Satomsraw','p18num_Seatomsraw','p18relativenum_Hatomsraw','p18relativenum_Patomsraw','p18relativenum_Satomsraw','p18relativenum_Seatomsraw','p18rot_constants_yraw','p18rot_constants_zraw','p18xy_polarizabilityraw','p18xz_quadrupoleraw','p18yy_polarizabilityraw','p19.AAsgRNA.raw','p19.ACsgRNA.raw','p19.AGsgRNA.raw','p19.ATsgRNA.raw','p19.CAsgRNA.raw','p19.CCsgRNA.raw','p19.CGsgRNA.raw','p19.CTsgRNA.raw','p19.GAsgRNA.raw','p19.GCsgRNA.raw','p19.GGsgRNA.raw','p19.GTsgRNA.raw','p19.TAsgRNA.raw','p19.TCsgRNA.raw','p19.TGsgRNA.raw','p19.TTsgRNA.raw','p19chargeraw','p19homo_energyraw','p19homo_lumo_energygapraw','p19molecular_volumeraw','p19num_aromaticbondsraw','p19num_atomsraw','p19num_Hatomsraw','p19num_Oatomsraw','p19num_Patomsraw','p19num_Satomsraw','p19num_Seatomsraw','p19relativenum_Hatomsraw','p19relativenum_Patomsraw','p19relativenum_Satomsraw','p19relativenum_Seatomsraw','p19rot_constants_yraw','p19rot_constants_zraw','p19xy_polarizabilityraw','p19xz_quadrupoleraw','p19yy_polarizabilityraw','p1chargeraw','p1homo_energyraw','p1homo_lumo_energygapraw','p
1molecular_volumeraw','p1num_aromaticbondsraw','p1num_atomsraw','p1num_Oatomsraw','p1num_Patomsraw','p1num_Satomsraw','p1num_Seatomsraw','p1relativenum_Hatomsraw','p1relativenum_Satomsraw','p1relativenum_Seatomsraw','p1rot_constants_yraw','p1rot_constants_zraw','p1xz_quadrupoleraw','p1yy_polarizabilityraw','p2.AAsgRNA.raw','p2.ACsgRNA.raw','p2.AGsgRNA.raw','p2.AsgRNA.raw','p2.ATsgRNA.raw','p2.CAsgRNA.raw','p2.CCsgRNA.raw','p2.CGsgRNA.raw','p2.CsgRNA.raw','p2.CTsgRNA.raw','p2.GAsgRNA.raw','p2.GCsgRNA.raw','p2.GGsgRNA.raw','p2.GsgRNA.raw','p2.GTsgRNA.raw','p2.TAsgRNA.raw','p2.TCsgRNA.raw','p2.TGsgRNA.raw','p2.TsgRNA.raw','p2.TTsgRNA.raw','p20chargeraw','p20homo_energyraw','p20homo_lumo_energygapraw','p20molecular_volumeraw','p20num_aromaticbondsraw','p20num_atomsraw','p20num_electroniclevels_div_num_atomsraw','p20num_Hatomsraw','p20num_Oatomsraw','p20num_Patomsraw','p20num_Satomsraw','p20num_Seatomsraw','p20relativenum_Hatomsraw','p20relativenum_Patomsraw','p20relativenum_Satomsraw','p20relativenum_Seatomsraw','p20rot_constants_yraw','p20rot_constants_zraw','p20xy_polarizabilityraw','p20xz_quadrupoleraw','p20yy_polarizabilityraw','p2chargeraw','p2homo_energyraw','p2homo_lumo_energygapraw','p2molecular_volumeraw','p2num_aromaticbondsraw','p2num_atomsraw','p2num_Oatomsraw','p2num_Patomsraw','p2num_Satomsraw','p2num_Seatomsraw','p2relativenum_Hatomsraw','p2relativenum_Satomsraw','p2relativenum_Seatomsraw','p2rot_constants_yraw','p2rot_constants_zraw','p2xz_quadrupoleraw','p2yy_polarizabilityraw','p3.AAsgRNA.raw','p3.ACsgRNA.raw','p3.AGsgRNA.raw','p3.AsgRNA.raw','p3.ATsgRNA.raw','p3.CAsgRNA.raw','p3.CCsgRNA.raw','p3.CGsgRNA.raw','p3.CsgRNA.raw','p3.CTsgRNA.raw','p3.GAsgRNA.raw','p3.GCsgRNA.raw','p3.GGsgRNA.raw','p3.GsgRNA.raw','p3.GTsgRNA.raw','p3.TAsgRNA.raw','p3.TCsgRNA.raw','p3.TGsgRNA.raw','p3.TsgRNA.raw','p3.TTsgRNA.raw','p3chargeraw','p3homo_energyraw','p3homo_lumo_energygapraw','p3molecular_volumeraw','p3num_aromaticbondsraw','p3num_atomsraw','p3num_Oatomsraw','p3n
um_Patomsraw','p3num_Satomsraw','p3num_Seatomsraw','p3relativenum_Hatomsraw','p3relativenum_Satomsraw','p3relativenum_Seatomsraw','p3rot_constants_yraw','p3rot_constants_zraw','p3xz_quadrupoleraw','p3yy_polarizabilityraw','p4.AAsgRNA.raw','p4.ACsgRNA.raw','p4.AGsgRNA.raw','p4.AsgRNA.raw','p4.ATsgRNA.raw','p4.CAsgRNA.raw','p4.CCsgRNA.raw','p4.CGsgRNA.raw','p4.CsgRNA.raw','p4.CTsgRNA.raw','p4.GAsgRNA.raw','p4.GCsgRNA.raw','p4.GGsgRNA.raw','p4.GsgRNA.raw','p4.GTsgRNA.raw','p4.TAsgRNA.raw','p4.TCsgRNA.raw','p4.TGsgRNA.raw','p4.TsgRNA.raw','p4.TTsgRNA.raw','p4chargeraw','p4homo_energyraw','p4homo_lumo_energygapraw','p4molecular_volumeraw','p4num_aromaticbondsraw','p4num_atomsraw','p4num_Oatomsraw','p4num_Patomsraw','p4num_Satomsraw','p4num_Seatomsraw','p4relativenum_Hatomsraw','p4relativenum_Satomsraw','p4relativenum_Seatomsraw','p4rot_constants_yraw','p4rot_constants_zraw','p4xz_quadrupoleraw','p4yy_polarizabilityraw','p5.AAsgRNA.raw','p5.ACsgRNA.raw','p5.AGsgRNA.raw','p5.AsgRNA.raw','p5.ATsgRNA.raw','p5.CAsgRNA.raw','p5.CCsgRNA.raw','p5.CGsgRNA.raw','p5.CsgRNA.raw','p5.CTsgRNA.raw','p5.GAsgRNA.raw','p5.GCsgRNA.raw','p5.GGsgRNA.raw','p5.GsgRNA.raw','p5.GTsgRNA.raw','p5.TAsgRNA.raw','p5.TCsgRNA.raw','p5.TGsgRNA.raw','p5.TsgRNA.raw','p5.TTsgRNA.raw','p5chargeraw','p5homo_energyraw','p5homo_lumo_energygapraw','p5molecular_volumeraw','p5num_aromaticbondsraw','p5num_atomsraw','p5num_Oatomsraw','p5num_Patomsraw','p5num_Satomsraw','p5num_Seatomsraw','p5relativenum_Hatomsraw','p5relativenum_Satomsraw','p5relativenum_Seatomsraw','p5rot_constants_yraw','p5rot_constants_zraw','p5xz_quadrupoleraw','p5yy_polarizabilityraw','p6.AAsgRNA.raw','p6.ACsgRNA.raw','p6.AGsgRNA.raw','p6.ATsgRNA.raw','p6.CAsgRNA.raw','p6.CCsgRNA.raw','p6.CGsgRNA.raw','p6.CTsgRNA.raw','p6.GAsgRNA.raw','p6.GCsgRNA.raw','p6.GGsgRNA.raw','p6.GTsgRNA.raw','p6.TAsgRNA.raw','p6.TCsgRNA.raw','p6.TGsgRNA.raw','p6.TTsgRNA.raw','p6chargeraw','p6homo_energyraw','p6homo_lumo_energygapraw','p6molecular_volumeraw','p6num_aro
maticbondsraw','p6num_atomsraw','p6num_Hatomsraw','p6num_Oatomsraw','p6num_Patomsraw','p6num_Satomsraw','p6num_Seatomsraw','p6relativenum_Hatomsraw','p6relativenum_Patomsraw','p6relativenum_Satomsraw','p6relativenum_Seatomsraw','p6rot_constants_yraw','p6rot_constants_zraw','p6xy_polarizabilityraw','p6xz_quadrupoleraw','p6yy_polarizabilityraw','p7.AAsgRNA.raw','p7.ACsgRNA.raw','p7.AGsgRNA.raw','p7.ATsgRNA.raw','p7.CAsgRNA.raw','p7.CCsgRNA.raw','p7.CGsgRNA.raw','p7.CTsgRNA.raw','p7.GAsgRNA.raw','p7.GCsgRNA.raw','p7.GGsgRNA.raw','p7.GTsgRNA.raw','p7.TAsgRNA.raw','p7.TCsgRNA.raw','p7.TGsgRNA.raw','p7.TTsgRNA.raw','p7chargeraw','p7homo_energyraw','p7homo_lumo_energygapraw','p7molecular_volumeraw','p7num_aromaticbondsraw','p7num_atomsraw','p7num_Hatomsraw','p7num_Oatomsraw','p7num_Patomsraw','p7num_Satomsraw','p7num_Seatomsraw','p7relativenum_Hatomsraw','p7relativenum_Patomsraw','p7relativenum_Satomsraw','p7relativenum_Seatomsraw','p7rot_constants_yraw','p7rot_constants_zraw','p7xy_polarizabilityraw','p7xz_quadrupoleraw','p7yy_polarizabilityraw','p8.AAsgRNA.raw','p8.ACsgRNA.raw','p8.AGsgRNA.raw','p8.ATsgRNA.raw','p8.CAsgRNA.raw','p8.CCsgRNA.raw','p8.CGsgRNA.raw','p8.CTsgRNA.raw','p8.GAsgRNA.raw','p8.GCsgRNA.raw','p8.GGsgRNA.raw','p8.GTsgRNA.raw','p8.TAsgRNA.raw','p8.TCsgRNA.raw','p8.TGsgRNA.raw','p8.TTsgRNA.raw','p8chargeraw','p8homo_energyraw','p8homo_lumo_energygapraw','p8molecular_volumeraw','p8num_aromaticbondsraw','p8num_atomsraw','p8num_Hatomsraw','p8num_Oatomsraw','p8num_Patomsraw','p8num_Satomsraw','p8num_Seatomsraw','p8relativenum_Hatomsraw','p8relativenum_Patomsraw','p8relativenum_Satomsraw','p8relativenum_Seatomsraw','p8rot_constants_yraw','p8rot_constants_zraw','p8xy_polarizabilityraw','p8xz_quadrupoleraw','p8yy_polarizabilityraw','p9.AAsgRNA.raw','p9.ACsgRNA.raw','p9.AGsgRNA.raw','p9.ATsgRNA.raw','p9.CAsgRNA.raw','p9.CCsgRNA.raw','p9.CGsgRNA.raw','p9.CTsgRNA.raw','p9.GAsgRNA.raw','p9.GCsgRNA.raw','p9.GGsgRNA.raw','p9.GTsgRNA.raw','p9.TAsgRNA.raw','p9.TCsgRNA.
raw','p9.TGsgRNA.raw','p9.TTsgRNA.raw','p9chargeraw','p9homo_energyraw','p9homo_lumo_energygapraw','p9molecular_volumeraw','p9num_aromaticbondsraw','p9num_atomsraw','p9num_Hatomsraw','p9num_Oatomsraw','p9num_Patomsraw','p9num_Satomsraw','p9num_Seatomsraw','p9relativenum_Hatomsraw','p9relativenum_Patomsraw','p9relativenum_Satomsraw','p9relativenum_Seatomsraw','p9rot_constants_yraw','p9rot_constants_zraw','p9xy_polarizabilityraw','p9xz_quadrupoleraw','p9yy_polarizabilityraw','sgRNA.gcsgRNA.raw','sgRNA.structuresgRNA.raw','TAsgRNA.raw','TCsgRNA.raw','TGsgRNA.raw','TsgRNA.raw','TTsgRNA.raw','gatc.dwtd1.x','gatc.dwtd10.x','gatc.dwtd11.x','gatc.dwtd12.x','gatc.dwtd13.x','gatc.dwtd14.x','gatc.dwtd15.x','gatc.dwtd16.x','gatc.dwtd17.x','gatc.dwtd18.x','gatc.dwtd19.x','gatc.dwtd2.x','gatc.dwtd20.x','gatc.dwtd21.x','gatc.dwtd22.x','gatc.dwtd4.x','gatc.dwtd5.x','gatc.dwtd6.x','gatc.dwtd7.x','gatc.dwtd8.x','gatc.dwtd9.x','gatc.dwts22.x','gc.dwtd1.x','gc.dwtd10.x','gc.dwtd11.x','gc.dwtd12.x','gc.dwtd13.x','gc.dwtd14.x','gc.dwtd15.x','gc.dwtd16.x','gc.dwtd17.x','gc.dwtd18.x','gc.dwtd19.x','gc.dwtd2.x','gc.dwtd20.x','gc.dwtd21.x','gc.dwtd22.x','gc.dwtd3.x','gc.dwtd4.x','gc.dwtd5.x','gc.dwtd6.x','gc.dwtd7.x','gc.dwtd8.x','gc.dwtd9.x','gc.dwts22.x','gene.dwtd1.x','gene.dwtd10.x','gene.dwtd11.x','gene.dwtd12.x','gene.dwtd13.x','gene.dwtd14.x','gene.dwtd15.x','gene.dwtd16.x','gene.dwtd17.x','gene.dwtd18.x','gene.dwtd19.x','gene.dwtd2.x','gene.dwtd20.x','gene.dwtd21.x','gene.dwtd22.x','gene.dwtd3.x','gene.dwtd4.x','gene.dwtd5.x','gene.dwtd6.x','gene.dwtd7.x','gene.dwtd8.x','gene.dwtd9.x','gene.dwts22.x','ipd.dwtd1.x','ipd.dwtd10.x','ipd.dwtd11.x','ipd.dwtd12.x','ipd.dwtd13.x','ipd.dwtd14.x','ipd.dwtd15.x','ipd.dwtd16.x','ipd.dwtd17.x','ipd.dwtd18.x','ipd.dwtd19.x','ipd.dwtd2.x','ipd.dwtd20.x','ipd.dwtd21.x','ipd.dwtd22.x','ipd.dwtd3.x','ipd.dwtd4.x','ipd.dwtd5.x','ipd.dwtd6.x','ipd.dwtd7.x','ipd.dwtd8.x','ipd.dwtd9.x','ipd.dwts22.x','rna.dwtd1.x','rna.dwtd10.x','rna.dwtd11.x','rna.dwtd1
2.x','rna.dwtd13.x','rna.dwtd14.x','rna.dwtd15.x','rna.dwtd16.x','rna.dwtd17.x','rna.dwtd18.x','rna.dwtd19.x','structure.dwtd14.x','structure.dwtd15.x','structure.dwtd16.x','structure.dwtd17.x','structure.dwtd18.x','structure.dwtd19.x','structure.dwtd2.x','structure.dwtd20.x','structure.dwtd21.x','structure.dwtd3.x','structure.dwtd4.x','structure.dwtd5.x','structure.dwtd6.x','structure.dwtd7.x','structure.dwtd8.x','structure.dwtd9.x','structure.dwts22.x']]
# Split the data into train and test data:
# (no random_state is passed here, so the split uses NumPy's global RNG state)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
# Build the model with the random forest regression algorithm:
model = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=10)
model.fit(X_train, Y_train)
import shap
import matplotlib.pyplot as plt
# SHAP contributions of every feature for every training row.
shap_values = shap.TreeExplainer(model).shap_values(X_train)
# Mean |SHAP| bar chart.
# show=False keeps summary_plot from calling plt.show() itself; with an
# interactive backend that call can leave an empty canvas for savefig().
f = plt.figure()
shap.summary_plot(shap_values, X_train, plot_type="bar", show=False)
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_espcas9_summary_plot_bar.png", bbox_inches='tight', dpi=600)
plt.close(f)  # release the figure once it is on disk
# Default summary plot (one dot per sample and feature).
f = plt.figure()
shap.summary_plot(shap_values, X_train, show=False)
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_espcas9_summary_plot_varimp.png", bbox_inches='tight', dpi=600)
plt.close(f)
# directionality of feature importance
def ABS_SHAP(df_shap, df):
    """Draw a horizontal bar chart of mean |SHAP| per feature, coloured by direction.

    For each feature the Pearson correlation between its SHAP values and its
    raw values is computed; bars are red when that correlation is positive and
    blue otherwise (an undefined correlation, e.g. from a constant column, is
    treated as 0 and therefore drawn blue).

    Parameters
    ----------
    df_shap : array-like, shape (n_samples, n_features)
        SHAP values, columns in the same order as ``df.columns``.
    df : pandas.DataFrame
        Feature values the SHAP values were computed for.

    Returns
    -------
    The axes object returned by ``DataFrame.plot.barh`` (the original version
    returned nothing; callers that ignore the return value are unaffected).
    """
    # Put the SHAP values under the same column labels as the features.
    shap_v = pd.DataFrame(df_shap)
    feature_list = df.columns
    shap_v.columns = feature_list
    # Positional index so rows line up with shap_v regardless of df's own index.
    df_v = df.copy().reset_index(drop=True)
    # Determine the correlation in order to plot with different colors
    corr_list = list()
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in feature_list:
            b = np.corrcoef(shap_v[i], df_v[i])[1][0]
            corr_list.append(b)
    corr_df = pd.concat([pd.Series(feature_list), pd.Series(corr_list)], axis=1).fillna(0)
    # Make a data frame. Column 1 is the feature, and Column 2 is the correlation coefficient
    corr_df.columns = ['Variable', 'Corr']
    corr_df['Sign'] = np.where(corr_df['Corr'] > 0, 'red', 'blue')
    # Plot it
    shap_abs = np.abs(shap_v)
    k = pd.DataFrame(shap_abs.mean()).reset_index()
    k.columns = ['Variable', 'SHAP_abs']
    k2 = k.merge(corr_df, left_on='Variable', right_on='Variable', how='inner')
    # Ascending sort puts the most important feature at the top of a barh chart.
    k2 = k2.sort_values(by='SHAP_abs', ascending=True)
    colorlist = k2['Sign'].tolist()
    ax = k2.plot.barh(x='Variable', y='SHAP_abs', color=colorlist, figsize=(5, 6), legend=False)
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")
    return ax
# ABS_SHAP draws through pandas' .plot.barh(), which opens a figure of its own.
# A figure created beforehand would stay empty, so save the figure that is
# current after the call instead.
ABS_SHAP(shap_values,X_train)
f = plt.gcf()
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_espcas9_summary_plot_abs.png", bbox_inches='tight', dpi=600)
# Load the conda helper and switch to the environment named "shap".
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate shap
# All inputs and the combined table live in this directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/
# Join the score file and the feature file side by side (score columns first).
paste \
  dwt20bp.recACas9.score.txt \
  dwt20bp.noncor2.recACas9.features_overlap_noSampleIDs.txt \
  > ecoli.20sliding.recacas9.txt
# python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Fix NumPy's global seed before anything random runs.
np.random.seed(0)

# Load the data
data_path = '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/ecoli.20sliding.recacas9.txt'
df = pd.read_table(data_path)

# The target variable is 'cut.score'.
Y = df['cut.score']
X = df[['ACsgRNA.raw','AGsgRNA.raw','AsgRNA.raw','ATsgRNA.raw','CAsgRNA.raw','CCsgRNA.raw','CGsgRNA.raw','CsgRNA.raw','GTsgRNA.raw','p1.AAsgRNA.raw','p1.ACsgRNA.raw','p1.AGsgRNA.raw','p1.AsgRNA.raw','p1.ATsgRNA.raw','p1.CAsgRNA.raw','p1.CCsgRNA.raw','p1.CGsgRNA.raw','p1.CsgRNA.raw','p1.CTsgRNA.raw','p1.GAsgRNA.raw','p1.GCsgRNA.raw','p1.GGsgRNA.raw','p1.GsgRNA.raw','p1.GTsgRNA.raw','p1.TAsgRNA.raw','p1.TCsgRNA.raw','p1.TGsgRNA.raw','p1.TsgRNA.raw','p1.TTsgRNA.raw','p10.AAsgRNA.raw','p10.ACsgRNA.raw','p10.AGsgRNA.raw','p10.ATsgRNA.raw','p10.CAsgRNA.raw','p10.CCsgRNA.raw','p10.CGsgRNA.raw','p10.CTsgRNA.raw','p10.GAsgRNA.raw','p10.GCsgRNA.raw','p10.GGsgRNA.raw','p10.GTsgRNA.raw','p10.TAsgRNA.raw','p10.TCsgRNA.raw','p10.TGsgRNA.raw','p10.TTsgRNA.raw','p10chargeraw','p10homo_energyraw','p10homo_lumo_energygapraw','p10molecular_volumeraw','p10num_aromaticbondsraw','p10num_atomsraw','p10num_Hatomsraw','p10num_Oatomsraw','p10num_Patomsraw','p10num_Satomsraw','p10num_Seatomsraw','p10relativenum_Hatomsraw','p10relativenum_Patomsraw','p10relativenum_Satomsraw','p10relativenum_Seatomsraw','p10rot_constants_yraw','p10rot_constants_zraw','p10xy_polarizabilityraw','p10xz_quadrupoleraw','p10yy_polarizabilityraw','p11.AAsgRNA.raw','p11.ACsgRNA.raw','p11.AGsgRNA.raw','p11.ATsgRNA.raw','p11.CAsgRNA.raw','p11.CCsgRNA.raw','p11.CGsgRNA.raw','p11.CTsgRNA.raw','p11.GAsgRNA.raw','p11.GCsgRNA.raw','p11.GGsgRNA.raw','p11.GTsgRNA.raw','p11.TAsgRNA.raw','p11.TCsgRNA.raw','p11.TGsgRNA.raw','p11.TTsgRNA.raw','p11chargeraw','p11homo_energyraw','p11homo_lumo_energygapraw','p11molecular_volumeraw','p11num_aromaticbondsraw','p11num_atomsraw','p11num_Hatomsraw','p11num_Oatomsraw','p11num_Patomsraw','p11num_Satomsraw','p11num_Seatomsraw','p11relativenum_Hatomsraw','p11relativenum_Patomsraw','p11relativenum_Satomsraw','p11relativenum_Seatomsraw','p11rot_constants_yraw','p11rot_constants_zraw','p11xy_polarizabilityraw','p11xz_quadrupoleraw','p11yy_polarizabilityraw','p12.AAsgRNA.raw','p12.ACsgRNA.raw','p
12.AGsgRNA.raw','p12.ATsgRNA.raw','p12.CAsgRNA.raw','p12.CCsgRNA.raw','p12.CGsgRNA.raw','p12.CTsgRNA.raw','p12.GAsgRNA.raw','p12.GCsgRNA.raw','p12.GGsgRNA.raw','p12.GTsgRNA.raw','p12.TAsgRNA.raw','p12.TCsgRNA.raw','p12.TGsgRNA.raw','p12.TTsgRNA.raw','p12chargeraw','p12homo_energyraw','p12homo_lumo_energygapraw','p12molecular_volumeraw','p12num_aromaticbondsraw','p12num_atomsraw','p12num_Hatomsraw','p12num_Oatomsraw','p12num_Patomsraw','p12num_Satomsraw','p12num_Seatomsraw','p12relativenum_Hatomsraw','p12relativenum_Patomsraw','p12relativenum_Satomsraw','p12relativenum_Seatomsraw','p12rot_constants_yraw','p12rot_constants_zraw','p12xy_polarizabilityraw','p12xz_quadrupoleraw','p12yy_polarizabilityraw','p13.AAsgRNA.raw','p13.ACsgRNA.raw','p13.AGsgRNA.raw','p13.ATsgRNA.raw','p13.CAsgRNA.raw','p13.CCsgRNA.raw','p13.CGsgRNA.raw','p13.CTsgRNA.raw','p13.GAsgRNA.raw','p13.GCsgRNA.raw','p13.GGsgRNA.raw','p13.GTsgRNA.raw','p13.TAsgRNA.raw','p13.TCsgRNA.raw','p13.TGsgRNA.raw','p13.TTsgRNA.raw','p13chargeraw','p13homo_energyraw','p13homo_lumo_energygapraw','p13molecular_volumeraw','p13num_aromaticbondsraw','p13num_atomsraw','p13num_Hatomsraw','p13num_Oatomsraw','p13num_Patomsraw','p13num_Satomsraw','p13num_Seatomsraw','p13relativenum_Hatomsraw','p13relativenum_Patomsraw','p13relativenum_Satomsraw','p13relativenum_Seatomsraw','p13rot_constants_yraw','p13rot_constants_zraw','p13xy_polarizabilityraw','p13xz_quadrupoleraw','p13yy_polarizabilityraw','p14.AAsgRNA.raw','p14.ACsgRNA.raw','p14.AGsgRNA.raw','p14.ATsgRNA.raw','p14.CAsgRNA.raw','p14.CCsgRNA.raw','p14.CGsgRNA.raw','p14.CTsgRNA.raw','p14.GAsgRNA.raw','p14.GCsgRNA.raw','p14.GGsgRNA.raw','p14.GTsgRNA.raw','p14.TAsgRNA.raw','p14.TCsgRNA.raw','p14.TGsgRNA.raw','p14.TTsgRNA.raw','p14chargeraw','p14homo_energyraw','p14homo_lumo_energygapraw','p14molecular_volumeraw','p14num_aromaticbondsraw','p14num_atomsraw','p14num_Hatomsraw','p14num_Oatomsraw','p14num_Patomsraw','p14num_Satomsraw','p14num_Seatomsraw','p14relativenum_Hatomsraw','
p14relativenum_Patomsraw','p14relativenum_Satomsraw','p14relativenum_Seatomsraw','p14rot_constants_yraw','p14rot_constants_zraw','p14xy_polarizabilityraw','p14xz_quadrupoleraw','p14yy_polarizabilityraw','p15.AAsgRNA.raw','p15.ACsgRNA.raw','p15.AGsgRNA.raw','p15.ATsgRNA.raw','p15.CAsgRNA.raw','p15.CCsgRNA.raw','p15.CGsgRNA.raw','p15.CTsgRNA.raw','p15.GAsgRNA.raw','p15.GCsgRNA.raw','p15.GGsgRNA.raw','p15.GTsgRNA.raw','p15.TAsgRNA.raw','p15.TCsgRNA.raw','p15.TGsgRNA.raw','p15.TTsgRNA.raw','p15chargeraw','p15homo_energyraw','p15homo_lumo_energygapraw','p15molecular_volumeraw','p15num_aromaticbondsraw','p15num_atomsraw','p15num_Hatomsraw','p15num_Oatomsraw','p15num_Patomsraw','p15num_Satomsraw','p15num_Seatomsraw','p15relativenum_Hatomsraw','p15relativenum_Patomsraw','p15relativenum_Satomsraw','p15relativenum_Seatomsraw','p15rot_constants_yraw','p15rot_constants_zraw','p15xy_polarizabilityraw','p15xz_quadrupoleraw','p15yy_polarizabilityraw','p16.AAsgRNA.raw','p16.ACsgRNA.raw','p16.AGsgRNA.raw','p16.ATsgRNA.raw','p16.CAsgRNA.raw','p16.CCsgRNA.raw','p16.CGsgRNA.raw','p16.CTsgRNA.raw','p16.GAsgRNA.raw','p16.GCsgRNA.raw','p16.GGsgRNA.raw','p16.GTsgRNA.raw','p16.TAsgRNA.raw','p16.TCsgRNA.raw','p16.TGsgRNA.raw','p16.TTsgRNA.raw','p16chargeraw','p16homo_energyraw','p16homo_lumo_energygapraw','p16molecular_volumeraw','p16num_aromaticbondsraw','p16num_atomsraw','p16num_Hatomsraw','p16num_Oatomsraw','p16num_Patomsraw','p16num_Satomsraw','p16num_Seatomsraw','p16relativenum_Hatomsraw','p16relativenum_Patomsraw','p16relativenum_Satomsraw','p16relativenum_Seatomsraw','p16rot_constants_yraw','p16rot_constants_zraw','p16xy_polarizabilityraw','p16xz_quadrupoleraw','p16yy_polarizabilityraw','p17.AAsgRNA.raw','p17.ACsgRNA.raw','p17.AGsgRNA.raw','p17.ATsgRNA.raw','p17.CAsgRNA.raw','p17.CCsgRNA.raw','p17.CGsgRNA.raw','p17.CTsgRNA.raw','p17.GAsgRNA.raw','p17.GCsgRNA.raw','p17.GGsgRNA.raw','p17.GTsgRNA.raw','p17.TAsgRNA.raw','p17.TCsgRNA.raw','p17.TGsgRNA.raw','p17.TTsgRNA.raw','p17chargeraw',
'p17homo_energyraw','p17homo_lumo_energygapraw','p17molecular_volumeraw','p17num_aromaticbondsraw','p17num_atomsraw','p17num_Hatomsraw','p17num_Oatomsraw','p17num_Patomsraw','p17num_Satomsraw','p17num_Seatomsraw','p17relativenum_Hatomsraw','p17relativenum_Patomsraw','p17relativenum_Satomsraw','p17relativenum_Seatomsraw','p17rot_constants_yraw','p17rot_constants_zraw','p17xy_polarizabilityraw','p17xz_quadrupoleraw','p17yy_polarizabilityraw','p18.AAsgRNA.raw','p18.ACsgRNA.raw','p18.AGsgRNA.raw','p18.ATsgRNA.raw','p18.CAsgRNA.raw','p18.CCsgRNA.raw','p18.CGsgRNA.raw','p18.CTsgRNA.raw','p18.GAsgRNA.raw','p18.GCsgRNA.raw','p18.GGsgRNA.raw','p18.GTsgRNA.raw','p18.TAsgRNA.raw','p18.TCsgRNA.raw','p18.TGsgRNA.raw','p18.TTsgRNA.raw','p18chargeraw','p18homo_energyraw','p18homo_lumo_energygapraw','p18molecular_volumeraw','p18num_aromaticbondsraw','p18num_atomsraw','p18num_Hatomsraw','p18num_Oatomsraw','p18num_Patomsraw','p18num_Satomsraw','p18num_Seatomsraw','p18relativenum_Hatomsraw','p18relativenum_Patomsraw','p18relativenum_Satomsraw','p18relativenum_Seatomsraw','p18rot_constants_yraw','p18rot_constants_zraw','p18xy_polarizabilityraw','p18xz_quadrupoleraw','p18yy_polarizabilityraw','p19.AAsgRNA.raw','p19.ACsgRNA.raw','p19.AGsgRNA.raw','p19.ATsgRNA.raw','p19.CAsgRNA.raw','p19.CCsgRNA.raw','p19.CGsgRNA.raw','p19.CTsgRNA.raw','p19.GAsgRNA.raw','p19.GCsgRNA.raw','p19.GGsgRNA.raw','p19.GTsgRNA.raw','p19.TAsgRNA.raw','p19.TCsgRNA.raw','p19.TGsgRNA.raw','p19.TTsgRNA.raw','p19chargeraw','p19homo_energyraw','p19homo_lumo_energygapraw','p19molecular_volumeraw','p19num_aromaticbondsraw','p19num_atomsraw','p19num_Hatomsraw','p19num_Oatomsraw','p19num_Patomsraw','p19num_Satomsraw','p19num_Seatomsraw','p19relativenum_Hatomsraw','p19relativenum_Patomsraw','p19relativenum_Satomsraw','p19relativenum_Seatomsraw','p19rot_constants_yraw','p19rot_constants_zraw','p19xy_polarizabilityraw','p19xz_quadrupoleraw','p19yy_polarizabilityraw','p1chargeraw','p1homo_energyraw','p1homo_lumo_energygapraw','p
1molecular_volumeraw','p1num_aromaticbondsraw','p1num_atomsraw','p1num_Oatomsraw','p1num_Patomsraw','p1num_Satomsraw','p1num_Seatomsraw','p1relativenum_Hatomsraw','p1relativenum_Satomsraw','p1relativenum_Seatomsraw','p1rot_constants_yraw','p1rot_constants_zraw','p1xz_quadrupoleraw','p1yy_polarizabilityraw','p2.AAsgRNA.raw','p2.ACsgRNA.raw','p2.AGsgRNA.raw','p2.AsgRNA.raw','p2.ATsgRNA.raw','p2.CAsgRNA.raw','p2.CCsgRNA.raw','p2.CGsgRNA.raw','p2.CsgRNA.raw','p2.CTsgRNA.raw','p2.GAsgRNA.raw','p2.GCsgRNA.raw','p2.GGsgRNA.raw','p2.GsgRNA.raw','p2.GTsgRNA.raw','p2.TAsgRNA.raw','p2.TCsgRNA.raw','p2.TGsgRNA.raw','p2.TsgRNA.raw','p2.TTsgRNA.raw','p20chargeraw','p20homo_energyraw','p20homo_lumo_energygapraw','p20molecular_volumeraw','p20num_aromaticbondsraw','p20num_atomsraw','p20num_electroniclevels_div_num_atomsraw','p20num_Hatomsraw','p20num_Oatomsraw','p20num_Patomsraw','p20num_Satomsraw','p20num_Seatomsraw','p20relativenum_Hatomsraw','p20relativenum_Patomsraw','p20relativenum_Satomsraw','p20relativenum_Seatomsraw','p20rot_constants_yraw','p20rot_constants_zraw','p20xy_polarizabilityraw','p20xz_quadrupoleraw','p20yy_polarizabilityraw','p2chargeraw','p2homo_energyraw','p2homo_lumo_energygapraw','p2molecular_volumeraw','p2num_aromaticbondsraw','p2num_atomsraw','p2num_Oatomsraw','p2num_Patomsraw','p2num_Satomsraw','p2num_Seatomsraw','p2relativenum_Hatomsraw','p2relativenum_Satomsraw','p2relativenum_Seatomsraw','p2rot_constants_yraw','p2rot_constants_zraw','p2xz_quadrupoleraw','p2yy_polarizabilityraw','p3.AAsgRNA.raw','p3.ACsgRNA.raw','p3.AGsgRNA.raw','p3.AsgRNA.raw','p3.ATsgRNA.raw','p3.CAsgRNA.raw','p3.CCsgRNA.raw','p3.CGsgRNA.raw','p3.CsgRNA.raw','p3.CTsgRNA.raw','p3.GAsgRNA.raw','p3.GCsgRNA.raw','p3.GGsgRNA.raw','p3.GsgRNA.raw','p3.GTsgRNA.raw','p3.TAsgRNA.raw','p3.TCsgRNA.raw','p3.TGsgRNA.raw','p3.TsgRNA.raw','p3.TTsgRNA.raw','p3chargeraw','p3homo_energyraw','p3homo_lumo_energygapraw','p3molecular_volumeraw','p3num_aromaticbondsraw','p3num_atomsraw','p3num_Oatomsraw','p3n
um_Patomsraw','p3num_Satomsraw','p3num_Seatomsraw','p3relativenum_Hatomsraw','p3relativenum_Satomsraw','p3relativenum_Seatomsraw','p3rot_constants_yraw','p3rot_constants_zraw','p3xz_quadrupoleraw','p3yy_polarizabilityraw','p4.AAsgRNA.raw','p4.ACsgRNA.raw','p4.AGsgRNA.raw','p4.AsgRNA.raw','p4.ATsgRNA.raw','p4.CAsgRNA.raw','p4.CCsgRNA.raw','p4.CGsgRNA.raw','p4.CsgRNA.raw','p4.CTsgRNA.raw','p4.GAsgRNA.raw','p4.GCsgRNA.raw','p4.GGsgRNA.raw','p4.GsgRNA.raw','p4.GTsgRNA.raw','p4.TAsgRNA.raw','p4.TCsgRNA.raw','p4.TGsgRNA.raw','p4.TsgRNA.raw','p4.TTsgRNA.raw','p4chargeraw','p4homo_energyraw','p4homo_lumo_energygapraw','p4molecular_volumeraw','p4num_aromaticbondsraw','p4num_atomsraw','p4num_Oatomsraw','p4num_Patomsraw','p4num_Satomsraw','p4num_Seatomsraw','p4relativenum_Hatomsraw','p4relativenum_Satomsraw','p4relativenum_Seatomsraw','p4rot_constants_yraw','p4rot_constants_zraw','p4xz_quadrupoleraw','p4yy_polarizabilityraw','p5.AAsgRNA.raw','p5.ACsgRNA.raw','p5.AGsgRNA.raw','p5.AsgRNA.raw','p5.ATsgRNA.raw','p5.CAsgRNA.raw','p5.CCsgRNA.raw','p5.CGsgRNA.raw','p5.CsgRNA.raw','p5.CTsgRNA.raw','p5.GAsgRNA.raw','p5.GCsgRNA.raw','p5.GGsgRNA.raw','p5.GsgRNA.raw','p5.GTsgRNA.raw','p5.TAsgRNA.raw','p5.TCsgRNA.raw','p5.TGsgRNA.raw','p5.TsgRNA.raw','p5.TTsgRNA.raw','p5chargeraw','p5homo_energyraw','p5homo_lumo_energygapraw','p5molecular_volumeraw','p5num_aromaticbondsraw','p5num_atomsraw','p5num_Oatomsraw','p5num_Patomsraw','p5num_Satomsraw','p5num_Seatomsraw','p5relativenum_Hatomsraw','p5relativenum_Satomsraw','p5relativenum_Seatomsraw','p5rot_constants_yraw','p5rot_constants_zraw','p5xz_quadrupoleraw','p5yy_polarizabilityraw','p6.AAsgRNA.raw','p6.ACsgRNA.raw','p6.AGsgRNA.raw','p6.ATsgRNA.raw','p6.CAsgRNA.raw','p6.CCsgRNA.raw','p6.CGsgRNA.raw','p6.CTsgRNA.raw','p6.GAsgRNA.raw','p6.GCsgRNA.raw','p6.GGsgRNA.raw','p6.GTsgRNA.raw','p6.TAsgRNA.raw','p6.TCsgRNA.raw','p6.TGsgRNA.raw','p6.TTsgRNA.raw','p6chargeraw','p6homo_energyraw','p6homo_lumo_energygapraw','p6molecular_volumeraw','p6num_aro
maticbondsraw','p6num_atomsraw','p6num_Hatomsraw','p6num_Oatomsraw','p6num_Patomsraw','p6num_Satomsraw','p6num_Seatomsraw','p6relativenum_Hatomsraw','p6relativenum_Patomsraw','p6relativenum_Satomsraw','p6relativenum_Seatomsraw','p6rot_constants_yraw','p6rot_constants_zraw','p6xy_polarizabilityraw','p6xz_quadrupoleraw','p6yy_polarizabilityraw','p7.AAsgRNA.raw','p7.ACsgRNA.raw','p7.AGsgRNA.raw','p7.ATsgRNA.raw','p7.CAsgRNA.raw','p7.CCsgRNA.raw','p7.CGsgRNA.raw','p7.CTsgRNA.raw','p7.GAsgRNA.raw','p7.GCsgRNA.raw','p7.GGsgRNA.raw','p7.GTsgRNA.raw','p7.TAsgRNA.raw','p7.TCsgRNA.raw','p7.TGsgRNA.raw','p7.TTsgRNA.raw','p7chargeraw','p7homo_energyraw','p7homo_lumo_energygapraw','p7molecular_volumeraw','p7num_aromaticbondsraw','p7num_atomsraw','p7num_Hatomsraw','p7num_Oatomsraw','p7num_Patomsraw','p7num_Satomsraw','p7num_Seatomsraw','p7relativenum_Hatomsraw','p7relativenum_Patomsraw','p7relativenum_Satomsraw','p7relativenum_Seatomsraw','p7rot_constants_yraw','p7rot_constants_zraw','p7xy_polarizabilityraw','p7xz_quadrupoleraw','p7yy_polarizabilityraw','p8.AAsgRNA.raw','p8.ACsgRNA.raw','p8.AGsgRNA.raw','p8.ATsgRNA.raw','p8.CAsgRNA.raw','p8.CCsgRNA.raw','p8.CGsgRNA.raw','p8.CTsgRNA.raw','p8.GAsgRNA.raw','p8.GCsgRNA.raw','p8.GGsgRNA.raw','p8.GTsgRNA.raw','p8.TAsgRNA.raw','p8.TCsgRNA.raw','p8.TGsgRNA.raw','p8.TTsgRNA.raw','p8chargeraw','p8homo_energyraw','p8homo_lumo_energygapraw','p8molecular_volumeraw','p8num_aromaticbondsraw','p8num_atomsraw','p8num_Hatomsraw','p8num_Oatomsraw','p8num_Patomsraw','p8num_Satomsraw','p8num_Seatomsraw','p8relativenum_Hatomsraw','p8relativenum_Patomsraw','p8relativenum_Satomsraw','p8relativenum_Seatomsraw','p8rot_constants_yraw','p8rot_constants_zraw','p8xy_polarizabilityraw','p8xz_quadrupoleraw','p8yy_polarizabilityraw','p9.AAsgRNA.raw','p9.ACsgRNA.raw','p9.AGsgRNA.raw','p9.ATsgRNA.raw','p9.CAsgRNA.raw','p9.CCsgRNA.raw','p9.CGsgRNA.raw','p9.CTsgRNA.raw','p9.GAsgRNA.raw','p9.GCsgRNA.raw','p9.GGsgRNA.raw','p9.GTsgRNA.raw','p9.TAsgRNA.raw','p9.TCsgRNA.
raw','p9.TGsgRNA.raw','p9.TTsgRNA.raw','p9chargeraw','p9homo_energyraw','p9homo_lumo_energygapraw','p9molecular_volumeraw','p9num_aromaticbondsraw','p9num_atomsraw','p9num_Hatomsraw','p9num_Oatomsraw','p9num_Patomsraw','p9num_Satomsraw','p9num_Seatomsraw','p9relativenum_Hatomsraw','p9relativenum_Patomsraw','p9relativenum_Satomsraw','p9relativenum_Seatomsraw','p9rot_constants_yraw','p9rot_constants_zraw','p9xy_polarizabilityraw','p9xz_quadrupoleraw','p9yy_polarizabilityraw','sgRNA.gcsgRNA.raw','sgRNA.structuresgRNA.raw','TAsgRNA.raw','TCsgRNA.raw','TGsgRNA.raw','TsgRNA.raw','TTsgRNA.raw','gatc.dwtd1.x','gatc.dwtd10.x','gatc.dwtd11.x','gatc.dwtd12.x','gatc.dwtd13.x','gatc.dwtd14.x','gatc.dwtd15.x','gatc.dwtd16.x','gatc.dwtd17.x','gatc.dwtd18.x','gatc.dwtd19.x','gatc.dwtd2.x','gatc.dwtd20.x','gatc.dwtd21.x','gatc.dwtd22.x','gatc.dwtd4.x','gatc.dwtd5.x','gatc.dwtd6.x','gatc.dwtd7.x','gatc.dwtd8.x','gatc.dwtd9.x','gatc.dwts22.x','gc.dwtd1.x','gc.dwtd10.x','gc.dwtd11.x','gc.dwtd12.x','gc.dwtd13.x','gc.dwtd14.x','gc.dwtd15.x','gc.dwtd16.x','gc.dwtd17.x','gc.dwtd18.x','gc.dwtd19.x','gc.dwtd2.x','gc.dwtd20.x','gc.dwtd21.x','gc.dwtd22.x','gc.dwtd3.x','gc.dwtd4.x','gc.dwtd5.x','gc.dwtd6.x','gc.dwtd7.x','gc.dwtd8.x','gc.dwtd9.x','gc.dwts22.x','gene.dwtd1.x','gene.dwtd10.x','gene.dwtd11.x','gene.dwtd12.x','gene.dwtd13.x','gene.dwtd14.x','gene.dwtd15.x','gene.dwtd16.x','gene.dwtd17.x','gene.dwtd18.x','gene.dwtd19.x','gene.dwtd2.x','gene.dwtd20.x','gene.dwtd21.x','gene.dwtd22.x','gene.dwtd3.x','gene.dwtd4.x','gene.dwtd5.x','gene.dwtd6.x','gene.dwtd7.x','gene.dwtd8.x','gene.dwtd9.x','gene.dwts22.x','ipd.dwtd1.x','ipd.dwtd10.x','ipd.dwtd11.x','ipd.dwtd12.x','ipd.dwtd13.x','ipd.dwtd14.x','ipd.dwtd15.x','ipd.dwtd16.x','ipd.dwtd17.x','ipd.dwtd18.x','ipd.dwtd19.x','ipd.dwtd2.x','ipd.dwtd20.x','ipd.dwtd21.x','ipd.dwtd22.x','ipd.dwtd3.x','ipd.dwtd4.x','ipd.dwtd5.x','ipd.dwtd6.x','ipd.dwtd7.x','ipd.dwtd8.x','ipd.dwtd9.x','ipd.dwts22.x','rna.dwtd1.x','rna.dwtd10.x','rna.dwtd11.x','rna.dwtd1
2.x','rna.dwtd13.x','rna.dwtd14.x','rna.dwtd15.x','rna.dwtd16.x','rna.dwtd17.x','rna.dwtd18.x','rna.dwtd19.x','structure.dwtd14.x','structure.dwtd15.x','structure.dwtd16.x','structure.dwtd17.x','structure.dwtd18.x','structure.dwtd19.x','structure.dwtd2.x','structure.dwtd20.x','structure.dwtd21.x','structure.dwtd3.x','structure.dwtd4.x','structure.dwtd5.x','structure.dwtd6.x','structure.dwtd7.x','structure.dwtd8.x','structure.dwtd9.x','structure.dwts22.x']]
# Split the data into train and test data:
# (no random_state is passed here, so the split uses NumPy's global RNG state)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
# Build the model with the random forest regression algorithm:
model = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=10)
model.fit(X_train, Y_train)
import shap
import matplotlib.pyplot as plt
# SHAP contributions of every feature for every training row.
shap_values = shap.TreeExplainer(model).shap_values(X_train)
# Mean |SHAP| bar chart.
# show=False keeps summary_plot from calling plt.show() itself; with an
# interactive backend that call can leave an empty canvas for savefig().
f = plt.figure()
shap.summary_plot(shap_values, X_train, plot_type="bar", show=False)
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_recacas9_summary_plot_bar.png", bbox_inches='tight', dpi=600)
plt.close(f)  # release the figure once it is on disk
# Default summary plot (one dot per sample and feature).
f = plt.figure()
shap.summary_plot(shap_values, X_train, show=False)
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_recacas9_summary_plot_varimp.png", bbox_inches='tight', dpi=600)
plt.close(f)
# directionality of feature importance
def ABS_SHAP(df_shap, df):
    """Draw a horizontal bar chart of mean |SHAP| per feature, coloured by direction.

    For each feature the Pearson correlation between its SHAP values and its
    raw values is computed; bars are red when that correlation is positive and
    blue otherwise (an undefined correlation, e.g. from a constant column, is
    treated as 0 and therefore drawn blue).

    Parameters
    ----------
    df_shap : array-like, shape (n_samples, n_features)
        SHAP values, columns in the same order as ``df.columns``.
    df : pandas.DataFrame
        Feature values the SHAP values were computed for.

    Returns
    -------
    The axes object returned by ``DataFrame.plot.barh`` (the original version
    returned nothing; callers that ignore the return value are unaffected).
    """
    # Put the SHAP values under the same column labels as the features.
    shap_v = pd.DataFrame(df_shap)
    feature_list = df.columns
    shap_v.columns = feature_list
    # Positional index so rows line up with shap_v regardless of df's own index.
    df_v = df.copy().reset_index(drop=True)
    # Determine the correlation in order to plot with different colors
    corr_list = list()
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in feature_list:
            b = np.corrcoef(shap_v[i], df_v[i])[1][0]
            corr_list.append(b)
    corr_df = pd.concat([pd.Series(feature_list), pd.Series(corr_list)], axis=1).fillna(0)
    # Make a data frame. Column 1 is the feature, and Column 2 is the correlation coefficient
    corr_df.columns = ['Variable', 'Corr']
    corr_df['Sign'] = np.where(corr_df['Corr'] > 0, 'red', 'blue')
    # Plot it
    shap_abs = np.abs(shap_v)
    k = pd.DataFrame(shap_abs.mean()).reset_index()
    k.columns = ['Variable', 'SHAP_abs']
    k2 = k.merge(corr_df, left_on='Variable', right_on='Variable', how='inner')
    # Ascending sort puts the most important feature at the top of a barh chart.
    k2 = k2.sort_values(by='SHAP_abs', ascending=True)
    colorlist = k2['Sign'].tolist()
    ax = k2.plot.barh(x='Variable', y='SHAP_abs', color=colorlist, figsize=(5, 6), legend=False)
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")
    return ax
# ABS_SHAP draws through pandas' .plot.barh(), which opens a figure of its own.
# A figure created beforehand would stay empty, so save the figure that is
# current after the call instead.
ABS_SHAP(shap_values,X_train)
f = plt.gcf()
f.savefig("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/shap_recacas9_summary_plot_abs.png", bbox_inches='tight', dpi=600)
# Load the conda helper and activate the "test" environment by path.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# Drop the header row, keep the first four columns, and sort by column 1
# then numerically by column 2.
sed '1d' sgRNA.coord.txt \
  | cut -f 1-4 \
  | sort -k 1,1 -k 2,2n \
  > sgRNA.coord.bed
# Closest GFF feature for every sgRNA; -D b appends a signed distance column
# reported with respect to the B (gene) record.
bedtools closest -D b \
  -a sgRNA.coord.bed \
  -b genome/GCF_000005845.2_ASM584v2_genomic.gene.gff \
  > sgRNA.gene.closest.bed
# R
library(dplyr)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")

# bedtools closest output: 4 sgRNA columns, 9 GFF columns, then the distance.
sgRNA.genes <- read.delim("sgRNA.gene.closest.bed", header = FALSE, sep = "\t")
colnames(sgRNA.genes) <- c(
  "chr.sgrna", "start.sgrna", "end.sgrna", "sgRNAID",
  "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info",
  "distance"
)

# "info" is split on spaces into gene/id (only the first two pieces are kept);
# "id" is then cut at ';' and only its first piece is kept as Gene_Symbol.
sgRNA.genes.df <- sgRNA.genes %>%
  separate("info", c("gene", "id"), sep = " ")
sgRNA.genes.id <- sgRNA.genes.df %>%
  separate("id", c("Gene_Symbol"), sep = ";")

# Label each sgRNA from the sign of the bedtools distance.
# NOTE(review): bedtools documents negative -D distances as "upstream"; here
# positive distances are labelled "promoter" -- confirm the intended sign.
sgRNA.genes.class <- sgRNA.genes.id %>%
  mutate(
    class = ifelse(distance > 0, "promoter",
            ifelse(distance == 0, "genic",
            ifelse(distance < 0, "downstream", "NA")))
  )

# For genic sgRNAs, tss.distance is measured from the gene start on "+" genes
# and from the gene end on "-" genes; otherwise the bedtools distance is kept.
# Genic sgRNAs are then binned: within one sgRNA length of the gene end (TTS)
# or start (TSS), else by quarter of the gene length.
sgRNA.genes.class.tss <- sgRNA.genes.class %>%
  mutate(
    sgRNA.length = abs(as.numeric(end.sgrna-start.sgrna)),
    gene.length = abs(as.numeric(end-start)),
    tss.distance = ifelse(
      class == "genic",
      ifelse(strand == "+", as.numeric(start.sgrna - start),
      ifelse(strand == "-", as.numeric(end - end.sgrna), NA)),
      distance
    )
  ) %>%
  mutate(
    class.quarter = ifelse(
      class == "genic",
      ifelse(abs(tss.distance-gene.length) < sgRNA.length, "TTS",
      ifelse(abs(tss.distance) < sgRNA.length, "TSS",
      ifelse(tss.distance/gene.length <= 0.25, "Q1",
      ifelse(tss.distance/gene.length <= 0.50, "Q2",
      ifelse(tss.distance/gene.length <= 0.75, "Q3",
      ifelse(tss.distance/gene.length <= 1, "Q4", "unknown")))))),
      class
    )
  )

# Attach the size of each class.quarter group to every row.
sgRNA.genes.class.count <- sgRNA.genes.class.tss %>%
  group_by(class.quarter) %>%
  mutate(class.count = n())
# Columns 20 and 21 are class.quarter and class.count.
sgRNA.genes.class.count.uniq <- unique(sgRNA.genes.class.count[, c(20, 21)])
# class.quarter class.count
# <chr> <int>
# 1 Q2 14051
# 2 Q3 13826
# 3 Q1 15536
# 4 Q4 11285
# 5 TSS 1697
# 6 TTS 111
# 7 promoter 8
# 8 downstream 1
# look at correlation of transcript location with efficiency score
score <- read.table("sgRNA.coord.txt", header = T, sep = "\t", stringsAsFactors = F)
colnames(score) <- c(
  "chr", "start", "end", "sgRNAID", "id", "seq",
  "id2", "cut.score", "gid", "change.val", "quality"
)
# Columns 4 and 8 under the names assigned just above.
score.df <- score[, c("sgRNAID", "cut.score")]
# Keep only sgRNAs that overlap a gene.
sgRNA.genes.class.count.genic <- subset(sgRNA.genes.class.count, class == "genic")
# Columns 4 and 20 of the classified table (sgRNAID and class.quarter).
sgRNA.class <- unique(sgRNA.genes.class.count.genic[, c(4, 20)])
sgRNA.class.score <- left_join(sgRNA.class, score.df, by = "sgRNAID")
library(ggplot2)
# viridis supplies scale_fill_viridis(); it has to be attached before the
# first plot that uses it, otherwise that plot fails with
# "could not find function".
library(viridis)

# Violin plot of cut.score per transcript-position class.
# print() makes the plot render even when this code is run via source().
pdf("sgRNA.transcript.location.score.violin.pdf")
print(
  ggplot(sgRNA.class.score, aes(x=class.quarter, y=cut.score, color=class.quarter)) +
    geom_violin() +
    theme_classic()
)
dev.off()

# Box plot, viridis fill, no legend.
pdf("sgRNA.transcript.location.score.box.pdf")
print(
  ggplot(sgRNA.class.score, aes(x=class.quarter, y=cut.score, fill=class.quarter)) +
    geom_boxplot() +
    scale_fill_viridis(discrete = TRUE, alpha=0.6) +
    theme_classic() +
    theme(legend.position="none", plot.title = element_text(size=11)) +
    ggtitle("sgRNA position relative to transcript") +
    xlab("")
)
dev.off()

# Same box plot with the individual points jittered on top.
pdf("sgRNA.transcript.location.score.boxplot.pdf")
print(
  ggplot(sgRNA.class.score, aes(x=class.quarter, y=cut.score, fill=class.quarter)) +
    geom_boxplot() +
    scale_fill_viridis(discrete = TRUE, alpha=0.6) +
    geom_jitter(color="black", size=0.4, alpha=0.9) +
    theme_classic() +
    theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) +
    ggtitle("sgRNA position relative to transcript") +
    xlab("")
)
dev.off()
# Reference links: http://homer.ucsd.edu/homer/motif/ https://www.ncbi.nlm.nih.gov/nuccore/1160889053?report=fasta
# Interactive HOMER run: activate the "homer" conda environment and search the
# Cas9 target FASTA for motifs.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# conda create --name homer homer=4.11 python=3.8 -c bioconda -c conda-forge
conda activate homer
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
# -p: do not fail when the output directory is left over from an earlier run
mkdir -p homer.output
# finding lots of CGCG-like motifs.... alter parameters
findMotifs.pl Ecoli.allCas9.fasta fasta homer.output -cpg
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J homer
#SBATCH -N 2
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
# Batch version of the HOMER motif search on the Cas9 target FASTA.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate homer
# Stop the job if the project directory is unreachable, so the outputs are
# never written relative to whatever directory the job started in.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli || exit 1
# -p: the directory may already exist from an interactive run
mkdir -p homer.output
# finding lots of CGCG-like motifs.... alter parameters
findMotifs.pl Ecoli.allCas9.fasta fasta homer.output -cpg
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/homer.sh
### top hit is TTTTCCCC in 44.7% of targets... look at correlation with score??
# Lines containing the exact top motif: count, then the sum of column 2
# (presumably the cut score -- confirm against the file header).
grep -c 'TTTTCCCC' Ecoli.allCas9.txt
# 51
awk '/TTTTCCCC/ {sum += $2} END {print sum}' Ecoli.allCas9.txt
# 857.19 / 51 = 16.80765
# Lines without the motif: count and column-2 sum.
grep -vc 'TTTTCCCC' Ecoli.allCas9.txt
# 126132
awk '!/TTTTCCCC/ {sum += $2} END {print sum}' Ecoli.allCas9.txt
# 2.24465e+06 / 126132 = 17.79604
# Relaxed motif (any two characters in the middle): column-2 sums with / without.
awk '/TTT..CCC/ {sum += $2} END {print sum}' Ecoli.allCas9.txt
# 11512.6 / 705 = 16.32993
awk '!/TTT..CCC/ {sum += $2} END {print sum}' Ecoli.allCas9.txt
# 2.234e+06 / 125478 = 17.80392
# NOTE: need to compile the C++ file /gpfs/alpine/syb105/proj-shared/Personal/jromero/codesnippets/ritw
# salloc -A SYB105 -p gpu -N 1 -t 4:00:00
# /gpfs/alpine/syb105/proj-shared/Personal/jromero/PathAnalysis/runRIT.sh
## cp /gpfs/alpine/syb105/proj-shared/Personal/jromero/PathAnalysis/runRIT.sh /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/runRIT.sh
# runRIT.sh feature name ### Note: name is name of the run and feature is the name of the y-value
### RIT for Cas9 model with all features
# Run RIT from inside the cut.score run directory.
rit_run_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/cut.score
cd "${rit_run_dir}"
# Arguments: <feature (y-value name)> <run name>
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/runRIT.sh \
  cut.score \
  dwt20bp.noncor2.cas9
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J RIT.run
#SBATCH -N 2
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
# Batch job: run RIT for the cut.score target of the dwt20bp.noncor2.cas9 run.
# Abort if the run directory is missing rather than launching runRIT.sh from
# the wrong working directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/cut.score || exit 1
# Arguments: <feature (y-value name)> <run name>
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/runRIT.sh cut.score dwt20bp.noncor2.cas9
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.cas9/cut.score/RIT.run
#### slurm output file:: slurm-142654.out
## ran out of time somehow... try to re-run using 2 nodes instead of 1... slurm-143401.out
## Jonathan looking at why it is taking so long...
# finding Importance of: sgRNA.gcsgRNA.raw
# BuildLineTime: 53077.010620355606
# ProcessLineTime: 0.03100299835205078
# Feature: sgRNA.gcsgRNA.raw time: 53077.61784052849
#### look at distribution of values for GC content in the dataset...
# How many distinct values does each slow-to-process feature take? Loads the
# Cas9 feature table and counts rows and unique values per column of interest.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp/")
features.cas9 <- read.delim("dwt20bp.noncor2.cas9.features.txt", header=TRUE, sep="\t")
# count of unique values
nrow(features.cas9)
# 40467
gc <- unique(features.cas9$sgRNA.gcsgRNA.raw)
length(gc)
# 14
# Do not store this in a variable called `T`: that overwrites the TRUE alias
# and makes every later `header=T` in the same session pass a numeric vector.
t.raw <- unique(features.cas9$TsgRNA.raw)
length(t.raw)
# 10
gc.dwt <- unique(features.cas9$gc.dwtd1.x)
length(gc.dwt)
# 5248
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
# scp /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/human/doench.2014.TableS7.txt noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/")
df <- read.delim("doench.2014.TableS7.txt", header=T, sep="\t")
colnames(df) <- c("sgRNAID", "nucleotide.sequence", "cut.score")
df2 <- df[,c(1,3,2)]
df.na <- na.omit(df2)
write.table(df.na, "Doench2014.txt", quote=F, row.names=F, sep="\t")
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/
sed '1d' Doench2014.txt | awk '{print ">"$1"\n"$3}' > Doench2014.fasta
### melting temp
# count nucleotides
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
python
from Bio import SeqIO
# Count A/C/G/T per FASTA record of Doench2014.fasta and write one tab-separated
# row per record to Doench2014_nuc_counts.tsv.
# The last column keeps its original "CG%" header but holds a fraction
# (C+G)/length, not a percentage; the header is left as-is because later steps
# read this file.
# When pasting into the interactive interpreter, keep the blank line that
# follows the indented block (it ends the compound statement).
with open('Doench2014.fasta', 'r') as input_file:
    with open('Doench2014_nuc_counts.tsv', 'w') as output_file:
        output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
        for cur_record in SeqIO.parse(input_file, "fasta"):
            gene_name = cur_record.name
            A_count = cur_record.seq.count('A')
            C_count = cur_record.seq.count('C')
            G_count = cur_record.seq.count('G')
            T_count = cur_record.seq.count('T')
            length = len(cur_record.seq)
            # an empty record would otherwise raise ZeroDivisionError
            cg_percentage = float(C_count + G_count) / length if length else 0.0
            output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
                (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
            output_file.write(output_line)

exit()
# Melting temperature(°C) = 64.9 + 41 * (nG+nC-16.4)/(nA+nT+nG+nC)
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
# Per-record base counts; the formula below uses the A, C, G and T columns.
nuc.counts <- read.delim("Doench2014_nuc_counts.tsv", header=TRUE, sep="\t")
# Tm = 64.9 + 41 * (nG + nC - 16.4) / (nA + nT + nG + nC), appended as the
# last column under the name MeltingTemp.
nuc.melt <- mutate(nuc.counts, MeltingTemp = 64.9 + 41 * (G+C-16.4) / (A+T+G+C))
write.table(nuc.melt, file="Doench2014.nuc.count.txt", quote=FALSE, row.names=FALSE, sep="\t")
q()
### structure
# Fold every guide with RNAfold and build a two-column table of record name and
# the numeric value extracted from each structure line.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate ViennaRNA
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
RNAfold < Doench2014.fasta > Doench2014.gRNA.ViennaRNA.output.txt
# lines containing "(" are taken as the structure lines; keep only the signed number on each
grep '(' Doench2014.gRNA.ViennaRNA.output.txt | grep -Eo '[+-]?[0-9]+([.][0-9]+)?' > Doench2014.gRNA.ViennaRNA.output.value.txt
# FASTA header lines, with the leading ">" removed, give the record names
grep '>' Doench2014.gRNA.ViennaRNA.output.txt | sed 's/>//g' > Doench2014.gRNA.names.txt
# The R step reads "Doench2014.vienna.structure.txt" with header=T, so the file
# must use that spelling (it was written as "veinna") and must start with a
# header row; without one the first guide is consumed as column names.
{
  printf 'sgRNAID\tstructure\n'
  paste Doench2014.gRNA.names.txt Doench2014.gRNA.ViennaRNA.output.value.txt
} > Doench2014.vienna.structure.txt
### onehot encoding
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/
cut -f 1,3 Doench2014.txt > Doench2014.noscore.txt
python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/encode_sequences.py Doench2014.noscore.txt
# separate nucleotide sequence values into individual columns in data frame so each position counts as one feature
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/
# split_onehot INFILE HEADER NFIELDS
#   Drops the first two lines of INFILE, puts a space after every character of
#   column 2 so each character becomes its own field, prepends HEADER as the
#   first line, and keeps space-delimited fields 1..NFIELDS (header included).
#   Requires GNU sed for the one-line "1i text" form.
split_onehot() {
  local infile=$1 header=$2 nfields=$3
  sed '1,2d' "$infile" \
    | awk '{gsub(/./,"& ",$2);print $0}' \
    | sed "1i ${header}" \
    | cut -d ' ' -f "1-${nfields}"
}
# Column labels come from bash brace expansion instead of hand-typed literals.
# The dep2 literal had been split across two lines, which put a newline inside
# the sed "1i" text and made the sed script invalid.
# Base order is A C T G; position labels run p1..p20.
hdr_ind1="sgRNAID $(echo {A,C,T,G})"
hdr_ind2="sgRNAID $(echo {A,C,T,G}{A,C,T,G})"
hdr_dep1="sgRNAID $(echo p{1..20}.{A,C,T,G})"
hdr_dep2="sgRNAID $(echo p{1..20}.{A,C,T,G}{A,C,T,G})"
split_onehot Doench2014.noscore_independent1.txt "$hdr_ind1" 5 > Doench2014_ind1.txt
split_onehot Doench2014.noscore_independent2.txt "$hdr_ind2" 17 > Doench2014_ind2.txt
# NOTE(review): the dep1 header names 80 feature columns, but only fields 1-21
# are kept (sgRNAID + p1.A..p5.G). 1-81 looks like the intended range; left
# unchanged here because it alters the downstream feature matrix -- confirm.
split_onehot Doench2014.noscore_dependent1.txt "$hdr_dep1" 21 > Doench2014_dep1.txt
split_onehot Doench2014.noscore_dependent2.txt "$hdr_dep2" 321 > Doench2014_dep2.txt
#### chemical tensors
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
sed '1d' Doench2014.noscore.txt | awk '{gsub(/./,"& ",$2);print $0}' | sed '1i sgRNAID p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 p16 p17 p18 p19 p20' | cut -d ' ' -f 1-21 > Doench2014.sequence.txt
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# Attach per-base chemical descriptors to every position of every guide and
# write the result in both wide (dcast) and long (melt) form.
library(dplyr)
library(reshape2)
# descriptor table lives in the e.coli project directory
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
tensor <- read.delim("protein_rna_dna-vector_lee_nucleotide_dna_data.txt", header=T, sep="\t", stringsAsFactors = F)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/")
# space-separated table: sgRNAID followed by one base per position column
# NOTE: `seq` shadows the name of base::seq for the rest of the session
seq <- read.delim("Doench2014.sequence.txt", header=T, sep=" ", stringsAsFactors = F)
# tensor.features is assigned here but not used again in this section
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
tensor.df <- tensor[,2:5]
# transpose so each of the four selected columns becomes one row
tensor.t <- as.data.frame(t(tensor.df))
# assumes columns 2-5 of the descriptor file are ordered A, C, G, T -- TODO confirm
tensor.t$base <- c("A", "C", "G", "T")
rownames(seq) <- seq[,1]
# seq.df is assigned here but not used again in this section
seq.df <- seq[,2:21]
# long form: one row per (sgRNAID, position) holding the base at that position
seq.melt <- melt(seq, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# look up the descriptor row for each base
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# wide form: one row per sgRNAID, one column per position/descriptor pair
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "Doench2014.tensors.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "Doench2014.tensors.melt.txt", quote=F, row.names=F, sep="\t")
# salloc -A SYB105 -p gpu -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
structure <- read.delim("Doench2014.vienna.structure.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("Doench2014.nuc.count.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("Doench2014.txt", header=T, sep="\t", stringsAsFactors = F)
score.df <- score[,c(1:2)]
colnames(score.df) <- c("sgRNAID", "cut.score")
structure.df <- structure[,2]
gc.df <- nuc[,7]
temp.df <- nuc[,8]
# Run DWT
# modwt.long(): apply wavMODWT to a numeric vector and return the coefficients
# as a three-column data frame. The coefficient row labels are split on runs of
# non-alphanumeric characters into two parts, which become the "scale" and
# "window" columns; the coefficient column is named by `value.name`.
modwt.long <- function(signal, value.name) {
  coef.mat <- as.matrix(wavMODWT(signal))
  labelled <- data.frame(label = row.names(coef.mat), coef.mat)
  parts <- separate(labelled, label, c("name", "number"), sep = "[^[:alnum:]]+")
  colnames(parts) <- c("scale", "window", value.name)
  parts
}
temp.modwt.name <- modwt.long(temp.df, "temp.dwt")
gc.modwt.name <- modwt.long(gc.df, "gc.dwt")
structure.modwt.name <- modwt.long(structure.df, "structure.dwt")
window <- data.frame(score.df[,1])
window$window <- seq.int(nrow(window))
window$window <- as.character(window$window-1)
colnames(window) <- c("sgRNAID", "window")
library(tidygenomics)
window.score.df <- left_join(score.df, window, by=c("sgRNAID"))
window.score.temp <- left_join(window.score.df, temp.modwt.name, by="window")
window.temp.gc <- left_join(window.score.temp, gc.modwt.name, by=c("window", "scale"))
window.temp.gc.structure <- left_join(window.temp.gc, structure.modwt.name, by=c("window", "scale"))
window.temp.gc.structure.sgRNA <- subset(window.temp.gc.structure, window.temp.gc.structure$cut.score != "NA")
df.melt <- melt(window.temp.gc.structure.sgRNA[,c(1:2,4:7)], id=c("cut.score", "scale", "sgRNAID"))
df <- na.omit(df.melt)
# add sgRNA raw data
structure.df <- data.frame(structure[,2])
gc.df <- data.frame(nuc[,7])
temp.df <- data.frame(nuc[,8])
structure.df$scale <- "sgRNA.raw"
gc.df$scale <- "sgRNA.raw"
temp.df$scale <- "sgRNA.raw"
structure.df$sgRNAID <- structure[,1]
gc.df$sgRNAID <- nuc[,1]
temp.df$sgRNAID <- nuc[,1]
window.score.structure <- left_join(window.score.df, structure.df, by="sgRNAID")
window.score.structure.temp <- left_join(window.score.structure, temp.df, by=c("sgRNAID", "scale"))
window.score.structure.temp.gc <- left_join(window.score.structure.temp, gc.df, by=c("sgRNAID", "scale"))
colnames(window.score.structure.temp.gc) <- c("sgRNAID", "cut.score", "seq", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
## add one-hot encoding of sequence
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
onehot.ind1 <- read.delim("Doench2014_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("Doench2014_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("Doench2014_dep1.txt", header=T, sep=" ")
onehot.dep2 <- read.delim("Doench2014_dep2.txt", header=T, sep=" ")
onehot.dep2 <- onehot.dep2[,1:305]
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
onehot$scale <- "sgRNA.raw"
data.onehot <- left_join(window.score.structure.temp.gc, onehot, by=c("sgRNAID", "scale"))
df2.melt <- melt(data.onehot[,c(1,2,4:ncol(data.onehot))], id=c("cut.score", "scale", "sgRNAID"))
df2 <- na.omit(df2.melt)
colnames(df) <- c("cut.score", "scale", "sgRNAID", "variable", "value")
df <- rbind(df, df2)
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
df.id$value <- as.numeric(df.id$value)
df.id <- df.id[!(is.na(df.id$value) | df.id$value==""), ]
colnames(df.id) <- c("cut.score", "feature.scale", "sgRNAID", "value")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
tensor <- read.delim("Doench2014.tensors.melt.txt", header=T, sep="\t")
tensor[is.na(tensor)] <- 0
tensor$scale <- "raw"
tensor.id <- tensor %>% unite(feature.scale, c(position, variable, scale), sep = "")
tensor.id$value <- as.numeric(tensor.id$value)
tensor.id[is.na(tensor.id)] <- 0
df.score <- unique(df.id[,c(1,3)])
tensor.score <- inner_join(tensor.id, df.score, by="sgRNAID")
tensor.score.order <- tensor.score[,c(5,2,1,4)]
head(df.id)
head(tensor.score.order)
tensor.df <- rbind(df.id, tensor.score.order)
write.table(tensor.df, "Doench2014v2.DWT.raw.onehot.tensor.txt", quote=F, row.names=F, sep="\t")
# Wide feature matrix: one row per (sgRNAID, cut.score), one column per
# feature.scale; duplicate cells are averaged with NAs ignored.
df.dcast <- tensor.df %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
write.table(df.dcast, "Doench2014v2.DWT.raw.onehot.tensor.dcast.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast)
#
# Complete-case version. The ".na" file must be written from df.dcast.na;
# writing df.dcast here produced a second copy of the unfiltered table.
df.dcast.na <- na.omit(df.dcast)
write.table(df.dcast.na, "Doench2014v2.DWT.raw.onehot.tensor.dcast.na.txt", quote=F, row.names=F, sep="\t")
nrow(df.dcast.na)
#
#
# Split the wide table into feature and score files, each with and without the
# sgRNAID column. Column 1 is sgRNAID and column 2 is cut.score (the two id
# variables of the dcast that produced this file); the rest are features.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
df.dcast <- read.delim("Doench2014v2.DWT.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t")
# features with IDs: drop only the score column
df.features <- df.dcast[,c(1,3:ncol(df.dcast))]
# features without IDs
df.features.nolabel <- df.dcast[,c(3:ncol(df.dcast))]
# scores with IDs
df.scores <- df.dcast[,c(1,2)]
# scores without IDs, kept as a one-column data frame named cut.score
df.scores.nolabel <- as.data.frame(df.dcast[,c(2)])
colnames(df.scores.nolabel) <- "cut.score"
# NOTE(review): these Doench2014 (human) tables are written under the E. coli
# .../all.features/Yeast/ directory -- confirm this target is intended.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/")
write.table(df.features, "Doench2014v2_features.txt", quote=F, row.names=F, sep="\t")
write.table(df.scores, "Doench2014v2_score.txt", quote=F, row.names=F, sep="\t")
write.table(df.features.nolabel, "Doench2014v2_features_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(df.scores.nolabel, "Doench2014v2_score_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# Summit
module load r/4.0.5
# Andes
# salloc -A SYB105 -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human")
df <- read.delim("Doench2014v2.DWT.raw.onehot.tensor.dcast.na.txt", header=T, sep="\t")
df <- na.omit(df)
## sample ID
df.features <- df[,c(1,3:ncol(df))]
df.score <- df[,1:2]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run")
write.table(as.matrix(df.features), "doench2014v2.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(as.matrix(df.score), "doench2014v2.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(as.matrix(df.features), "doench2014v2.features.txt", quote=F, row.names=F, sep="\t")
write.table(as.matrix(df.score), "doench2014v2.score.txt", quote=F, row.names=F, sep="\t")
## no sample ID
## <DataFile>_overlap_noSampleIDs.txt & <YFile>_overlap_noSampleIDs.txt
df.features <- df[,c(3:ncol(df))]
df.score <- data.frame(df[,2])
colnames(df.score) <- "cut.score"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run")
write.table(as.matrix(df.features), "doench2014v2.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
write.table(as.matrix(df.score), "doench2014v2.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
mkdir iRF.run/oct
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName doench2014 --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/doench2014v2.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/doench2014v2.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct/Submits/submit_full_doench2014_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct/Submits/submit_train_doench2014_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct/Submits/submit_test_doench2014_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct/
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt doench2014
# R2=0.43814069711165693
sort -k3rg topVarEdges/cut.score_top95.txt | head
# GGsgRNA.raw cut.score 0.04166905753460318
# structure.dwtd3 cut.score 0.03503175608150623
# structure.dwtd6 cut.score 0.02594435983776593
# p20num_ringsraw cut.score 0.024228442807021116
# temp.dwtd7 cut.score 0.020945405180766236
# p20relativenum_Hatomsraw cut.score 0.02078896706814056
# gc.dwtd7 cut.score 0.017123989754931384
# p16.CCsgRNA.raw cut.score 0.01673902223353918
# structure.dwtd2 cut.score 0.016693748899562717
# structure.dwtd4 cut.score 0.01617743103660186
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("doench2014_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.6469806
# spearman correlation
cor(y$cut.score, pred$Predictions., method=c("spearman"))
# 0.6440906
library(tidyr)
library(ggplot2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct/normalizedEdgeFiles")
feature <- read.delim("cut.score_Normalize.txt", header=F, sep="\t")
feature.order <- feature[order(-feature$V3),]
feature.order.top <- feature.order[1:25,]
feature.order.top$id <- feature.order.top$V1
df <- separate(feature.order.top, V1, c("feature", "type"))
pdf("doench2014.v2.importance.pdf")
ggplot(df, aes(x = reorder(id, -V3), y = V3)) + geom_bar(stat = "identity") + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct/normalizedEdgeFiles/doench2014.v2.importance.pdf /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/human/.
--> try on e.coli-based model
### Summit
#!/bin/bash -l
#BSUB -P SYB105
#BSUB -W 02:15
#BSUB -nnodes 50
#BSUB -J doench2014_ecoli_test_0
#BSUB -o doench2014_ecoli_test_0.o%J
#BSUB -e doench2014_ecoli_test_0.e%J
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct
#/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/doench2014v2.features_overlap_noSampleIDs.txt #/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/doench2014v2.score_overlap_noSampleIDs.txt
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/doench2014v2.features_overlap_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/doench2014v2.score_overlap_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna/cut.score/foldRuns/fold9/Runs/Set4/dwt20bp.noncor2.vienna_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix doench2014_ecoli_test --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct/doench2014_ecoli_test.o
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/doench2014_ecoli_submit.sh
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run")
score <- read.delim("doench2014v2.score_overlap_noSampleIDs.txt", header=T, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct")
predict <- read.delim("doench2014_ecoli_test.prediction", header=T, sep="\t")
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# -0.1896705
pdf("Doench2014.ecoli.oct.prediction.scatter.pdf")
library(ggplot2)
ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic()
dev.off()
--> try removing DWT
# R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run")
features.overlap <- read.delim("doench2014v2.features_overlap.txt", header=T, sep = "\t")
features <- read.delim("doench2014v2.features.txt", header=T, sep = "\t")
features.overlap.noID <- read.delim("doench2014v2.features_overlap_noSampleIDs.txt", header=T, sep = "\t")
names(features)
names(features.overlap)
names(features.overlap.noID)
write.table(features[,c(1:12,24:1594,1606:1607,1619:1621)], "doench2014v2.noDWT.features.txt", quote=F, row.names=F, sep="\t")
write.table(features.overlap[,c(1:12,24:1594,1606:1607,1619:1621)], "doench2014v2.noDWT.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(features.overlap.noID[,c(1:11,23:1593,1605:1606,1618:1620)], "doench2014v2.noDWT.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human
mkdir iRF.run/oct.noDWT
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct.noDWT
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName doench2014.noDWT --bypass --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/doench2014v2.noDWT.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/doench2014v2.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct.noDWT
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct.noDWT/Submits/submit_full_doench2014.noDWT_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct.noDWT/Submits/submit_train_doench2014.noDWT_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct.noDWT/Submits/submit_test_doench2014.noDWT_0.sh
# Andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct.noDWT/
module load python/3.7-anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct.noDWT/
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/YNames.txt doench2014.noDWT
# R2=
sort -k3rg topVarEdges/cut.score_top95.txt | head
# pearson correlation
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/human/iRF.run/oct.noDWT/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("doench2014.noDWT_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.6469806
# spearman correlation
cor(y$cut.score, pred$Predictions., method=c("spearman"))
#
--> add PAM
### Summit
#/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna/cut.score/foldRuns/fold9/Runs/Set4/dwt20bp.noncor2.vienna_cut.score.forest
#!/bin/bash -l
#BSUB -P SYB105
#BSUB -W 02:15
#BSUB -nnodes 50
#BSUB -J Yeast.test_0
#BSUB -o Yeast.test_0.o%J
#BSUB -e Yeast.test_0.e%J
#mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast
/usr/bin/time -f "%e" jsrun -n 1 -a 1 -c 40 -bpacked:40 /gpfs/alpine/syb105/proj-shared/Projects/iRF/IterativeRanger/cpp_version/build/ranger --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Yeast_test_noSampleIDs.txt --yfile /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Y_test_noSampleIDs.txt --predict /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/dwt20bp.noncor2.vienna/cut.score/foldRuns/fold9/Runs/Set4/dwt20bp.noncor2.vienna_cut.score.forest --treetype 3 --depvarname cut.score --impmeasure 1 --nthreads 160 --useMPI 0 --outprefix Yeast_oct --outputDirectory /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Yeast_oct_test.o
# bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/Yeast_oct_submit.sh
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/iRF.run/Ecoli.allCas9/all.features/Yeast/")
score <- read.delim("Y_test_noSampleIDs.txt", header=T, sep="\t")
predict <- read.delim("Yeast_oct_test.prediction", header=T, sep="\t")
score.predict <- cbind(score, predict)
cor(score.predict$cut.score, score.predict$Predictions.)
# -0.006386167
pdf("Yeast.oct.prediction.scatter.pdf")
library(ggplot2)
ggplot(score.predict, aes(x=cut.score, y=Predictions.)) + geom_point() + theme_classic()
dev.off()
# https://github.com/jaswindersingh2/SPOT-RNA
# salloc -A SYB105 -N 2 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate venv
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J spot.rna
#SBATCH -N 2
#SBATCH -t 48:00:00
# Batch job: run SPOT-RNA on the sgRNA FASTA and write its output under
# spot.rna/ inside the e.coli project directory.
# Abort if the project directory is unreachable so the input is not looked up,
# and output not written, relative to the wrong directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli || exit 1
# make sure the output directory exists; -p keeps re-submissions from failing
mkdir -p spot.rna
python3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/SPOT-RNA/SPOT-RNA.py --inputs sgRNA.seq.fa --outputs 'spot.rna/'
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/spot.rna.sh
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
conda install -c bioconda viennarna
https://github.com/zhangchonglab/sgRNA-cleavage-activity-prediction/blob/master/sgRNA_features.py
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
sed '1d' DataS1.txt | awk '{ printf ">%s\n%s\n",$1,$2 }' > sgRNA.seq.fa
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/samtools/samtools faidx sgRNA.seq.fa
python ecoli.manuscript.features.py sgRNA.seq.fa
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
#git clone https://github.com/zhangchonglab/sgRNA-cleavage-activity-prediction.git
#ecoli_configure.txt
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# conda create -n sgRNA python=2.7.18
conda activate sgRNA
# conda install -c conda-forge biopython
# conda install -c conda-forge matplotlib
# conda install -c conda-forge numpy
# conda install -c free scikit-learn
# conda install -c conda-forge pandas
# Enter the tool directory BEFORE staging the input: the copy target is a
# relative path, so running cp first drops ecoli_sgRNA.fasta into whatever
# directory the session happened to be in. The later NGGN runs in this file
# likewise cd here first and then create their FASTA.
# NOTE(review): ecoli_configure.txt presumably refers to ecoli_sgRNA.fasta by
# relative path -- confirm.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction/
cp /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/sgRNA.seq.fa ecoli_sgRNA.fasta
python sgRNA_activity_predict_main.py ecoli_configure.txt
# featureName,featureValue=feature(sgrnaSequence,NGGNSequence)
## need to alter the sgRNA_features.py sequence criteria... only inputting the sgRNA sequence, not the NGGN
# ValueError: Number of features of the model must match the input. Model n_features is 425 and input n_features is 389
## doesn't work because the model uses N4N20NGGN4 for the input fasta file and I only have N20... so how did they do it with this data... DataS2?
"Every potential sgRNA (N20NGG) targeting the two strands of each promoter was checked accordingly until two sgRNAs were extracted or the 3′ end of the promoter sequence was reached. To design the sgRNA library for RBSs throughout the E. coli genome, 4140 RBS sequences (Data S2) for every protein-coding gene (N30 + start codon + N17, N50 in total) were extracted, and a similar procedure was applied as described above to design sgRNA for these regions. The sequences of promoter and RBS sgRNAs are summarized in Data S1, whereas library metrics and entry sequences for these two libraries are shown in Data S2."
--> just test it by adding NGGN to the end of each sequence... something weird with file when trying to do awk/sed commands so alter in R... add "AGGT" to end of sequence
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
R
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
#df <- read.delim("DataS1.txt", header=T, sep="\t")
df <- read.delim("DataS1.rbs.txt", header=T, sep="\t")
df$N4 <- "ATCG"
df$NGGN4 <- "AGGTAAAA"
df.id <- df %>% unite(seq, c(N4, nucleotide.sequence), sep="") %>% unite(seq, c(seq, NGGN4), sep="")
#write.table(df.id, "DataS1.NGGN.txt", quote=F, row.names=F, sep="\t")
write.table(df.id, "DataS1.rbs.NGGN.txt", quote=F, row.names=F, sep="\t")
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate sgRNA
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction/
sed '1d' /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/DataS1.NGGN.txt | awk '{ printf ">%s\n%s\n",$1,$2 }' > ecoli_sgRNA_NGGN.fasta
python sgRNA_activity_predict_main.py ecoli_test_configure.txt
sed '1d' /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/DataS1.rbs.NGGN.txt | awk '{ printf ">%s\n%s\n",$1,$2 }' > ecoli_sgRNA_rbs_NGGN.fasta
python sgRNA_activity_predict_main.py ecoli_test_rbs_configure.txt
# check correlation between output and actual scores...
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction/")
#output <- read.delim("ecoli_result.txt", header=T, sep="\t")
output <- read.delim("ecoli.rbs_result.txt", header=T, sep="\t")
colnames(output) <- c("sgRNA", "pred.score")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
#score <- read.delim("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("sgRNA.rbs.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(4,8)]
library(dplyr)
score.output <- left_join(score.df, output, by="sgRNA")
score.output.na <- na.omit(score.output)
cor(score.output.na$cut.score,score.output.na$pred.score)
# 0.6667238
# RBS: 0.6726381
# test correlation between output scores and my method output scores...
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
df.dcast <- read.delim("ecoli.features.sgRNA.DWT.raw.onehot.sgRNAraw.dcast.txt", header=T, sep="\t")
df.raw <- df.dcast[,c(17:24,39,54)]
df.dwt <- df.dcast[,c(3:16,25:38,40:53)]
xmat = cbind(df.raw, df.dwt)
y = df.dcast$cut.score
tmp <- cbind(xmat, Y = y)
wt <- rep(1/ncol(xmat), ncol(xmat))
df.rf.all <- ranger(dependent.variable.name = "Y", data=tmp, num.trees=500, split.select.weights = wt, classification=F, mtry=ncol(df.raw), importance="impurity_corrected", num.threads=1, write.forest=T, always.split.variables=NULL)
df.output <- data.frame(sgRNA=df.dcast$sgRNA, cut.score=df.dcast$cut.score, ranger.score=df.rf.all$predictions)
score.output <- inner_join(df.output, output, by="sgRNA")
score.output.na <- na.omit(score.output)
cor(score.output.na$ranger.score,score.output.na$pred.score)
# 0.2396925
cor(score.output.na[,2:4])
# cut.score ranger.score pred.score
# cut.score 1.0000000 0.2863392 0.6666586
# ranger.score 0.2863392 1.0000000 0.2396925
# pred.score 0.6666586 0.2396925 1.0000000
## additional information that may be helping... larger N (should I use all 3 datasets... Cas9, eSpCas9 and Cas9 (ΔrecA)?), PAM (NGGN) sequence,
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate sgRNA
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction-noNGGN/
python sgRNA_activity_predict_main.py ecoli_configure.txt
# check correlation between output and actual scores...
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction-noNGGN/")
output <- read.delim("ecoli.noNGGN_result.txt", header=T, sep="\t")
colnames(output) <- c("sgRNA", "pred.score")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli")
score <- read.delim("sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
colnames(score) <- c("chr", "start", "end", "sgRNA", "id", "seq", "id2", "cut.score", "gid", "change.val", "quality")
score.df <- score[,c(4,8)]
library(dplyr)
score.output <- left_join(score.df, output, by="sgRNA")
score.output.na <- na.omit(score.output)
cor(score.output.na$cut.score,score.output.na$pred.score)
### can't run it because the training data had the NGGN???
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate sgRNA
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction/
sed '1d' /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/DataS1.NGGN.txt | awk '{ printf ">%s\n%s\n",$1,$2 }' > ecoli_sgRNA_NGGN.fasta
python sgRNA_activity_predict_main.py ecoli_test_configure.txt
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
seq <- read.delim("DataS1.NGGN.txt", header=T, sep="\t")
score <- read.delim("DataS4.txt", header=T, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction")
feature <- read.delim("ecoli_feature.txt", header=T, sep=",", stringsAsFactors = F)
library(dplyr)
feature.id <- feature[,1:2]
feature.score <- inner_join(feature.id, score, by="sgRNAID")
feature.input <- subset(feature, feature$sgRNAID %in% feature.score$sgRNAID)
# iRF
# Iterative random forest built on ranger.
#
# Fits up to `iter` forests. After each fit the (impurity-corrected) variable
# importances are rescaled and fed back as split.select.weights for the next
# fit; mtry is set to the number of variables that still carry weight > 0.
#
# Arguments:
#   xmat           predictor table (columns are the candidate variables)
#   y              response vector, bound to xmat as column "Y"
#   ntree          number of trees per forest
#   iter           maximum number of iterations
#   classification passed through to ranger
#   threads        passed to ranger as num.threads
#   alwayssplits   passed to ranger as always.split.variables
#   saveall        TRUE: return a list with the forest of every iteration;
#                  FALSE: return only the last forest that was fitted
#
# Returns: see `saveall`. An empty list is returned when iter < 1.
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal sample weighting per SNP
  rfs <- list()
  rf <- NULL
  # stop early once fewer than max(1% of predictors, 10) variables keep weight
  min.vars <- max(0.01*(ncol(xmat)-1), 10)

  # seq_len() so that iter = 0 runs no iteration (1:0 would run two)
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    mtry <- 1*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)

    # scale importances to range(0,1); guard the all-zero case, which would
    # otherwise produce NaN weights and an NA stopping condition
    imp <- rf$variable.importance
    imp.total <- sum(abs(imp))
    wt <- if(imp.total > 0) imp / imp.total else rep(0, length(imp))
    wt[wt<0] <- 0 # set negative weights to zero

    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    # cor() needs numeric input; factor predictions would make it fail
    if(is.numeric(rf$predictions) && is.numeric(y)) cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")

    if(saveall) rfs[[i]] <- rf
    if(sum(wt>0) < min.vars) break
  }

  # Without saveall the caller gets the last forest whether or not the loop
  # stopped early (previously an empty list came back when it ran to the end).
  if(!saveall && !is.null(rf)) rfs <- rf
  return(rfs)
}
xmat = feature.input[,2:ncol(feature.input)]
y = feature.score$score
iRF(xmat, y)
# mtry: 425
# prediction error: 82.85728
# r^2: 0.2502604
# cor(y,yhat): 0.5028285
# SNPs with importance > 0: 251
### remove NGGN features and re-run
xmat = feature.input[,2:405]
y = feature.score$score
iRF(xmat, y)
# mtry: 404
# prediction error: 82.61081
# r^2: 0.2524905
# cor(y,yhat): 0.5054347
# SNPs with importance > 0: 241
# write a script to find the sgRNA sequence from DataS1.rbs in DataS2.rbs and then pull the sequence + 4bp (NGGN) into dataframe
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
seq1 <- read.delim("DataS1.rbs.txt", header=T, sep="\t")
seq2 <- read.delim("DataS2.rbs.txt", header=T, sep="\t")
score <- read.delim("DataS4.txt", header=T, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction")
feature <- read.delim("ecoli.fullseq_feature.txt", header=T, sep=",", stringsAsFactors = F)
feature.id <- feature[,1:2]
colnames(feature.id) <- c("RBS.ID", "order1_IP_A")
library(dplyr)
library(tidyr)
seq1$RBS <- seq1$sgRNAID
seq1.rbs <- separate(seq1, RBS, c("RBS.ID", "number"), sep="_")
seq.rbs <- inner_join(seq1.rbs, seq2, by="RBS.ID")
seq.rbs.score <- inner_join(seq.rbs, score, by="sgRNAID")
library(stringr)
seq.rbs.score.coord <- seq.rbs.score %>% mutate(sgRNA.coord=str_locate(nucleotide.sequence.y, nucleotide.sequence.x))
seq.rbs.score.coord$fullseq <- substr(seq.rbs.score.coord$nucleotide.sequence.y, seq.rbs.score.coord$sgRNA.coord[,1]-4, seq.rbs.score.coord$sgRNA.coord[,2]+8)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
write.table(na.omit(seq.rbs.score.coord[,c(1,8)]), "rbs.fullseq.txt", quote=F, row.names=F, sep="\t")
write.table(seq.rbs.score.coord[,c(1,3,8,6)], "rbs.fullseq.score.txt", quote=F, row.names=F, sep="\t")
######
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate sgRNA
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction/
sed '1d' /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/rbs.fullseq.txt | awk '{ printf ">%s\n%s\n",$1,$2 }' > ecoli_sgRNA_fullseq.fasta
python sgRNA_activity_predict_main.py ecoli_fullseq_configure.txt
######
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
seq <- read.delim("rbs.fullseq.txt", header=T, sep="\t")
score <- read.delim("rbs.fullseq.score.txt", header=T, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction")
feature <- read.delim("ecoli.fullseq_feature.txt", header=T, sep=",", stringsAsFactors = F)
feature.id <- feature[,1:2]
colnames(feature.id) <- c("sgRNAID", "order1_IP_A")
library(dplyr)
library(tidyr)
feature.score <- inner_join(feature.id, score[,c(1,4)], by="sgRNAID")
colnames(feature)[1] <- "sgRNAID"
feature.input <- subset(feature, feature$sgRNAID %in% feature.score$sgRNAID)
# iRF
# Iterative random forest built on ranger.
#
# Fits up to `iter` forests. After each fit the (impurity-corrected) variable
# importances are rescaled and fed back as split.select.weights for the next
# fit; mtry is set to the number of variables that still carry weight > 0.
#
# Arguments:
#   xmat           predictor table (columns are the candidate variables)
#   y              response vector, bound to xmat as column "Y"
#   ntree          number of trees per forest
#   iter           maximum number of iterations
#   classification passed through to ranger
#   threads        passed to ranger as num.threads
#   alwayssplits   passed to ranger as always.split.variables
#   saveall        TRUE: return a list with the forest of every iteration;
#                  FALSE: return only the last forest that was fitted
#
# Returns: see `saveall`. An empty list is returned when iter < 1.
iRF <- function(xmat, y, ntree=500, iter=5, classification=F, threads=1,alwayssplits=NULL, saveall=T)
{
  tmp <- cbind(xmat, Y = y)
  wt <- rep(1/ncol(xmat), ncol(xmat)) # start with equal sample weighting per SNP
  rfs <- list()
  rf <- NULL
  # stop early once fewer than max(1% of predictors, 10) variables keep weight
  min.vars <- max(0.01*(ncol(xmat)-1), 10)

  # seq_len() so that iter = 0 runs no iteration (1:0 would run two)
  for(i in seq_len(iter))
  {
    cat("\niRF iteration ",i,"\n")
    cat("=================\n")
    mtry <- 1*sum(wt>0)
    rf <- ranger::ranger(dependent.variable.name = "Y", data = tmp, num.trees=ntree,
                         split.select.weights = wt, classification = classification,
                         mtry = mtry, importance = "impurity_corrected", num.threads=threads, write.forest = T,
                         always.split.variables = alwayssplits)

    # scale importances to range(0,1); guard the all-zero case, which would
    # otherwise produce NaN weights and an NA stopping condition
    imp <- rf$variable.importance
    imp.total <- sum(abs(imp))
    wt <- if(imp.total > 0) imp / imp.total else rep(0, length(imp))
    wt[wt<0] <- 0 # set negative weights to zero

    cat("mtry: ", mtry, "\n")
    cat("prediction error: ",rf$prediction.error,"\n")
    if(classification==FALSE) cat("r^2: ",rf$r.squared,"\n")
    if(classification==TRUE) print(rf$confusion.matrix)
    # cor() needs numeric input; factor predictions would make it fail
    if(is.numeric(rf$predictions) && is.numeric(y)) cat("cor(y,yhat): ",cor(rf$predictions,y),"\n")
    cat("SNPs with importance > 0:",sum(wt>0),"\n")

    if(saveall) rfs[[i]] <- rf
    if(sum(wt>0) < min.vars) break
  }

  # Without saveall the caller gets the last forest whether or not the loop
  # stopped early (previously an empty list came back when it ran to the end).
  if(!saveall && !is.null(rf)) rfs <- rf
  return(rfs)
}
xmat = feature.input[,2:ncol(feature.input)]
feature.score.input <- subset(feature.score, feature.score$sgRNAID %in% feature.input$sgRNAID)
y = feature.score.input$score
iRF(xmat, y)
# iRF iteration 1
# =================
# mtry: 425
# prediction error: 109.0247
# r^2: 0.1081563
# cor(y,yhat): 0.3290085
# SNPs with importance > 0: 239
# iRF iteration 2
# =================
# mtry: 239
# prediction error: 105.54
# r^2: 0.1366616
# cor(y,yhat): 0.3730707
# SNPs with importance > 0: 162
# iRF iteration 3
# =================
# mtry: 162
# prediction error: 105.2015
# r^2: 0.1394308
# cor(y,yhat): 0.375165
# SNPs with importance > 0: 122
# iRF iteration 4
# =================
# mtry: 122
# prediction error: 103.1404
# r^2: 0.1562909
# cor(y,yhat): 0.3956245
# SNPs with importance > 0: 105
# iRF iteration 5
# =================
# mtry: 105
# prediction error: 102.7037
# r^2: 0.1598635
# cor(y,yhat): 0.4015379
# SNPs with importance > 0: 93
# gbm(
# formula = formula(data),
# distribution = "bernoulli",
# data = list(),
# weights,
# var.monotone = NULL,
# n.trees = 100,
# interaction.depth = 1,
# n.minobsinnode = 10,
# shrinkage = 0.1,
# bag.fraction = 0.5,
# train.fraction = 1,
# cv.folds = 0,
# keep.data = TRUE,
# verbose = FALSE,
# class.stratify.cv = NULL,
# n.cores = NULL
# )
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
seq <- read.delim("DataS1.NGGN.txt", header=T, sep="\t")
score <- read.delim("DataS4.txt", header=T, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction")
feature <- read.delim("ecoli_feature.txt", header=T, sep=",", stringsAsFactors = F)
library(dplyr)
feature.id <- feature[,1:2]
feature.score <- inner_join(feature.id, score, by="sgRNAID")
feature.input <- subset(feature, feature$sgRNAID %in% feature.score$sgRNAID)
feature.data <- feature.input[,2:ncol(feature.input)]
score.data <- feature.score$score
library(gbm)
gbm.df <- gbm(formula=score.data ~ ., data=feature.data, distribution = "gaussian", n.trees = 500, shrinkage = 0.1,
interaction.depth = 3, bag.fraction = 0.2, train.fraction = 0.8,
n.minobsinnode = 10, cv.folds = 5, keep.data = TRUE,
verbose = FALSE, n.cores = 1)
best.iter <- gbm.perf(gbm.df, method = "OOB")
print(best.iter)
best.iter <- gbm.perf(gbm.df, method = "cv")
print(best.iter)
summary(gbm.df, n.trees = best.iter)
# var rel.inf
# order1_P20_T order1_P20_T 4.60618917
# GC GC 3.96169469
# order2_P19_GG order2_P19_GG 3.53127767
# T20 T20 3.30180408
# order1_P18_G order1_P18_G 3.30088818
# T8 T8 3.19994387
# T5 T5 2.85246217
# order1_P20_A order1_P20_A 2.80280539
# order2_P18_CC order2_P18_CC 2.74417456
# order2_IP_GG order2_IP_GG 2.57887285
# order2_P19_AG order2_P19_AG 2.50694387
# 70/30 split by row order (first 70% train, remainder test).
# The split point is computed instead of hard-coded; for the table used in
# this session nrow(feature.data)*0.7 was 28362.6, i.e. 28362 training rows.
# NOTE(review): rows are taken in file order, not sampled at random - confirm
# that the input is not sorted in a way that biases the split.
n.train <- floor(nrow(feature.data)*0.7)
# feature.input and feature.score are indexed with the same row numbers below,
# so they must have the same number of rows.
stopifnot(nrow(feature.score) == nrow(feature.input))
is.train <- seq_len(nrow(feature.input)) <= n.train
train.data <- feature.input[is.train,2:ncol(feature.input)]
test.data <- feature.input[!is.train,2:ncol(feature.input)]
feature.score.train <- feature.score[is.train,]
score.train <- feature.score.train$score
feature.score.test <- feature.score[!is.train,]
score.test <- feature.score.test$score
library(gbm)
gbm.train <- gbm(formula=score.train ~ ., data=train.data, distribution = "gaussian", n.trees = 500, shrinkage = 0.1,
interaction.depth = 3, bag.fraction = 0.2, train.fraction = 0.8,
n.minobsinnode = 10, cv.folds = 5, keep.data = TRUE,
verbose = FALSE, n.cores = 1)
best.iter <- gbm.perf(gbm.train, method = "OOB")
Yhat <- predict(gbm.train, newdata = test.data, n.trees = best.iter, type = "link")
pred <- data.frame(pred.score = Yhat, exp.score = score.test)
cor(pred$pred.score, pred$exp.score)
# 0.5166295
best.iter <- gbm.perf(gbm.train, method = "cv")
Yhat <- predict(gbm.train, newdata = test.data, n.trees = best.iter, type = "link")
pred <- data.frame(pred.score = Yhat, exp.score = score.test)
cor(pred$pred.score, pred$exp.score)
# 0.5199843
library(caret)
set.seed(998)
# Column 1 is the response (score.data); the remaining columns are predictors.
feature.score <- cbind(score.data, feature.data)
inTraining <- createDataPartition(feature.score$score.data, p = .75, list = FALSE)
# `training` keeps the response column because the formula below needs it;
# `testing` holds predictors only and `testing.score` the matching responses.
training <- feature.score[ inTraining,]
testing <- feature.score[-inTraining,2:ncol(feature.score)]
testing.score <- feature.score[-inTraining,1]
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## a single repeat
                           repeats = 1)
set.seed(825)
# Fit on the training partition only. Fitting on all of feature.score (as was
# done before) lets the model see the held-out rows, so the correlation on
# `testing` is inflated.
# NOTE: the caret summary and the correlation recorded in the comments after
# this block (40518 samples, 0.5272789) come from that earlier all-rows fit.
gbmFit1 <- train(score.data ~ ., data = training,
                 method = "gbm",
                 trControl = fitControl,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)
gbmFit1
# Stochastic Gradient Boosting
#
# 40518 samples
# 425 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 1 times)
# Summary of sample sizes: 32414, 32415, 32414, 32415, 32414
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.853050 0.1656330 8.122974
# 1 100 9.597826 0.1957858 7.861776
# 1 150 9.458520 0.2090743 7.710746
# 2 50 9.579993 0.1991883 7.848937
# 2 100 9.341788 0.2251052 7.588196
# 2 150 9.229095 0.2389049 7.467592
# 3 50 9.439818 0.2139744 7.696722
# 3 100 9.223693 0.2403652 7.459679
# 3 150 9.117179 0.2542953 7.343738
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing.score)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.5272789
feature.score <- cbind(score.data, feature.data)
library(caret)
set.seed(998)
inTraining <- createDataPartition(feature.score$score.data, p = .75, list = FALSE)
training <- feature.score[ inTraining,]
testing <- feature.score[-inTraining,]
fitControl <- trainControl(## 5-fold CV
method = "repeatedcv",
number = 5,
## repeated ten times
repeats = 10)
set.seed(825)
gbmFit1 <- train(score.data ~ ., data = training,
method = "gbm",
trControl = fitControl,
## This last option is actually one
## for gbm() that passes through
verbose = FALSE)
gbmFit1
# Stochastic Gradient Boosting
#
# 30390 samples
# 425 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 10 times)
# Summary of sample sizes: 24313, 24312, 24312, 24311, 24312, 24313, ...
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 9.884423 0.1610419 8.153556
# 1 100 9.628529 0.1913269 7.891812
# 1 150 9.493098 0.2048580 7.745396
# 2 50 9.617531 0.1941584 7.882704
# 2 100 9.383874 0.2196552 7.630267
# 2 150 9.267120 0.2343243 7.503221
# 3 50 9.472295 0.2109497 7.729821
# 3 100 9.259533 0.2365398 7.498470
# 3 150 9.159923 0.2492108 7.387490
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 150, interaction.depth =
# 3, shrinkage = 0.1 and n.minobsinnode = 10.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing$score.data)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.516909
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(ranger)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
seq <- read.delim("rbs.fullseq.txt", header=T, sep="\t")
score <- read.delim("rbs.fullseq.score.txt", header=T, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/sgRNA-cleavage-activity-prediction")
feature <- read.delim("ecoli.fullseq_feature.txt", header=T, sep=",", stringsAsFactors = F)
feature.id <- feature[,1:2]
colnames(feature.id) <- c("sgRNAID", "order1_IP_A")
library(dplyr)
library(tidyr)
feature.score <- inner_join(feature.id, score[,c(1,4)], by="sgRNAID")
colnames(feature)[1] <- "sgRNAID"
feature.input <- subset(feature, feature$sgRNAID %in% feature.score$sgRNAID)
feature.data <- feature.input[,2:ncol(feature.input)]
score.data <- feature.score$score
library(caret)
set.seed(998)
# Column 1 is the response (score.data); the remaining columns are predictors.
feature.score <- cbind(score.data, feature.data)
inTraining <- createDataPartition(feature.score$score.data, p = .75, list = FALSE)
# `training` keeps the response column because the formula below needs it;
# `testing` holds predictors only and `testing.score` the matching responses.
training <- feature.score[ inTraining,]
testing <- feature.score[-inTraining,2:ncol(feature.score)]
testing.score <- feature.score[-inTraining,1]
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## a single repeat
                           repeats = 1)
set.seed(825)
# Fit on the training partition only. Fitting on all of feature.score (as was
# done before) lets the model see the held-out rows, so the correlation on
# `testing` is inflated.
# NOTE: the caret summary and the correlation recorded in the comments after
# this block (1772 samples, 0.5517723) come from that earlier all-rows fit.
gbmFit1 <- train(score.data ~ ., data = training,
                 method = "gbm",
                 trControl = fitControl,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)
gbmFit1
# Stochastic Gradient Boosting
#
# 1772 samples
# 425 predictor
#
# No pre-processing
# Resampling: Cross-Validated (5 fold, repeated 1 times)
# Summary of sample sizes: 1417, 1419, 1418, 1417, 1417
# Resampling results across tuning parameters:
#
# interaction.depth n.trees RMSE Rsquared MAE
# 1 50 10.58964 0.08620767 8.708045
# 1 100 10.51578 0.09589164 8.604299
# 1 150 10.47572 0.10224479 8.515738
# 2 50 10.47385 0.10532671 8.585356
# 2 100 10.46078 0.10645355 8.484740
# 2 150 10.51399 0.10142116 8.478061
# 3 50 10.52873 0.09435998 8.585026
# 3 100 10.48298 0.10456291 8.474284
# 3 150 10.55031 0.09993223 8.513572
#
# Tuning parameter 'shrinkage' was held constant at a value of 0.1
#
# Tuning parameter 'n.minobsinnode' was held constant at a value of 10
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were n.trees = 100, interaction.depth =
# 2, shrinkage = 0.1 and n.minobsinnode = 10.
pred <- predict(gbmFit1, newdata = testing)
pred.df <- data.frame(pred.score = pred, exp.score = testing.score)
cor(pred.df$pred.score, pred.df$exp.score)
# 0.5517723
#python path/to/encode_sequences.py path/to/data.txt
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# python encode_sequences.py DataS1.txt
# python encode_sequences.py DataS1.rbs.txt
import os, sys
import numpy as np

# One-hot codes: 4 digits per single base, 16 digits per dinucleotide.
# Base order is A, C, T, G.
onehot_dict = {
    'A': '1000',
    'C': '0100',
    'T': '0010',
    'G': '0001',
    'AA': '1000000000000000',
    'AC': '0100000000000000',
    'AT': '0010000000000000',
    'AG': '0001000000000000',
    'CA': '0000100000000000',
    'CC': '0000010000000000',
    'CT': '0000001000000000',
    'CG': '0000000100000000',
    'TA': '0000000010000000',
    'TC': '0000000001000000',
    'TT': '0000000000100000',
    'TG': '0000000000010000',
    'GA': '0000000000001000',
    'GC': '0000000000000100',
    'GT': '0000000000000010',
    'GG': '0000000000000001',
}

# Output file suffix and title text, in the order encode_sequence() returns
# the corresponding feature strings.
_OUTPUTS = (
    ('_dependent1.txt', ': first-order position-dependent features'),
    ('_dependent2.txt', ': second-order position-dependent features'),
    ('_independent1.txt', ': first-order position-independent features'),
    ('_independent2.txt', ': second-order position-independent features'),
)


def encode_sequence(seq):
    """Encode one nucleotide sequence.

    Returns a 4-tuple of strings:
      (first-order position-dependent, second-order position-dependent,
       first-order position-independent, second-order position-independent).

    Raises KeyError for any base or dinucleotide not in ``onehot_dict``
    (the table only holds upper-case A/C/T/G).
    """
    # position-dependent features: concatenated one-hot codes
    pos_dep1 = ''.join([onehot_dict[base] for base in seq])
    pos_dep2 = ''.join([onehot_dict[seq[i:i+2]] for i in range(len(seq)-1)])
    # position-independent features: column sums over the one-hot codes
    # (dtype=int keeps the sums integral even when there are no positions)
    counts1 = np.array([int(o) for o in pos_dep1], dtype=int).reshape([-1, 4]).sum(axis=0)
    counts2 = np.array([int(o) for o in pos_dep2], dtype=int).reshape([-1, 16]).sum(axis=0)
    # NOTE(review): the counts are joined without a delimiter, so a count of
    # 10 or more occupies two characters and the string can no longer be split
    # back into columns character by character. Left unchanged because the
    # downstream commands in this file split on single characters.
    pos_indep1 = ''.join([str(c) for c in counts1])
    pos_indep2 = ''.join([str(c) for c in counts2])
    return pos_dep1, pos_dep2, pos_indep1, pos_indep2


def main(input_path):
    """Read a tab-separated table and write the four feature files.

    The first input line is treated as a title line; for every other
    non-blank line the first field is the ID and the last field the sequence.
    Output files are written next to the input, named from the input path
    without its extension plus ``_dependent1.txt``, ``_dependent2.txt``,
    ``_independent1.txt`` and ``_independent2.txt``.
    """
    stem = os.path.splitext(input_path)[0]
    with open(input_path, 'r') as input_file:
        outputs = []
        try:
            for suffix, _ in _OUTPUTS:
                outputs.append(open(stem + suffix, 'w'))
            # loop over nucleotide sequences
            for idx, line in enumerate(input_file):
                if idx == 0:
                    # title line: the input line keeps its newline, so each
                    # output starts with two lines (later steps skip both)
                    for out, (_, title) in zip(outputs, _OUTPUTS):
                        out.write(line + title + '\n')
                elif line.strip():
                    # strip the line ending explicitly: slicing off one
                    # character would eat the last base of a final line that
                    # has no newline and would leave '\r' on CRLF input
                    fields = line.rstrip('\r\n').split('\t')
                    seq = fields[-1]
                    for out, feature in zip(outputs, encode_sequence(seq)):
                        out.write(fields[0] + '\t' + feature + '\n')
                if idx % 10000 == 0:
                    print('{0:,}'.format(idx)+' lines processed...')
        finally:
            for out in outputs:
                out.close()
    print('Done!')


if __name__ == '__main__':
    main(sys.argv[1])
# separate nucleotide sequence values into individual columns in data frame so each position counts as one feature
# Split each encoded digit string into one column per character and write the
# result as a tab-separated table.
library(tidyr) # separate()
options(scipen = 999)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
# NOTE(review): only the first table is read with sep=" "; the others use the
# read.delim default (tab) - confirm this difference is intended.
onehot.ind1 <- read.delim("DataS1_independent1.txt", sep=" ", header=T)
onehot.ind2 <- read.delim("DataS1_independent2.txt")
onehot.dep1 <- read.delim("DataS1_dependent1.txt")
onehot.dep2 <- read.delim("DataS1_dependent2.txt")
# drop the first data row (the second title line written by the encoder)
onehot.ind1 <- onehot.ind1[2:nrow(onehot.ind1),]
onehot.ind2 <- onehot.ind2[2:nrow(onehot.ind2),]
onehot.dep1 <- onehot.dep1[2:nrow(onehot.dep1),]
onehot.dep2 <- onehot.dep2[2:nrow(onehot.dep2),]
onehot.ind1.sep <- separate(data=onehot.ind1, col=nucleotide.sequence, into=c(NA, "p1", "p2", "p3", "p4"), sep="")
prefix <- "p"
# For each table: build the column labels first, THEN prepend the NA
# placeholders. Previously ids2 was assembled before ids was refreshed, so the
# dep1 and dep2 tables were split with the stale 20-name label set.
# NOTE(review): the widths 20 / 87 / 303 are kept from the original; the
# encoder appears to emit 16 / 80 / 304 characters for 20-nt sequences - confirm.
suffix <- seq(1:20)
ids <- paste(prefix, suffix, sep="")
ids2 <- c(NA, NA, ids)
onehot.ind2.sep <- separate(data=onehot.ind2, col=nucleotide.sequence, into=ids2, sep="")
onehot.ind2.sep[is.na(onehot.ind2.sep)] <- 0
suffix <- seq(1:87)
ids <- paste(prefix, suffix, sep="")
ids2 <- c(NA, NA, ids)
onehot.dep1.sep <- separate(data=onehot.dep1, col=nucleotide.sequence, into=ids2, sep="")
onehot.dep1.sep[is.na(onehot.dep1.sep)] <- 0
suffix <- seq(1:303)
ids <- paste(prefix, suffix, sep="")
ids2 <- c(NA, NA, ids)
onehot.dep2.sep <- separate(data=onehot.dep2, col=nucleotide.sequence, into=ids2, sep="")
onehot.dep2.sep[is.na(onehot.dep2.sep)] <- 0
write.table(onehot.ind1.sep, "DataS1_independent1_sep.txt", quote=F, row.names=F, sep="\t")
write.table(onehot.ind2.sep, "DataS1_independent2_sep.txt", quote=F, row.names=F, sep="\t")
write.table(onehot.dep1.sep, "DataS1_dependent1_sep.txt", quote=F, row.names=F, sep="\t")
write.table(onehot.dep2.sep, "DataS1_dependent2_sep.txt", quote=F, row.names=F, sep="\t")
## proper method using unix
# Each DataS1_{independent,dependent}{1,2}.txt file starts with two title
# lines followed by "<sgRNAID><TAB><digit string>" rows. The helpers below
# turn every digit into its own space-separated column under a header row.

#######################################
# Print a header "sgRNAID p1.<sym> ... pN.<sym>".
# Arguments: $1 - number of positions; $2.. - symbols per position
# Outputs:   the header line on stdout
#######################################
position_header() {
  local npos=$1
  shift
  local out=sgRNAID pos sym
  for ((pos = 1; pos <= npos; pos++)); do
    for sym in "$@"; do
      out+=" p${pos}.${sym}"
    done
  done
  printf '%s\n' "$out"
}

#######################################
# Split the digit string in column 2 into one column per character.
# Arguments: $1 - input file (first two lines are skipped)
#            $2 - header line to emit first
#            $3 - number of space-separated fields to keep (ID + features)
# Outputs:   header + rows on stdout; the header is printed even when the
#            input has no data rows
#######################################
onehot_to_columns() {
  local infile=$1 header=$2 nfields=$3
  sed '1,2d' "$infile" \
    | awk -v hdr="$header" 'BEGIN { print hdr } { gsub(/./, "& ", $2); print $0 }' \
    | cut -d ' ' -f "1-${nfields}"
}

bases=(A C T G)
dinucs=(AA AC AT AG CA CC CT CG TA TC TT TG GA GC GT GG)
guide_len=20 # sgRNA length in DataS1

onehot_to_columns DataS1_independent1.txt "sgRNAID ${bases[*]}" 5 > DataS1_ind1.txt
onehot_to_columns DataS1_independent2.txt "sgRNAID ${dinucs[*]}" 17 > DataS1_ind2.txt
# dep1 carries 4 columns per position: ID + 80 fields (cutting at 21 kept only
# the first five positions).
onehot_to_columns DataS1_dependent1.txt \
  "$(position_header "$guide_len" "${bases[@]}")" \
  "$((1 + 4 * guide_len))" > DataS1_dep1.txt
# dep2 carries 16 columns per dinucleotide and there are guide_len-1
# dinucleotides: ID + 304 fields, so the header stops at p19.
onehot_to_columns DataS1_dependent2.txt \
  "$(position_header "$((guide_len - 1))" "${dinucs[@]}")" \
  "$((1 + 16 * (guide_len - 1)))" > DataS1_dep2.txt
–> go back and add to previous models
https://datatricks.co.uk/one-hot-encoding-in-r-three-simple-methods https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/ https://2-bitbio.com/2018/06/one-hot-encode-dna-sequence-using.html - The class hot_dna takes a fasta as argument. The first chunk will check for and store the sequence name (anything between ‘>’ and newline). Then the sequence is converted to an array for integer encoding. The integer encoding is carried out using LabelEncoder(). Next, the integer encoded DNA is one hot encoded using OneHotEncoder(). Finally, these encodings and the original sequence along with its name get loaded as attributes.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
python
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import re
class hot_dna:
    """Integer- and one-hot-encode a single DNA sequence.

    ``fasta`` is a string: either one FASTA record (a header line containing
    '>' followed by one or more sequence lines) or a bare sequence.

    Attributes set on the instance:
      name     - the header line as given (or 'unknown_sequence')
      sequence - the input string exactly as passed in
      integer  - column vector of integer labels from LabelEncoder
      onehot   - one-hot matrix from OneHotEncoder

    NOTE(review): both encoders are fitted on this one sequence, so the label
    numbering and the number of one-hot columns appear to depend on which
    characters occur in it - confirm before comparing encodings of different
    sequences.

    Raises ValueError when no sequence characters are found.
    """
    def __init__(self,fasta):
        #output_file = open('nucleotide_counts.tsv','w')
        #with open('ecoli.1kb.fa', 'r') as fasta:
        #check for and grab sequence name
        if re.search(">",fasta):
            lines = re.split("\n",fasta)
            name = lines[0]
            # join every sequence line of the first record (a wrapped FASTA
            # record spans several lines); stop at the next header
            seq_lines = []
            for line in lines[1:]:
                if line.startswith(">"):
                    break
                seq_lines.append(line.strip())
            sequence = "".join(seq_lines)
        else :
            name = 'unknown_sequence'
            sequence = fasta
        if not sequence:
            raise ValueError("no sequence found in input")
        #get sequence into an array
        seq_array = array(list(sequence))
        #integer encode the sequence
        label_encoder = LabelEncoder()
        integer_encoded_seq = label_encoder.fit_transform(seq_array)
        #one hot the sequence
        onehot_encoder = OneHotEncoder(sparse=False)
        #reshape because that's what OneHotEncoder likes
        integer_encoded_seq = integer_encoded_seq.reshape(len(integer_encoded_seq), 1)
        onehot_encoded_seq = onehot_encoder.fit_transform(integer_encoded_seq)
        #add the attributes to self
        self.name = name
        self.sequence = fasta
        self.integer = integer_encoded_seq
        self.onehot = onehot_encoded_seq
# output_file.write(self.name)
# output_file.write(self.onehot)
#output_file.close()
#exit()
# Read the FASTA text. Mode 'r' is essential: opening with 'w' truncates
# onehot.fa, and hot_dna expects the text, not a file object.
with open('onehot.fa', 'r') as handle:
    fasta = handle.read()

my_hottie = hot_dna(fasta)
# http://www.dalkescientific.com/writings/NBN/parsing.html
# https://www.kaggle.com/thomasnelson/working-with-dna-sequence-data-for-ml
from Bio import SeqIO
for seq_record in SeqIO.parse('onehot.fa', "fasta"):
print(seq_record.id)
print(seq_record.seq)
# converts to lower case, changes any non 'acgt' characters to 'z'
import numpy as np
import re
def string_to_array(my_string):
my_string = my_string.lower()
my_string = re.sub('[^acgt]', 'z', my_string)
my_array = np.array(list(my_string))
return my_array
# label encoder fitted on the alphabet 'acgtz' ('z' is the placeholder
# string_to_array() substitutes for any non-acgt character)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(np.array(['a','c','g','t','z']))
# returns a float numpy vector with a=0.25, c=0.50, g=0.75, t=1.00 and
# z (anything else) = 0.00
def ordinal_encoder(my_array):
    """Map an array of 'a'/'c'/'g'/'t'/'z' characters to ordinal floats."""
    codes = label_encoder.transform(my_array)
    # index = label code (a=0, c=1, g=2, t=3, z=4)
    ordinal_values = np.array([0.25, 0.50, 0.75, 1.00, 0.00])
    return ordinal_values[codes]
# non 'acgt' bases (encoded as 'z') come out as the all-zero row 0000
# returns a L x 4 numpy array
from sklearn.preprocessing import OneHotEncoder
def one_hot_encoder(my_array):
    """One-hot encode an array of 'a'/'c'/'g'/'t'/'z' characters.

    Five columns are produced (one per label code) and the last one, the
    'z' column, is dropped, leaving columns for a, c, g, t.
    """
    codes = label_encoder.transform(my_array)
    code_column = codes.reshape(len(codes), 1)
    # NOTE(review): `n_values` exists in the scikit-learn version this
    # environment pins; newer releases use `categories` instead - confirm
    # before upgrading.
    encoder = OneHotEncoder(sparse=False, dtype=int, n_values=5)
    five_columns = encoder.fit_transform(code_column)
    return np.delete(five_columns, -1, 1)
# quick check of both encoders on a literal sequence
test_sequence = 'AACGCGGTTNN'
ordinal_encoder(string_to_array(test_sequence))
one_hot_encoder(string_to_array(test_sequence))
from Bio import SeqIO
for seq_record in SeqIO.parse('onehot.fa', "fasta"):
    # Encode the sequence of the record being visited. The loop used to
    # pass `fasta` (the whole-file handle/text from earlier) on every
    # iteration, so the records themselves were never encoded.
    record_array = string_to_array(str(seq_record.seq))
    ordinal_encoder(record_array)
    # NOTE(review): the original repeated the ordinal_encoder call here;
    # presumably one_hot_encoder was intended (as in the test above) - confirm.
    ordinal_encoder(record_array)
# scp /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/protein_rna_dna-vector_lee_nucleotide_dna_data.txt noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/
# For each table: drop the header row, put a space after every character
# of column 2 (one base per field), add a header naming the 20 positions
# and keep the ID plus the first 20 base fields.
position_header='sgRNAID p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 p16 p17 p18 p19 p20'
for table in DataS1 DataS1.rbs; do
  sed '1d' "${table}.txt" \
    | awk '{gsub(/./,"& ",$2);print $0}' \
    | sed "1i ${position_header}" \
    | cut -d ' ' -f 1-21 > "${table}_sequence.txt"
done
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# Attach per-base values from the "lee nucleotide" table to every position
# of every sgRNA and write the result in long and wide form.
library(dplyr)
library(reshape2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/")
# tensor: tab-separated table; column 1 is used as the row label below
tensor <- read.delim("protein_rna_dna-vector_lee_nucleotide_dna_data.txt", header=T, sep="\t", stringsAsFactors = F)
# seq: space-separated table written by the sed/awk step (sgRNAID, p1..p20)
seq <- read.delim("DataS1.rbs_sequence.txt", header=T, sep=" ", stringsAsFactors = F)
# NOTE(review): tensor.features is not used again in this snippet
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
# keep columns 2-5 and transpose so that each of those columns becomes a row
tensor.df <- tensor[,2:5]
tensor.t <- as.data.frame(t(tensor.df))
# assumes columns 2-5 of the input are ordered A, C, G, T - TODO confirm
tensor.t$base <- c("A", "C", "G", "T")
rownames(seq) <- seq[,1]
# NOTE(review): seq.df is not used again in this snippet
seq.df <- seq[,2:21]
# long format: one row per sgRNA x position
seq.melt <- melt(seq, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# add the per-base values to each position by matching on the base letter
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# wide format: one row per sgRNA, one column per position_variable pair
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "ecoli.sgRNA.thermal.tensors.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "ecoli.sgRNA.thermal.tensors.melt.txt", quote=F, row.names=F, sep="\t")
https://github.com/gmarcais/Jellyfish http://www.genome.umd.edu/docs/JellyfishUserGuide.pdf
conda install -c conda-forge jellyfish
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli
jellyfish count -m 21 -s 100M -t 10 -C ecoli.gRNA.fasta
# on andes
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/genome
#/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --lab --name noshayjm --login_port 2458
/gpfs/alpine/syb105/proj-shared/piet/git/codebase/rhea/submit-2-jupyter --lab --name noshayjm --login_port 2458 --loadenv /gpfs/alpine/syb105/proj-shared/Projects/conda-environments/andes-base
# on local computer
ssh -f -L 127.0.0.1:8080:127.0.0.1:2458 noshayjm@dtn.ccs.ornl.gov $HOME/.connect
OLCF Jupyter Notebook Overview: https://docs.olcf.ornl.gov/services_and_applications/jupyter/overview.html#jupyter-at-olcf
Login: https://jupyter.olcf.ornl.gov/user/noshayjm/lab/workspaces/auto-r
Running R on Jupyter: https://datatofish.com/r-jupyter-notebook/
#conda install -c r r-irkernel
# https://docs.olcf.ornl.gov/services_and_applications/jupyter/overview.html#jupyter-at-olcf
# in Terminal
echo $HOME
#/ccs/home/noshayjm
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
source activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
python -m ipykernel install --user --name test_python --display-name test
install.packages('IRkernel')
IRkernel::installspec(name = 'test_R', displayname = 'test', user=TRUE)
input fasta file of sgRNA sequence
code raw value identification for melting temp, gc content, one-hot (independent/dependent), RNA structure (iFeature/RNA-SPOT)
reference assembly –> run blast to identify target region… pull 200bp sequence up and down from target… determine gc content, structure, gene density, any additional data available (RNA-seq, WGBS, etc)
run wavelet transform
compile into large matrix of features for each sgRNA id
random forest / prediction correlation
need to get PAM sequence?
https://biopython.org/docs/1.75/api/Bio.SeqUtils.html http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec119
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
python
import os
import sys
import numpy as np
from Bio import SeqIO
from Bio.SeqUtils import GC
from Bio.SeqUtils import molecular_weight
from Bio.SeqUtils import nt_search
from Bio.SeqUtils import MeltingTemp
import Bio.SeqUtils.MeltingTemp as Tm
import os, sys
import numpy as np
# One-hot lookup table: a single base maps to 4 digits (order A, C, T, G),
# a dinucleotide maps to 16 digits (order AA, AC, AT, AG, CA, ... GG).
onehot_dict = {
    'A': '1000',
    'C': '0100',
    'T': '0010',
    'G': '0001',
    'AA': '1000000000000000',
    'AC': '0100000000000000',
    'AT': '0010000000000000',
    'AG': '0001000000000000',
    'CA': '0000100000000000',
    'CC': '0000010000000000',
    'CT': '0000001000000000',
    'CG': '0000000100000000',
    'TA': '0000000010000000',
    'TC': '0000000001000000',
    'TT': '0000000000100000',
    'TG': '0000000000010000',
    'GA': '0000000000001000',
    'GC': '0000000000000100',
    'GT': '0000000000000010',
    'GG': '0000000000000001',
}
# open input and output files
# The output name is the input path minus its last four characters
# (e.g. ".txt") plus "_features.txt".
input_path = sys.argv[1]
# `with` closes both files even when a line fails to encode (e.g. a
# KeyError for a base that is not in onehot_dict).
with open(input_path, 'r') as input_file, \
        open(input_path[:-4]+'_features.txt', 'w') as feature_file:
    # loop over nucleotide sequences
    for idx, line in enumerate(input_file):
        # split line by tab
        line = line.split('\t')
        # extract sequence: strip only the line terminator. The previous
        # [:-1] removed the last base when the final line had no trailing
        # newline and left a stray '\r' on CRLF files.
        seq = line[-1].rstrip('\r\n')
        # compute position-dependent features as one-hot vectors
        pos_dep1 = ''.join([onehot_dict[seq[i]] for i in range(len(seq))])
        pos_dep2 = ''.join([onehot_dict[seq[i:i+2]] for i in range(len(seq)-1)])
        # compute position-independent features as sum over position-dependent features
        pos_indep1 = list(np.array([int(o) for o in pos_dep1]).reshape([-1, 4]).sum(axis=0))
        pos_indep2 = list(np.array([int(o) for o in pos_dep2]).reshape([-1, 16]).sum(axis=0))
        # NOTE(review): the counts are concatenated without a separator, so
        # a count of 10 or more occupies two characters and the string can
        # no longer be split back into one value per character.
        pos_indep1 = ''.join([str(p) for p in pos_indep1])
        pos_indep2 = ''.join([str(p) for p in pos_indep2])
        # write features to file
        feature_file.write(line[0] + '\t' + pos_dep1 + '\t' + pos_dep2 + '\t' + pos_indep1 + '\t' + pos_indep2 + '\n')
        if idx % 10000 == 0:
            print('{0:,}'.format(idx)+' lines processed...')
print('Done!')
# onehot
def Vector_feature_to_Value_feature(position_feature_depedent_Dic,basepairLst):
    """Flatten {position: vector} into {'<position>_<label>': value}.

    For every position key, element i of its vector is stored under the
    key built from the position and the i-th entry of basepairLst.
    """
    return {
        '%s_%s' % (position, label): vector[index]
        for position, vector in position_feature_depedent_Dic.items()
        for index, label in enumerate(basepairLst)
    }
#######################
#this function is used to extract the Order1 feature
def Order1(sequence):
    """Single-base features of `sequence`.

    Returns one dict holding
      'order1_P<pos>_<base>' -- 1.0/0.0 one-hot value per 1-based position
                                and base (base order A, T, C, G), and
      'order1_IP_<base>'     -- how many times each base occurs.
    Characters other than A/T/C/G add nothing for their position.
    """
    order=1
    baseLst=['A','T','C','G']
    baseDic={}
    position_independentDic={}
    position_dependentDic={}
    #generate the Base and its vector.
    for i,base in enumerate(baseLst):
        baseDic[base]=np.zeros(4**order)
        baseDic[base][i]=1
        position_independentDic['order1_IP_%s'%(base)]=0
    #extract the seq feature
    for i,base in enumerate(sequence):
        if base in baseDic:
            position_dependentDic['order1_P%s'%(i+1)]=baseDic[base]
            position_independentDic['order1_IP_%s'%(base)]+=1
    position_dependentDic=Vector_feature_to_Value_feature(position_dependentDic,baseLst)
    # Merge with copy + update: `a.items()+b.items()` only works with the
    # list-returning items() of Python 2 and raises TypeError on Python 3.
    Order1_positionDic=dict(position_dependentDic)
    Order1_positionDic.update(position_independentDic)
    return Order1_positionDic
############################
#this function is used to extract the Order2 feature
def Order2(sequence):
    """Dinucleotide features of `sequence`.

    Returns one dict holding
      'order2_P<pos>_<pair>' -- 1.0/0.0 one-hot value per 1-based start
                                position and dinucleotide, and
      'order2_IP_<pair>'     -- how many times each dinucleotide occurs.
    The 16 pairs are enumerated from the base order A, T, C, G. Pairs
    containing any other character add nothing for their position.
    """
    order=2
    baseLst=['A','T','C','G']
    BasepairLst=[base1+base2 for base1 in baseLst for base2 in baseLst]
    BasepairDic={}
    position_dependentDic={}
    position_independentDic={}
    #generate the Basepair and its vector that contain 0 or 1.
    for i,basepair in enumerate(BasepairLst):
        BasepairDic[basepair]=np.zeros(4**order)
        BasepairDic[basepair][i]=1
        position_independentDic['order2_IP_%s'%(basepair)]=0
    #extract the sequence feature
    for i in range(len(sequence)-1):
        seq_pair=sequence[i:i+2]
        if seq_pair in BasepairDic:
            position_dependentDic['order2_P%s'%(i+1)]=BasepairDic[seq_pair]
            position_independentDic['order2_IP_%s'%(seq_pair)]+=1
    position_dependentDic=Vector_feature_to_Value_feature(position_dependentDic,BasepairLst)
    # Merge with copy + update: `a.items()+b.items()` only works with the
    # list-returning items() of Python 2 and raises TypeError on Python 3.
    Order2_positonDic=dict(position_dependentDic)
    Order2_positonDic.update(position_independentDic)
    return Order2_positonDic
#######################
# Melting temp
def Temper(sequence):
    """Tm.Tm_staluc values for the sequence and three slices of it.

    Keys: 'T20' (whole sequence), 'T7' (sequence[:7]), 'T8'
    (sequence[7:15]) and 'T5' (sequence[15:20]).
    """
    segments=(
        ('T20', sequence),
        ('T7', sequence[:7]),
        ('T8', sequence[7:15]),
        ('T5', sequence[15:20]),
    )
    TDic={}
    for label, segment in segments:
        TDic[label]=Tm.Tm_staluc(segment)
    return TDic
#######################
# RNA structure
def Struc(sequence):
    """Run iFeature.py on the fixed ecoli.gRNA.rbs.fasta file.

    Returns whatever os.system() returns for that command (the command's
    exit status), not the computed features; those are written by
    iFeature to the --out path.
    NOTE(review): `sequence` is not used - the command always processes
    the hard-coded FASTA file.
    """
    # `struc.out = ...` tried to set an attribute on the undefined name
    # `struc` (NameError); use a plain local variable instead.
    struc_out=os.system('python /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/iFeature/iFeature.py --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.gRNA.rbs.fasta --type AAC --out /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/e.coli/ecoli.gRNA.rbs.structure.txt')
    return struc_out
#######################
#this function is used to extract the all features of sgRNA sequence!
def feature(sequence):
    """Collect all per-sgRNA features.

    Returns (seq_feature_name, seq_feature):
      seq_feature      -- dict with the Order1, Order2 and Temper features
                          plus 'GC' (GC(sequence) divided by 100)
      seq_feature_name -- the keys in output order: sorted Order1 keys,
                          sorted Order2 keys, sorted Temper keys, then 'GC'
    """
    Order1Position=Order1(sequence)
    Order2Position=Order2(sequence)
    Temprature=Temper(sequence)
    # Struc() is still invoked (it launches iFeature), but its return value
    # is not added to the feature dict.
    Structure=Struc(sequence)
    # Merge with update(): `a.items()+b.items()` raises TypeError on
    # Python 3, where items() returns views rather than lists.
    seq_feature={}
    for part in (Order1Position, Order2Position, Temprature):
        seq_feature.update(part)
    seq_feature['GC']=float(GC(sequence))/100
    seq_feature_name=sorted(Order1Position.keys())+sorted(Order2Position.keys())+sorted(Temprature.keys())
    seq_feature_name.append('GC')
    return seq_feature_name,seq_feature
######################################################################
def get_feature_main():
    """Command-line entry point: write a CSV of features for every sgRNA.

    sys.argv[1] -- FASTA file of sgRNA sequences
    sys.argv[2] -- output prefix; the table goes to '<prefix>_feature.txt'

    The first CSV column is the record id; the remaining columns are the
    feature names returned by feature() for the last record processed.
    """
    sgRNA_fasta_file=sys.argv[1]
    sgRNAfasta=SeqIO.parse(sgRNA_fasta_file,"fasta")
    prefix=sys.argv[2]
    sgRNAfastaDic={}
    sgrnaFeatureDic={}
    ###########################################
    # check every sgRNA in the fasta file
    for rec in sgRNAfasta:
        sgRNAfastaDic[rec.id]=str(rec.seq).upper()
        sgrnaFeatureDic[rec.id]={}
    ###########################################
    featureName=[]
    for sgrna_name in sgRNAfastaDic:
        # NOTE(review): [1:20] yields 19 bases (indices 1..19) while Temper()
        # slices up to index 20 - looks like an off-by-one, confirm against
        # the layout of the input records.
        sgrnaSequence=sgRNAfastaDic[sgrna_name][1:20]
        # NOTE(review): computed but not consumed - feature() currently has
        # no parameter for the PAM context.
        NGGNSequence=sgRNAfastaDic[sgrna_name][21]+sgRNAfastaDic[sgrna_name][24]
        # feature() accepts a single argument; passing NGGNSequence as a
        # second one raised TypeError.
        featureName,featureValue=feature(sgrnaSequence)
        sgrnaFeatureDic[sgrna_name]=featureValue
    output='%s_feature.txt'%(prefix)
    # open(..., 'w') already truncates the file, so the former
    # os.system('cat /dev/null >...') call (which also passed the prefix
    # through a shell unquoted) is not needed.
    with open(output,'w') as f1:
        f1.write(','.join(['sgRNAID']+list(featureName))+'\n')
        for sgrna_name in sgRNAfastaDic:
            row=[sgrna_name]
            for featureID in featureName:
                row.append('%s'%(sgrnaFeatureDic[sgrna_name][featureID]))
            f1.write(','.join(row)+'\n')
if __name__ =='__main__':
    try:
        get_feature_main()
    except KeyboardInterrupt:
        sys.stderr.write("Interrupted.\n")
        sys.exit(0)
https://docs.python.org/3/library/re.html
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
python
import os
import sys
import numpy as np
import re
from Bio import SeqIO
########################
*** How to find reference coordinates of sgRNA and pull surrounding sequence to obtain features and do wavelet transformations
*** Additional to find PAM (NGGN sequence)?? Look XXbp upstream/downstream
# find sequence in reference to obtain coordinates, pull sequence, and calculate features for 200bp upstream and downstream of sgRNA as well as generate wavelets...
def coord(sequence):
    """Search the module-level `reference` for `sequence` with nt_search.

    Returns the nt_search result unchanged.
    NOTE(review): `reference` and `nt_search` must already be defined in
    the session (nt_search is imported from Bio.SeqUtils in an earlier
    snippet; this one does not import it).
    """
    hits=nt_search(reference, sequence)
    # TODO: derive the upstream/downstream windows (e.g. 200 bp either
    # side) from the hit positions. The original stub stopped at a dangling
    # `coord.up=`, which is a syntax error, and returned nothing.
    return hits
from Bio.Blast import NCBIWWW
from Bio import SeqIO
result_handle = NCBIWWW.qblast("blastn", "nt", sequence)
import re
# Toy example: find `seq` inside `refseq` and cut it out together with up
# to 2 flanking bases on each side (clamped to the ends of refseq).
seq = "ATCG"
refseq = "GGAACTGCGAATCGGGAT"
find_in_seq = re.search(seq, refseq)
start_position = find_in_seq.start()
end_position = find_in_seq.end()
window_start = max(start_position - 2, 0)
window_end = min(start_position + len(seq) + 2, len(refseq))
subSeq = refseq[window_start:window_end]
# how to read in a reference fasta and sgRNA fasta...
list of Toxic guides, sequence, and log2fc scores
list of full guides and sequence
identify patterns within the toxic guides (one-hot encoding to start) and run a RF analysis
### onehot encoding
import os, sys
import numpy as np
# One-hot lookup table for RNA: a single base maps to 4 digits (order
# A, C, U, G), a dinucleotide maps to 16 digits (order AA, AC, AU, AG,
# CA, ... GG).
onehot_dict = {
    'A': '1000',
    'C': '0100',
    'U': '0010',
    'G': '0001',
    'AA': '1000000000000000',
    'AC': '0100000000000000',
    'AU': '0010000000000000',
    'AG': '0001000000000000',
    'CA': '0000100000000000',
    'CC': '0000010000000000',
    'CU': '0000001000000000',
    'CG': '0000000100000000',
    'UA': '0000000010000000',
    'UC': '0000000001000000',
    'UU': '0000000000100000',
    'UG': '0000000000010000',
    'GA': '0000000000001000',
    'GC': '0000000000000100',
    'GU': '0000000000000010',
    'GG': '0000000000000001',
}
# open input and output files
# Output names are the input path minus its last four characters
# (e.g. ".txt") plus a per-table suffix.
input_path = sys.argv[1]
# `with` closes all five files even when a line fails to encode (e.g. a
# KeyError for a base that is not in onehot_dict).
with open(input_path, 'r') as input_file, \
        open(input_path[:-4]+'_dependent1.txt', 'w') as dep1_file, \
        open(input_path[:-4]+'_dependent2.txt', 'w') as dep2_file, \
        open(input_path[:-4]+'_independent1.txt', 'w') as indep1_file, \
        open(input_path[:-4]+'_independent2.txt', 'w') as indep2_file:
    # loop over nucleotide sequences
    for idx, line in enumerate(input_file):
        # if first iteration, write title line
        if idx == 0:
            # `line` still ends with its newline here, so the title takes
            # up two output lines; the downstream shell step removes both
            # with two `sed '1d'` calls. Kept as is for compatibility.
            dep1_file.write(line+': first-order position-dependent features'+ '\n')
            dep2_file.write(line+': second-order position-dependent features'+ '\n')
            indep1_file.write(line+': first-order position-independent features'+ '\n')
            indep2_file.write(line+': second-order position-independent features'+ '\n')
        # otherwise encode sequence
        else:
            # split line by tab
            line = line.split('\t')
            # extract sequence: strip only the line terminator. The
            # previous [:-1] removed the last base when the final line had
            # no trailing newline and left a stray '\r' on CRLF files.
            seq = line[-1].rstrip('\r\n')
            # compute position-dependent features as one-hot vectors
            pos_dep1 = ''.join([onehot_dict[seq[i]] for i in range(len(seq))])
            pos_dep2 = ''.join([onehot_dict[seq[i:i+2]] for i in range(len(seq)-1)])
            # compute position-independent features as sum over position-dependent features
            pos_indep1 = list(np.array([int(o) for o in pos_dep1]).reshape([-1, 4]).sum(axis=0))
            pos_indep2 = list(np.array([int(o) for o in pos_dep2]).reshape([-1, 16]).sum(axis=0))
            # NOTE(review): the counts are concatenated without a
            # separator, so a count of 10 or more occupies two characters
            # and a per-character split of these strings misaligns.
            pos_indep1 = ''.join([str(p) for p in pos_indep1])
            pos_indep2 = ''.join([str(p) for p in pos_indep2])
            # write features to file
            dep1_file.write(line[0] + '\t' + pos_dep1 + '\n')
            dep2_file.write(line[0] + '\t' + pos_dep2 + '\n')
            indep1_file.write(line[0] + '\t' + pos_indep1 + '\n')
            indep2_file.write(line[0] + '\t' + pos_indep2 + '\n')
        if idx % 10000 == 0:
            print('{0:,}'.format(idx)+' lines processed...')
print('Done!')
#python path/to/encode_sequences.py path/to/data.txt
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide/
# Derive the helper tables from the toxic-guide table: columns 1-2, the
# name column alone, and the first 20 characters of column 2. Then pair
# names with the 20-character sequences and one-hot encode that table.
toxic_table=20210920_lib4_toxic_transcript
cut -f 1,2 "${toxic_table}.txt" > "${toxic_table}.noscore.txt"
cut -f 1 "${toxic_table}.txt" > "${toxic_table}_names.txt"
cut -f 2 "${toxic_table}.txt" \
  | cut -c-20 > "${toxic_table}_guideseq.txt"
paste "${toxic_table}_names.txt" "${toxic_table}_guideseq.txt" > lib4_toxic_transcript_20bp.txt
python ../e.coli/encode_sequences_RNA.py lib4_toxic_transcript_20bp.txt
# separate nucleotide sequence values into individual columns in data frame so each position counts as one feature
# encode_sequences_RNA.py writes its *_dependentN.txt / *_independentN.txt
# tables next to its input, i.e. into toxic.guide/, and the R code reads
# the *_ind/_dep tables from toxic.guide/ as well - so work there, not in
# e.coli/.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide/

# Build the header rows instead of spelling them out. The hand-written
# second-order header was so long that it ended up split across two lines,
# which ends sed's `1i` text early and turns the remainder into an invalid
# sed command. The generated strings match the intended one-line literals
# (base order A C T G, positions p1..p20).
onehot_bases=(A C T G)
ind2_header='sgRNAID'
for b1 in "${onehot_bases[@]}"; do
  for b2 in "${onehot_bases[@]}"; do
    ind2_header+=" ${b1}${b2}"
  done
done
dep1_header='sgRNAID'
dep2_header='sgRNAID'
for pos in {1..20}; do
  for b1 in "${onehot_bases[@]}"; do
    dep1_header+=" p${pos}.${b1}"
    for b2 in "${onehot_bases[@]}"; do
      dep2_header+=" p${pos}.${b1}${b2}"
    done
  done
done

# First-order position-independent counts (A C T G). This table used to be
# cut from the *dependent1* file, which put the one-hot code of position 1
# under the count headers. The counts are taken straight from the 20-mer
# table (skipping its header row) because the encoder's independent1
# strings concatenate counts without a separator and cannot be split per
# character once a count reaches 10. T and U are counted together, in the
# encoder's third slot.
awk 'NR > 1 {
  s = $2
  a = gsub(/A/, "", s)
  c = gsub(/C/, "", s)
  t = gsub(/[TU]/, "", s)
  g = gsub(/G/, "", s)
  print $1, a, c, t, g
}' lib4_toxic_transcript_20bp.txt \
  | sed '1i sgRNAID A C T G' > lib4_toxic_transcript_20bp_ind1.txt

# Second-order position-independent counts: drop the two title lines,
# split the digit string into one field per character, add the header.
# NOTE(review): a dinucleotide count of 10 or more would misalign this
# per-character split.
sed '1d' lib4_toxic_transcript_20bp_independent2.txt \
  | sed '1d' \
  | awk '{gsub(/./,"& ",$2);print $0}' \
  | sed "1i ${ind2_header}" \
  | cut -d ' ' -f 1-17 > lib4_toxic_transcript_20bp_ind2.txt

# First-order position-dependent one-hot columns (20 positions x 4 bases).
sed '1d' lib4_toxic_transcript_20bp_dependent1.txt \
  | sed '1d' \
  | awk '{gsub(/./,"& ",$2);print $0}' \
  | sed "1i ${dep1_header}" \
  | cut -d ' ' -f 1-81 > lib4_toxic_transcript_20bp_dep1.txt

# Second-order position-dependent one-hot columns. The header names 20
# positions, as before; the R step keeps only the first 305 columns.
sed '1d' lib4_toxic_transcript_20bp_dependent2.txt \
  | sed '1d' \
  | awk '{gsub(/./,"& ",$2);print $0}' \
  | sed "1i ${dep2_header}" \
  | cut -d ' ' -f 1-321 > lib4_toxic_transcript_20bp_dep2.txt
# repeat with all transcripts
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide/
# Same preparation for the full guide table: keep columns 1, 2 and 4 of
# the log2FC table, derive the name / 20-character sequence helper tables
# from it, pair them and one-hot encode the result.
all_guides=20210922_Lib4_transcript_log2FC_CJ019_12h
cut -f 1,2,4 20210922_Lib4_transcript_log2FC.txt > "${all_guides}.txt"
cut -f 1,2 "${all_guides}.txt" > "${all_guides}.noscore.txt"
cut -f 1 "${all_guides}.txt" > "${all_guides}_names.txt"
cut -f 2 "${all_guides}.txt" \
  | cut -c-20 > "${all_guides}_guideseq.txt"
paste "${all_guides}_names.txt" "${all_guides}_guideseq.txt" > lib4_transcript_20bp.txt
python ../e.coli/encode_sequences_RNA.py lib4_transcript_20bp.txt
# Split the encoder output for all guides into one column per feature.
# Build the header rows instead of spelling them out. The hand-written
# second-order header was so long that it ended up split across two lines,
# which ends sed's `1i` text early and turns the remainder into an invalid
# sed command. The generated strings match the intended one-line literals
# (base order A C T G, positions p1..p20).
onehot_bases=(A C T G)
ind2_header='sgRNAID'
for b1 in "${onehot_bases[@]}"; do
  for b2 in "${onehot_bases[@]}"; do
    ind2_header+=" ${b1}${b2}"
  done
done
dep1_header='sgRNAID'
dep2_header='sgRNAID'
for pos in {1..20}; do
  for b1 in "${onehot_bases[@]}"; do
    dep1_header+=" p${pos}.${b1}"
    for b2 in "${onehot_bases[@]}"; do
      dep2_header+=" p${pos}.${b1}${b2}"
    done
  done
done

# First-order position-independent counts (A C T G). This table used to be
# cut from the *dependent1* file, which put the one-hot code of position 1
# under the count headers. The counts are taken straight from the 20-mer
# table (skipping its header row) because the encoder's independent1
# strings concatenate counts without a separator and cannot be split per
# character once a count reaches 10. T and U are counted together, in the
# encoder's third slot.
awk 'NR > 1 {
  s = $2
  a = gsub(/A/, "", s)
  c = gsub(/C/, "", s)
  t = gsub(/[TU]/, "", s)
  g = gsub(/G/, "", s)
  print $1, a, c, t, g
}' lib4_transcript_20bp.txt \
  | sed '1i sgRNAID A C T G' > lib4_transcript_20bp_ind1.txt

# Second-order position-independent counts: drop the two title lines,
# split the digit string into one field per character, add the header.
# NOTE(review): a dinucleotide count of 10 or more would misalign this
# per-character split.
sed '1d' lib4_transcript_20bp_independent2.txt \
  | sed '1d' \
  | awk '{gsub(/./,"& ",$2);print $0}' \
  | sed "1i ${ind2_header}" \
  | cut -d ' ' -f 1-17 > lib4_transcript_20bp_ind2.txt

# First-order position-dependent one-hot columns (20 positions x 4 bases).
sed '1d' lib4_transcript_20bp_dependent1.txt \
  | sed '1d' \
  | awk '{gsub(/./,"& ",$2);print $0}' \
  | sed "1i ${dep1_header}" \
  | cut -d ' ' -f 1-81 > lib4_transcript_20bp_dep1.txt

# Second-order position-dependent one-hot columns. The header names 20
# positions, as before; the R step keeps only the first 305 columns.
sed '1d' lib4_transcript_20bp_dependent2.txt \
  | sed '1d' \
  | awk '{gsub(/./,"& ",$2);print $0}' \
  | sed "1i ${dep2_header}" \
  | cut -d ' ' -f 1-321 > lib4_transcript_20bp_dep2.txt
# Combine the four per-feature tables (ind1, ind2, dep1, dep2) into one
# one-hot table per guide set, joined on sgRNAID.
library(dplyr)
# --- toxic guides ---
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide")
onehot.ind1 <- read.delim("lib4_toxic_transcript_20bp_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("lib4_toxic_transcript_20bp_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("lib4_toxic_transcript_20bp_dep1.txt", header=T, sep=" ")
onehot.dep2 <- read.delim("lib4_toxic_transcript_20bp_dep2.txt", header=T, sep=" ")
# keep the first 305 columns only (sgRNAID + 304 values); presumably the
# 19 dinucleotide positions x 16 of a 20-mer, while the file header names
# 20 positions - confirm
onehot.dep2 <- onehot.dep2[,1:305]
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
write.table(onehot, "lib4_toxic_transcript_20bp_onehot.txt", quote=F, row.names=F, sep="\t")
# --- all guides: same steps; the onehot.* objects are overwritten ---
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide")
onehot.ind1 <- read.delim("lib4_transcript_20bp_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("lib4_transcript_20bp_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("lib4_transcript_20bp_dep1.txt", header=T, sep=" ")
onehot.dep2 <- read.delim("lib4_transcript_20bp_dep2.txt", header=T, sep=" ")
# keep the first 305 columns only, as for the toxic set
onehot.dep2 <- onehot.dep2[,1:305]
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
onehot.dep <- full_join(onehot.dep1, onehot.dep2, by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
write.table(onehot, "lib4_transcript_20bp_onehot.txt", quote=F, row.names=F, sep="\t")
# Attach the scores to the one-hot features for both guide sets and build
# the modelling matrix.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide")
df <- read.delim("20210922_Lib4_transcript_log2FC_CJ019_12h.txt", header=T, sep="\t")
# columns 1 and 3 are used as the guide ID and its score
score <- df[,c(1,3)]
colnames(score) <- c("sgRNAID", "score")
summary(score$score)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -0.99811 -0.15871 0.02336 -0.01369 0.17289 1.08272 59
toxic.df <- read.delim("20210920_lib4_toxic_transcript.txt", header=T, sep="\t")
# columns 1 and 3 again: guide ID and score
toxic.score <- toxic.df[,c(1,3)]
colnames(toxic.score) <- c("sgRNAID", "score")
summary(toxic.score$score)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# -12.4849 -1.3473 -1.0968 -2.6190 -1.0432 -0.6318
onehot <- read.delim("lib4_transcript_20bp_onehot.txt", header=T, sep="\t")
toxic.onehot <- read.delim("lib4_toxic_transcript_20bp_onehot.txt", header=T, sep="\t")
# left joins keep every scored guide; guides without features get NA columns
data.onehot <- left_join(score, onehot, by=c("sgRNAID"))
# 9998
toxic.data.onehot <- left_join(toxic.score, toxic.onehot, by=c("sgRNAID"))
# 58
# stack both sets and drop every row that contains an NA
data = na.omit(rbind(data.onehot, toxic.data.onehot))
# xmat: everything from column 3 on (the features; columns 1-2 are
# sgRNAID and score); xmat.score: the response vector
xmat = data[,3:ncol(data)]
xmat.score = data$score
### GBR:
# Gradient-boosted regression of the score on all feature columns of xmat
# (500 trees, 5-fold CV), then the iteration count suggested by the
# out-of-bag estimate.
library(gbm)
gbm.df <- gbm(formula=xmat.score ~ ., data=xmat, distribution = "gaussian", n.trees = 500, shrinkage = 0.1,interaction.depth = 3, bag.fraction = 0.2, train.fraction = 0.8,n.minobsinnode = 10, cv.folds = 5, keep.data = TRUE,verbose = FALSE, n.cores = 1)
best.iter <- gbm.perf(gbm.df, method = "OOB")
print(best.iter)
# OOB generally underestimates the optimal number of iterations although predictive performance is reasonably competitive. Using cv_folds>1 when calling gbm usually results in improved predictive performance.
# [1] 153
# attr(,"smoother")
# Call:
# loess(formula = object$oobag.improve ~ x, enp.target = min(max(4,
# length(x)/10), 50))
#
# Number of Observations: 500
# Equivalent Number of Parameters: 39.85
# Residual Standard Error: 3.103e-05
best.iter <- gbm.perf(gbm.df, method = "cv")
print(best.iter)
# 484
head(summary(gbm.df, n.trees = best.iter))
# var rel.inf
# p2.GG p2.GG 4.400380
# GC GC 4.221471
# p3.GG p3.GG 2.593367
# CC CC 2.316023
# p4.G p4.G 2.161703
# p1.GG p1.GG 1.858275
### RF:
# Random-forest regression of score on the features.
library(randomForest)
# from column 2 on: score plus all feature columns (sgRNAID dropped)
xmat = data[,2:ncol(data)]
# fixed seed so the forest can be reproduced
set.seed(131)
rf <- randomForest(score ~ ., data=xmat, mtry=3, importance=TRUE, na.action=na.omit)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = xmat, mtry = 3, importance = TRUE, na.action = na.omit)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 3
#
# Mean of squared residuals: 0.1611834
# % Var explained: 9.29
imp <- data.frame(round(importance(rf), 2))
imp$Feature <- rownames(imp)
colnames(imp) <- c("IncMSE", "IncNodePurity", "Feature")
# Mean Decrease Accuracy (%IncMSE) - This shows how much our model accuracy decreases if we leave out that variable. Mean Decrease Gini (IncNodePurity) - This is a measure of variable importance based on the Gini impurity index used for the calculating the splits in trees.
tail(imp[order( imp[,1] ),])
# IncMSE IncNodePurity Feature
# p11.TT 3.52 0.93 p11.TT
# p1.GG 3.65 3.09 p1.GG
# p10.TG 3.76 1.19 p10.TG
# p3.TC 3.87 1.40 p3.TC
# GC 3.98 9.45 GC
# p2.GG 5.88 8.38 p2.GG
tail(imp[order( imp[,2] ),])
# IncMSE IncNodePurity Feature
# GC 3.98 9.45 GC
# p4.G 3.40 9.51 p4.G
# p12.GC 0.80 10.06 p12.GC
# p11.TG 0.13 11.18 p11.TG
# AT 0.33 12.60 AT
# p10.AT 0.74 12.86 p10.AT
### Classification
# Random-forest classifier: "toxic" vs "normal" guides from the one-hot
# features.
data.onehot$class <- "normal"
toxic.data.onehot$class <- "toxic"
data = na.omit(rbind(data.onehot, toxic.data.onehot))
# from column 3 on: the feature columns plus the new class column
# (sgRNAID and score are dropped)
xmat = na.omit(data[,3:ncol(data)])
xmat$class = factor(xmat$class)
library(randomForest)
library(datasets)
library(caret)
set.seed(222)
# random split: each row goes to group 1 (train) with probability 0.7 and
# to group 2 (test) with probability 0.3
ind <- sample(2, nrow(xmat), replace = TRUE, prob = c(0.7, 0.3))
train <- xmat[ind==1,]
test <- xmat[ind==2,]
rf <- randomForest(class~., data=train, proximity=TRUE)
print(rf)
# Call:
# randomForest(formula = class ~ ., data = train, proximity = TRUE)
# Type of random forest: classification
# Number of trees: 500
# No. of variables tried at each split: 20
#
# OOB estimate of error rate: 0.58%
# Confusion matrix:
# normal toxic class.error
# normal 6992 0 0
# toxic 41 0 1
p1 <- predict(rf, train)
confusionMatrix(p1, train$class)
# Confusion Matrix and Statistics
#
# Reference
# Prediction normal toxic
# normal 6992 0
# toxic 0 41
#
# Accuracy : 1
# 95% CI : (0.9995, 1)
# No Information Rate : 0.9942
# P-Value [Acc > NIR] : < 2.2e-16
#
# Kappa : 1
#
# Mcnemar's Test P-Value : NA
#
# Sensitivity : 1.0000
# Specificity : 1.0000
# Pos Pred Value : 1.0000
# Neg Pred Value : 1.0000
# Prevalence : 0.9942
# Detection Rate : 0.9942
# Detection Prevalence : 0.9942
# Balanced Accuracy : 1.0000
#
# 'Positive' Class : normal
p2 <- predict(rf, test)
confusionMatrix(p2, test$class)
# Confusion Matrix and Statistics
#
# Reference
# Prediction normal toxic
# normal 2947 17
# toxic 0 0
#
# Accuracy : 0.9943
# 95% CI : (0.9908, 0.9967)
# No Information Rate : 0.9943
# P-Value [Acc > NIR] : 0.5640232
#
# Kappa : 0
#
# Mcnemar's Test P-Value : 0.0001042
#
# Sensitivity : 1.0000
# Specificity : 0.0000
# Pos Pred Value : 0.9943
# Neg Pred Value : NaN
# Prevalence : 0.9943
# Detection Rate : 0.9943
# Detection Prevalence : 1.0000
# Balanced Accuracy : 0.5000
#
# 'Positive' Class : normal
imp <- data.frame(round(importance(rf), 2))
imp$Feature <- rownames(imp)
colnames(imp) <- c("MeanDecreaseGini", "Feature")
imp.sort <- imp[order( imp[,1] ),]
tail(imp.sort)
# MeanDecreaseGini Feature
# UG 0.81 TG
# GC 0.81 GC
# CC 0.84 CC
# UC 0.88 TC
# GG 0.94 GG
# CG 1.10 CG
pdf("rf.classification.toxic.guide.imp.pdf")
varImpPlot(rf,sort = T,n.var = 10,main = "Top 10 - Variable Importance")
dev.off()
# GBM classification: label library guides "normal" and toxic guides "toxic",
# hold out 10 % of the rows, and fit a boosted classifier on the other 90 %.
data.onehot$class <- "normal"
toxic.data.onehot$class <- "toxic"
data = na.omit(rbind(data.onehot, toxic.data.onehot))
# Keep columns 3..ncol(data). The previous `3:ncol(data)-1` parsed as
# `(3:ncol(data)) - 1`, i.e. columns 2..(ncol-1): it dropped the last column
# (presumably `class`, which the next line needs) and pulled column 2 in as a
# predictor. This now matches the later "### Classification" section.
xmat = na.omit(data[, 3:ncol(data)])
xmat$class = factor(xmat$class)
library(gbm)
library(caret)
# Seed the RNG so the 90/10 partition and the CV folds can be reproduced.
set.seed(222)
indexes = createDataPartition(xmat$class, p = .90, list = F)
train = xmat[indexes, ]
test = xmat[-indexes, ]
mod_gbm = gbm(class ~., data = train, distribution = "multinomial", cv.folds = 10, shrinkage = .01, n.minobsinnode = 10, n.trees = 200)
print(mod_gbm)
# gbm(formula = class ~ ., distribution = "multinomial", data = train,
# n.trees = 200, n.minobsinnode = 10, shrinkage = 0.01, cv.folds = 10)
# A gradient boosted model with multinomial loss function.
# 200 iterations were performed.
# The best cross-validation iteration was 200.
# There were 404 predictors of which 26 had non-zero influence.
# Predicted class probabilities for the held-out rows.
pred = predict.gbm(object = mod_gbm,newdata = test,n.trees = 200,type = "response")
# For every row pick the class whose column holds the largest probability.
labels = colnames(pred)[apply(pred, 1, which.max)]
result = data.frame(test$class, labels)
print(result)
# caret::confusionMatrix() expects (data = predictions, reference = truth).
# The arguments used to be swapped, which reports the statistics relative to
# the model output instead of the true classes. The predicted labels are also
# given the same factor levels as the truth so a class that is never predicted
# still appears in the table.
cm = confusionMatrix(factor(labels, levels = levels(test$class)), test$class)
print(cm)
# caret train method: let caret tune a gbm classifier with 10-fold resampling.
# NOTE(review): method = "repeatedcv" is used without `repeats`; check caret's
# default to confirm how many repeats are actually run.
tc = trainControl(method = "repeatedcv", number = 10)
model = train(class ~., data=train, method="gbm", trControl=tc)
print(model)
pred = predict(model, test)
result = data.frame(test$class, pred)
print(result)
# confusionMatrix(data = predictions, reference = truth). The arguments were
# previously swapped; the transcript recorded right after this block was
# produced with the swapped order (its "Reference" columns are the predictions).
cm = confusionMatrix(factor(pred, levels = levels(test$class)), test$class)
print(cm)
# Confusion Matrix and Statistics
#
# Reference
# Prediction normal toxic
# normal 993 0
# toxic 5 0
#
# Accuracy : 0.995
# 95% CI : (0.9883, 0.9984)
# No Information Rate : 1
# P-Value [Acc > NIR] : 1.00000
#
# Kappa : 0
#
# Mcnemar's Test P-Value : 0.07364
#
# Sensitivity : 0.995
# Specificity : NA
# Pos Pred Value : NA
# Neg Pred Value : NA
# Prevalence : 1.000
# Detection Rate : 0.995
# Detection Prevalence : 0.995
# Balanced Accuracy : NA
#
# 'Positive' Class : normal
# Load the per-guide score tables and the one-hot feature tables for the whole
# library and for the toxic subset, then build a class-balanced table made of
# every toxic guide plus an equally sized random draw of library guides.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide")
df <- read.delim("20210922_Lib4_transcript_log2FC_CJ019_12h.txt", header=T, sep="\t")
# Columns 1 and 3 are used as the guide id and its score.
score <- df[,c(1,3)]
colnames(score) <- c("sgRNAID", "score")
toxic.df <- read.delim("20210920_lib4_toxic_transcript.txt", header=T, sep="\t")
toxic.score <- toxic.df[,c(1,3)]
colnames(toxic.score) <- c("sgRNAID", "score")
onehot <- read.delim("lib4_transcript_20bp_onehot.txt", header=T, sep="\t")
toxic.onehot <- read.delim("lib4_toxic_transcript_20bp_onehot.txt", header=T, sep="\t")
library(dplyr)
data.onehot <- left_join(score, onehot, by=c("sgRNAID"))
toxic.data.onehot <- left_join(toxic.score, toxic.onehot, by=c("sgRNAID"))
# Seed the down-sampling: without it every run draws a different "normal"
# subset and the numbers recorded below cannot be reproduced.
set.seed(131)
data.onehot.sample <- data.onehot[sample(nrow(data.onehot), nrow(toxic.data.onehot)),]
data = na.omit(rbind(data.onehot.sample, toxic.data.onehot))
# Feature matrix (everything after id and score) and the regression target.
xmat = data[,3:ncol(data)]
xmat.score = data$score
### GBR:
library(gbm)
# With bag.fraction = 0.2 and n.minobsinnode = 10 this call aborted with:
# The data set is too small or the subsampling rate is too large: `nTrain * bag.fraction <= n.minobsinnode`
# The balanced table is small (toxic guides plus an equal number of sampled
# guides), so each tree is now fitted on half of the training rows and smaller
# terminal nodes are allowed, which keeps nTrain * bag.fraction above the limit.
gbm.df <- gbm(formula=xmat.score ~ ., data=xmat, distribution = "gaussian", n.trees = 500, shrinkage = 0.1,interaction.depth = 3, bag.fraction = 0.5, train.fraction = 0.8,n.minobsinnode = 5, cv.folds = 5, keep.data = TRUE,verbose = FALSE, n.cores = 1)
### RF:
# Random-forest regression of the guide score on every remaining column.
library(randomForest)
# Drop the first column (the guide id); `score` stays as the response.
xmat <- data[, 2:ncol(data)]
set.seed(131)
rf <- randomForest(
  score ~ .,
  data = xmat,
  mtry = 3,
  importance = TRUE,
  na.action = na.omit
)
print(rf)
# Call:
# randomForest(formula = score ~ ., data = xmat, mtry = 3, importance = TRUE, na.action = na.omit)
# Type of random forest: regression
# Number of trees: 500
# No. of variables tried at each split: 3
#
# Mean of squared residuals: 5.943516
# % Var explained: 18.32
# Variable-importance table for the regression forest, rounded to 2 decimals.
imp <- as.data.frame(round(importance(rf), digits = 2))
imp[["Feature"]] <- row.names(imp)
names(imp) <- c("IncMSE", "IncNodePurity", "Feature")
# %IncMSE: increase in out-of-bag mean squared error when a variable's values are permuted (larger = more important). IncNodePurity: total decrease in node impurity from splits on the variable; for regression forests impurity is the residual sum of squares, not the Gini index.
tail(imp[order(imp[["IncMSE"]]), ])
# IncMSE IncNodePurity Feature
# p13.GG 2.93 0.96 p13.GG
# p1.A 3.06 6.89 p1.A
# p11.T 3.11 9.08 p11.T
# p10.AT 3.78 12.40 p10.AT
# p12.GC 4.32 16.44 p12.GC
# p11.TG 4.58 17.37 p11.TG
tail(imp[order( imp[,2] ),])
# IncMSE IncNodePurity Feature
# p11.T 3.11 9.08 p11.T
# A 2.85 9.96 A
# p12.G 2.80 11.75 p12.G
# p10.AT 3.78 12.40 p10.AT
# p12.GC 4.32 16.44 p12.GC
# p11.TG 4.58 17.37 p11.TG
### Classification
# Toxic-vs-normal random forest on the class-balanced table.
data.onehot.sample[["class"]] <- "normal"
toxic.data.onehot[["class"]] <- "toxic"
data <- na.omit(rbind(data.onehot.sample, toxic.data.onehot))
# Columns 3 onward: the feature columns plus the class label added above.
xmat <- na.omit(data[, 3:ncol(data)])
xmat[["class"]] <- factor(xmat[["class"]])
library(randomForest)
library(datasets)
library(caret)
set.seed(222)
# Each row is assigned 1 (train) with probability 0.7 or 2 (test) with 0.3.
ind <- sample(2, nrow(xmat), replace = TRUE, prob = c(0.7, 0.3))
train <- xmat[ind == 1, ]
test <- xmat[ind == 2, ]
rf <- randomForest(class ~ ., data = train, proximity = TRUE)
print(rf)
p1 <- predict(rf, newdata = train)
#confusionMatrix(p1, train$class)
p2 <- predict(rf, newdata = test)
confusionMatrix(data = p2, reference = test$class)
# Importance table sorted ascending, so tail() lists the strongest features.
imp <- as.data.frame(round(importance(rf), digits = 2))
imp[["Feature"]] <- row.names(imp)
names(imp) <- c("MeanDecreaseGini", "Feature")
imp.sort <- imp[order(imp[["MeanDecreaseGini"]]), ]
tail(imp.sort)
# Call:
# randomForest(formula = class ~ ., data = train, proximity = TRUE)
# Type of random forest: classification
# Number of trees: 500
# No. of variables tried at each split: 20
#
# OOB estimate of error rate: 26.92%
# Confusion matrix:
# normal toxic class.error
# normal 28 11 0.2820513
# toxic 10 29 0.2564103
# Confusion Matrix and Statistics
#
# Reference
# Prediction normal toxic
# normal 12 1
# toxic 7 18
#
# Accuracy : 0.7895
# 95% CI : (0.6268, 0.9045)
# No Information Rate : 0.5
# P-Value [Acc > NIR] : 0.000236
#
# Kappa : 0.5789
#
# Mcnemar's Test P-Value : 0.077100
#
# Sensitivity : 0.6316
# Specificity : 0.9474
# Pos Pred Value : 0.9231
# Neg Pred Value : 0.7200
# Prevalence : 0.5000
# Detection Rate : 0.3158
# Detection Prevalence : 0.3421
# Balanced Accuracy : 0.7895
#
# 'Positive' Class : normal
#
# MeanDecreaseGini Feature
# p3.T 0.54 p3.T
# p2.G 0.62 p2.G
# GC 0.70 GC
# p2.GG 0.83 p2.GG
# CG 0.87 CG
# p5.T 0.93 p5.T
https://www.ebi.ac.uk/Tools/msa/clustalo/
# Turn the two-column table (name, sequence) into FASTA, skipping the header row.
awk 'NR > 1 { print ">" $1 "\n" $2 }' lib4_toxic_transcript_20bp.txt > lib4_toxic_transcript_20bp.fasta
# CLUSTAL O(1.2.4) multiple sequence alignment
#
# PP_5504:2546864-2546886:+ ----UCCGGCUG--------GUGU-AGCCCGCC------------------ 20
# PP_2165:2470686-2470708:- ---AUCACUC-------------A-UGACGCCGGCAC-------------- 20
# ccmA:4917161-4917183:+ --AGUCCUUG-------------A-UGCCGGCGGCG--------------- 20
# PP_1579:1769468-1769490:- ----GGGCGC-------------U-CGCCCGCGGCUCG------------- 20
# PP_4833:5500436-5500458:- -----CCCGC-------------U-CGCCGGCGGCGAUC------------ 20
# PP_4547:5168516-5168538:- ------UGG-CC--------GGCC-UGUCUGGU-------ACA-------- 20
# PP_4543:5160639-5160661:+ ------GGG-CC--------GCUG-ACCGGGCU-------GAA-------- 20
# cobD:1869923-1869945:- -----CAGG-CG--------GCGC-UCCAGGUU-------GC--------- 20
# PP_5433:252487-252509:- GCAUCCAGG-CU--------CUGC-UCCAG--------------------- 20
# PP_3866:4390891-4390913:+ -------------------------UGCCGGCGCCAAGAAAAAAC------ 20
# garK:3602777-3602799:- ------------------------------GUGGCCCGC-GCACGGUCUGG 20
# aruC:452409-452431:- ---UGCCGG--U--------ACUC-UGACACAGC----------------- 20
# PP_5676:5012360-5012382:+ -----------U--------CCUC-CGCCGGCGGACUUCA----------- 20
# PP_4942:5624331-5624353:+ ----------------------------CGCCG-CCGGCAGAAUGAUCU-- 20
# cigR:2873385-2873407:- -------------------------UGCCGCCGGCGCUCUGCUUG------ 20
# PP_3382:3829541-3829563:+ --------------------------ACCGGCGCCCAGCACCAGGG----- 20
# csgG:3939545-3939567:+ --------------------------CAGGGCGUCCAUCAACAUGC----- 20
# mgtE:5078375-5078397:+ --------------------------GUCGGCGUCCAUCUCCUUGG----- 20
# PP_2607:2982161-2982183:+ --------------------------GCCGGCGUC-ACGGGCUUGGU---- 20
# murF:1518472-1518494:- -----CAGGGCG--------GCUA-CCGCGCCUU----------------- 20
# PP_5456:870252-870274:+ -------GGGCA--------GCUC-GCGCCCCAG---CC------------ 20
# PP_4525:5141235-5141257:+ -------------------------CGCCAGCAAC-GCCGGCAAGU----- 20
# pdxY:6108080-6108102:+ ---------------------CCG-CGGCGCUGUU-GCCGGCG-------- 20
# PP_t52:5079842-5079864:+ -------GGGGA--------GCGC-CUUCGGCCAC-U-------------- 20
# PP_4048:4561299-4561321:+ ---------CCG--------GCGG-CUGCGGCGUU-GCC------------ 20
# astA-I:5089596-5089618:+ -----CAGGCCG--------ACGG-CGCCGGCGA----------------- 20
# PP_3193:3622396-3622418:+ ------AGAAGA--------ACGC-CGCCGGCGAC---------------- 20
# PP_3509:3982129-3982151:- -----------AUCGAGAAUAUGC----GCAGGAC---------------- 20
# PP_5157:5883517-5883539:- ------------UGGCGAGUUCUC----GGCCUGUA--------------- 20
# PP_5455:811177-811199:- -----------AGCGCGGGUAUGC----GCACUGU---------------- 20
# PP_0596:694389-694411:- -----------GUCCAGCGUACUC----AGUUGAC---------------- 20
# PP_3877:4397528-4397550:+ -----------ACCGGGCGUACUG----CGUUACA---------------- 20
# ychF:837219-837241:+ -----------AUGGGCACGAUGC----CGCUGUU---------------- 20
# PP_4996:5693370-5693392:- -------------------UACGC----CGUUCUGCGCCUUCA-------- 20
# PP_4307:4899070-4899092:+ ------------GAG---GAACGC----CACACUCAGCC------------ 20
# PP_5143:5868961-5868983:+ -------------GG---GGACGC----CACGGCUGGCCU----------- 20
# arsR-I:2176613-2176635:+ -------CGGCA--AGGCGC--------AACGCAUCG-------------- 20
# PP_3160:3581271-3581293:- ------CCGAAC--GGGCUC--------AAGCUGUC--------------- 20
# PP_0663:771926-771948:- ----------CA--CCGCGC--------AUGCGGUCAUAG----------- 20
# PP_1752:1953874-1953896:- ----UGAGCACA--UCGAUC--------AUGCGC----------------- 20
# rlmH:5472171-5472193:+ -----------------AGGG----UGACAAUGCGUUCCCC---------- 20
# nfuA:2718943-2718965:- -----------AAAAUGCGAA----UGCCGAUGCC---------------- 20
# mexF:3878498-3878520:- ------UGGGCUGGGUACGGG----UGACG--------------------- 20
# PP_5006:5703946-5703968:- ------UGGGCUGAUGCCGAG----UUCGU--------------------- 20
# ybaB:4852902-4852924:+ ---------------CAUUUUUUCCUGCAUCUGCU---------------- 20
# arfB:3351078-3351100:- ---------------------AUCUGGCAGAUGCACGUUAU---------- 20
# PP_1325:1511435-1511457:+ ---------------------UUCGAGCAGUUGCUCGAUGC---------- 20
# PP_3157:3576652-3576674:- ---------------------AUCGGCCAAUUGCUCGGCAA---------- 20
# PP_3893:4410081-4410103:+ ---------------------AUCUCCCCGUUGCUCAGCGU---------- 20
# PP_4948:5635636-5635658:- ----CGGGGGCUGAGCAAGGUC--UC------------------------- 20
# rubA:6062766-6062788:+ ----UCGGGGCACAGCCAGUCU--UC------------------------- 20
# PP_0141:149432-149454:+ ------------GGGCCAGUUU--CACCCCG-GCC---------------- 20
# PP_2930:3332877-3332899:+ -------------CCCCAGCUU--GCGUCCG-GCAU--------------- 20
# rpsM:561837-561859:- ----------CUGGGCGAUUAC--UUGCGGAU------------------- 20
# PP_2661:3048290-3048312:- ------------GAGCGUUUUC--CAGCCCGCGC----------------- 20
# PP_1474:1679489-1679511:+ --AGGUGCUCCUUGGCAUUU------------GC----------------- 20
# lysS:1701678-1701700:- ------------GGGCGUUUUC--UUCCUGUUGC----------------- 20
# yejA:4685507-4685529:- ----------CGGGGCCUUUUC--GA--UCUUGC----------------- 20
# Column 1 of the toxic-guide table: the guide names.
cut -f1 < 20210920_lib4_toxic_transcript.txt > 20210920_lib4_toxic_transcript_names.txt
# Column 2, truncated to its first 20 characters: the guide sequence.
cut -f2 < 20210920_lib4_toxic_transcript.txt | cut -c1-20 > 20210920_lib4_toxic_transcript_guideseq.txt
# Characters 15-20 of each guide: the 6-nt SEED region.
cut -c15-20 < 20210920_lib4_toxic_transcript_guideseq.txt > 20210920_lib4_toxic_transcript_SEEDseq.txt
paste 20210920_lib4_toxic_transcript_names.txt 20210920_lib4_toxic_transcript_SEEDseq.txt > lib4_toxic_transcript_seed.txt
# FASTA conversion; the first (header) row is skipped.
awk 'NR > 1 { print ">" $1 "\n" $2 }' lib4_toxic_transcript_seed.txt > lib4_toxic_transcript_seed.fasta
# CLUSTAL O(1.2.4) multiple sequence alignment
# PP_3866:4390891-4390913:+ -----AAAAA-C---- 6
# PP_4307:4899070-4899092:+ -----UCAGC-C---- 6
# PP_5456:870252-870274:+ -----CCAGC-C---- 6
# aruC:452409-452431:- ----CACAGC------ 6
# PP_0663:771926-771948:- ---UCAUAG------- 6
# PP_4996:5693370-5693392:- --CCUUCA-------- 6
# PP_5676:5012360-5012382:+ --ACUUCA-------- 6
# PP_5433:252487-252509:- ---CUCCAG------- 6
# rlmH:5472171-5472193:+ --UUCCCC-------- 6
# PP_1474:1679489-1679511:+ AUUUGC---------- 6
# mexF:3878498-3878520:- --GUGACG-------- 6
# cobD:1869923-1869945:- GGUUGC---------- 6
# lysS:1701678-1701700:- UGUUGC---------- 6
# PP_4048:4561299-4561321:+ -GUUGCC--------- 6
# PP_0596:694389-694411:- -GUUGAC--------- 6
# PP_4547:5168516-5168538:- --GGUACA-------- 6
# PP_4525:5141235-5141257:+ ---GCAAGU------- 6
# PP_3877:4397528-4397550:+ --GUUACA-------- 6
# arfB:3351078-3351100:- -CGUUAU--------- 6
# PP_2165:2470686-2470708:- -CGGCAC--------- 6
# PP_2930:3332877-3332899:+ -CGGCAU--------- 6
# PP_3157:3576652-3576674:- -CGGCAA--------- 6
# garK:3602777-3602799:- --GUCUGG-------- 6
# PP_4948:5635636-5635658:- -GGUCUC--------- 6
# rubA:6062766-6062788:+ --GUCUUC-------- 6
# ybaB:4852902-4852924:+ ---UCU-GCU------ 6
# PP_5143:5868961-5868983:+ ------UGGC-CU--- 6
# PP_5006:5703946-5703968:- -GUUCGU--------- 6
# PP_3509:3982129-3982151:- ----CAGGAC------ 6
# PP_3382:3829541-3829563:+ ---CCAGGG------- 6
# murF:1518472-1518494:- -CGCCUU--------- 6
# cigR:2873385-2873407:- --UGCUUG-------- 6
# mgtE:5078375-5078397:+ ---CCUUGG------- 6
# PP_2607:2982161-2982183:+ ----CUUGGU------ 6
# yejA:4685507-4685529:- ---UCUUGC------- 6
# PP_2661:3048290-3048312:- -----CCGCG-C---- 6
# csgG:3939545-3939567:+ ---ACAUGC------- 6
# nfuA:2718943-2718965:- ----GAUGCC------ 6
# PP_1325:1511435-1511457:+ ---CGAUGC------- 6
# PP_1752:1953874-1953896:- -----AUGCG-C---- 6
# PP_5455:811177-811199:- --------CA-CUGU- 6
# PP_5157:5883517-5883539:- ---------C-CUGUA 6
# PP_3160:3581271-3581293:- ---------G-CUGUC 6
# ychF:837219-837241:+ ---------G-CUGUU 6
# PP_4543:5160639-5160661:+ ---------G-CUGAA 6
# PP_1579:1769468-1769490:- ------GG---CUCG- 6
# rpsM:561837-561859:- -------GCGGAU--- 6
# PP_t52:5079842-5079864:+ -------GCC-ACU-- 6
# PP_4942:5624331-5624353:+ --------UG-AUCU- 6
# PP_4833:5500436-5500458:- -------GCG-AUC-- 6
# PP_3893:4410081-4410103:+ -----CAGCG-U---- 6
# arsR-I:2176613-2176635:+ -------GCA-UCG-- 6
# PP_5504:2546864-2546886:+ ----CCCGCC------ 6
# PP_3193:3622396-3622418:+ ------GGCG-AC--- 6
# PP_0141:149432-149454:+ ----CCGGCC------ 6
# astA-I:5089596-5089618:+ -----CGGCG-A---- 6
# ccmA:4917161-4917183:+ ----GCGGCG------ 6
# pdxY:6108080-6108102:+ ----CCGGCG------ 6
PP_2930_3332877-3332899_+ ---CGGCAU--
PP_3157_3576652-3576674_- ---CGGCAA--
PP_2165_2470686-2470708_- ---CGGCAC--
PP_4525_5141235-5141257_+ -----GCAAGU
astA-I_5089596-5089618_+ ---CGGCGA--
ccmA_4917161-4917183_+ --GCGGCG---
pdxY_6108080-6108102_+ --CCGGCG---
PP_3193_3622396-3622418_+ ----GGCGAC-
PP_1579_1769468-1769490_- ----GGCUCG-
aruC_452409-452431_- -CACAGC----
PP_0663_771926-771948_- UCAUAG-----
PP_4307_4899070-4899092_+ --UCAGCC---
PP_3382_3829541-3829563_+ --CCAGGG---
PP_3509_3982129-3982151_- ---CAGGAC--
PP_5433_252487-252509_- CUCCAG-----
PP_t52_5079842-5079864_+ -GCCACU----
murF_1518472-1518494_- ---CGCCUU--
PP_5143_5868961-5868983_+ ---UGGCCU--
PP_5456_870252-870274_+ --CCAGCC---
PP_5504_2546864-2546886_+ --CCCGCC---
PP_0141_149432-149454_+ --CCGGCC---
arsR-I_2176613-2176635_+ --GCAUCG---
PP_3893_4410081-4410103_+ ---CAGCGU--
nfuA_2718943-2718965_- ----GAUGCC-
PP_1325_1511435-1511457_+ ---CGAUGC--
csgG_3939545-3939567_+ ---ACAUGC--
PP_1752_1953874-1953896_- ---AUGCGC--
PP_2661_3048290-3048312_- ---CCGCGC--
PP_4543_5160639-5160661_+ -GCUGAA----
rpsM_561837-561859_- -GCGGAU----
PP_4833_5500436-5500458_- --GCGAUC---
PP_4942_5624331-5624353_+ ---UGAUCU--
PP_4948_5635636-5635658_- GGUCUC-----
rubA_6062766-6062788_+ -GUCUUC----
garK_3602777-3602799_- -GUCUGG----
ybaB_4852902-4852924_+ --UCUGCU---
PP_3160_3581271-3581293_- --GCUGUC---
ychF_837219-837241_+ --GCUGUU---
PP_5157_5883517-5883539_- --CCUGUA---
PP_5455_811177-811199_- -CACUGU----
mgtE_5078375-5078397_+ -CCUUGG----
PP_2607_2982161-2982183_+ --CUUGGU---
cigR_2873385-2873407_- UGCUUG-----
PP_4996_5693370-5693392_- -CCUUCA----
PP_5676_5012360-5012382_+ -ACUUCA----
PP_5006_5703946-5703968_- --GUUCGU---
rlmH_5472171-5472193_+ ---UUCCCC--
cobD_1869923-1869945_- -GGUUGC----
lysS_1701678-1701700_- -UGUUGC----
PP_4048_4561299-4561321_+ --GUUGCC---
PP_1474_1679489-1679511_+ -AUUUGC----
yejA_4685507-4685529_- -UCUUGC----
mexF_3878498-3878520_- --GUGACG---
PP_0596_694389-694411_- -GUUGAC----
PP_3877_4397528-4397550_+ --GUUACA---
PP_4547_5168516-5168538_- --GGUACA---
PP_3866_4390891-4390913_+ -AAAAAC----
arfB_3351078-3351100_- -CGUUAU----
http://homer.ucsd.edu/homer/motif/ https://www.ncbi.nlm.nih.gov/nuccore/1160889053?report=fasta
# HOMER motif search on the 20-bp toxic guides, first without and then with the
# full library as background.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# conda create --name homer homer=4.11 python=3.8 -c bioconda -c conda-forge
conda activate homer
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide
# -p: re-running this section must not fail because the directory exists.
mkdir -p homer.output
# NOTE(review): confirm that `homer2 denovo -s` accepts a FASTA file; check the
# `homer2 denovo` usage text for the expected input option.
homer2 denovo -s lib4_toxic_transcript_20bp.fasta > homer.output.txt
findMotifs.pl lib4_toxic_transcript_20bp.fasta fasta homer.output
mkdir -p homer.all.output
# Background set: every library guide as FASTA (header row skipped). Built once
# and reused by both runs below.
sed '1d' lib4_transcript_20bp.txt | awk '{print ">"$1"\n"$2}' > lib4_transcript_20bp.fasta
findMotifs.pl lib4_toxic_transcript_20bp.fasta fasta homer.all.output -fasta lib4_transcript_20bp.fasta
# finding lots of CGCG-like motifs.... alter parameters
# NOTE(review): this -cpg run writes into the same homer.all.output directory
# as the run above, so the earlier results are replaced.
findMotifs.pl lib4_toxic_transcript_20bp.fasta fasta homer.all.output -fasta lib4_transcript_20bp.fasta -cpg
# -> try running with just the SEED sequence
# HOMER motif search restricted to the 6-nt SEED region (characters 15-20 of
# each guide), toxic guides versus the whole library.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide/
# (Re)create both name files here so this section does not depend on files
# produced elsewhere; column 1 of each table holds the guide name.
cut -f 1 20210920_lib4_toxic_transcript.txt > 20210920_lib4_toxic_transcript_names.txt
cut -f 2 20210920_lib4_toxic_transcript.txt | cut -c15-20 > 20210920_lib4_toxic_transcript_SEEDseq.txt
paste 20210920_lib4_toxic_transcript_names.txt 20210920_lib4_toxic_transcript_SEEDseq.txt > lib4_toxic_transcript_SEED.txt
# The library name file was read below but never written in this section.
cut -f 1 20210922_Lib4_transcript_log2FC_CJ019_12h.txt > 20210922_Lib4_transcript_log2FC_CJ019_12h_names.txt
cut -f 2 20210922_Lib4_transcript_log2FC_CJ019_12h.txt | cut -c15-20 > 20210922_Lib4_transcript_log2FC_CJ019_12h_SEEDseq.txt
paste 20210922_Lib4_transcript_log2FC_CJ019_12h_names.txt 20210922_Lib4_transcript_log2FC_CJ019_12h_SEEDseq.txt > lib4_transcript_SEED.txt
# Name/sequence tables -> FASTA, dropping the header row.
sed '1d' lib4_toxic_transcript_SEED.txt | awk '{print ">"$1"\n"$2}' > lib4_toxic_transcript_SEED.fasta
sed '1d' lib4_transcript_SEED.txt | awk '{print ">"$1"\n"$2}' > lib4_transcript_SEED.fasta
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate homer
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide
# -p: do not fail when the output directory is left over from an earlier run.
mkdir -p homer.SEED.output
findMotifs.pl lib4_toxic_transcript_SEED.fasta fasta homer.SEED.output -fasta lib4_transcript_SEED.fasta -cpg -len 3,4,5
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide
python
from Bio import SeqIO
# Write one TSV row per FASTA record: record name, A/C/G/T counts, sequence
# length and GC fraction (the "CG%" column holds a 0-1 fraction).
# `with` closes both files even if parsing fails part-way through.
with open('lib4_transcript_20bp.fasta', 'r') as input_file, \
        open('transcript.gc.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        sequence = str(cur_record.seq)
        A_count = sequence.count('A')
        C_count = sequence.count('C')
        G_count = sequence.count('G')
        # The column is labelled T: count T and U together so DNA and RNA
        # input both fill it (previously only U was counted).
        T_count = sequence.count('T') + sequence.count('U')
        length = len(sequence)
        # An empty record would otherwise raise ZeroDivisionError.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)
exit()
python
from Bio import SeqIO
# Write one TSV row per FASTA record: record name, A/C/G/T counts, sequence
# length and GC fraction (the "CG%" column holds a 0-1 fraction).
# `with` closes both files even if parsing fails part-way through.
with open('lib4_toxic_transcript_20bp.fasta', 'r') as input_file, \
        open('toxic.gc.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        sequence = str(cur_record.seq)
        A_count = sequence.count('A')
        C_count = sequence.count('C')
        G_count = sequence.count('G')
        # The column is labelled T: count T and U together so DNA and RNA
        # input both fill it (previously only U was counted).
        T_count = sequence.count('T') + sequence.count('U')
        length = len(sequence)
        # An empty record would otherwise raise ZeroDivisionError.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)
exit()
# R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide")
# read.delim() already defaults to header = TRUE and sep = "\t".
gc <- read.delim("transcript.gc.tsv")
toxic.gc <- read.delim("toxic.gc.tsv")
# The "CG%" header is read back as the column name "CG.".
mean(gc[["CG."]])
# 0.6222545
mean(toxic.gc$CG.)
# 0.6646552
summary(gc[,2:5])
# A C G T
# Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
# 1st Qu.: 3.000 1st Qu.: 5.000 1st Qu.: 5.000 1st Qu.: 2.000
# Median : 4.000 Median : 6.000 Median : 6.000 Median : 4.000
# Mean : 3.762 Mean : 6.142 Mean : 6.303 Mean : 3.793
# 3rd Qu.: 5.000 3rd Qu.: 7.000 3rd Qu.: 8.000 3rd Qu.: 5.000
# Max. :12.000 Max. :14.000 Max. :15.000 Max. :12.000
summary(toxic.gc[,2:5])
# A C G T
# Min. :0.000 Min. : 3.000 Min. : 2.000 Min. : 0.000
# 1st Qu.:2.000 1st Qu.: 6.000 1st Qu.: 5.000 1st Qu.: 3.000
# Median :2.500 Median : 7.000 Median : 6.000 Median : 4.000
# Mean :2.931 Mean : 6.741 Mean : 6.552 Mean : 3.776
# 3rd Qu.:4.000 3rd Qu.: 8.000 3rd Qu.: 7.750 3rd Qu.: 5.000
# Max. :8.000 Max. :10.000 Max. :11.000 Max. :10.000
#### try with just the SEED sequence...
python
from Bio import SeqIO
# Write one TSV row per FASTA record: record name, A/C/G/T counts, sequence
# length and GC fraction (the "CG%" column holds a 0-1 fraction).
# `with` closes both files even if parsing fails part-way through.
with open('lib4_transcript_SEED.fasta', 'r') as input_file, \
        open('transcript.SEED.gc.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        sequence = str(cur_record.seq)
        A_count = sequence.count('A')
        C_count = sequence.count('C')
        G_count = sequence.count('G')
        # The column is labelled T: count T and U together so DNA and RNA
        # input both fill it (previously only U was counted).
        T_count = sequence.count('T') + sequence.count('U')
        length = len(sequence)
        # An empty record would otherwise raise ZeroDivisionError.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)
exit()
python
from Bio import SeqIO
# Write one TSV row per FASTA record: record name, A/C/G/T counts, sequence
# length and GC fraction (the "CG%" column holds a 0-1 fraction).
# `with` closes both files even if parsing fails part-way through.
with open('lib4_toxic_transcript_SEED.fasta', 'r') as input_file, \
        open('toxic.SEED.gc.tsv', 'w') as output_file:
    output_file.write('Window\tA\tC\tG\tT\tLength\tCG%\n')
    for cur_record in SeqIO.parse(input_file, "fasta"):
        gene_name = cur_record.name
        sequence = str(cur_record.seq)
        A_count = sequence.count('A')
        C_count = sequence.count('C')
        G_count = sequence.count('G')
        # The column is labelled T: count T and U together so DNA and RNA
        # input both fill it (previously only U was counted).
        T_count = sequence.count('T') + sequence.count('U')
        length = len(sequence)
        # An empty record would otherwise raise ZeroDivisionError.
        cg_percentage = float(C_count + G_count) / length if length else 0.0
        output_line = '%s\t%i\t%i\t%i\t%i\t%i\t%f\n' % \
            (gene_name, A_count, C_count, G_count, T_count, length, cg_percentage)
        output_file.write(output_line)
exit()
# R
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/toxic.guide")
# read.delim() already defaults to header = TRUE and sep = "\t".
gc <- read.delim("transcript.SEED.gc.tsv")
toxic.gc <- read.delim("toxic.SEED.gc.tsv")
# The "CG%" header is read back as the column name "CG.".
mean(gc[["CG."]])
# 0.6173568
mean(toxic.gc$CG.)
# 0.6235633
summary(gc[,2:5])
# A C G T
# Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
# 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.000
# Median :1.000 Median :2.000 Median :2.000 Median :1.000
# Mean :1.139 Mean :1.871 Mean :1.833 Mean :1.157
# 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:2.000
# Max. :6.000 Max. :6.000 Max. :5.000 Max. :6.000
summary(toxic.gc[,2:5])
# A C G T
# Min. :0.0000 Min. :1 Min. :0.000 Min. :0.000
# 1st Qu.:0.0000 1st Qu.:1 1st Qu.:1.000 1st Qu.:1.000
# Median :1.0000 Median :2 Median :2.000 Median :1.000
# Mean :0.8448 Mean :2 Mean :1.741 Mean :1.414
# 3rd Qu.:1.0000 3rd Qu.:2 3rd Qu.:2.000 3rd Qu.:2.000
# Max. :5.0000 Max. :5 Max. :4.000 Max. :3.000
https://towardsdatascience.com/bootstrap-regression-in-r-98bfe4ff5007 https://github.com/serafimpetrov1/bootstrap/blob/main/BootReg.R
# Simulate a population of n points with y = x + standard-normal noise.
set.seed(2021)
n <- 1000
x <- rnorm(n)
y <- x + rnorm(n)
# Regression on the full population.
population.data <- data.frame(x = x, y = y)
population.model <- lm(y ~ x, data = population.data)
summary(population.model)
# Regression on 20 rows drawn from the population with replacement.
sample.data <- population.data[sample(nrow(population.data), size = 20, replace = TRUE), ]
sample.model <- lm(y ~ x, data = sample.data)
summary(sample.model)
#Plotting the models: scatter of the population with both fitted lines.
plot(y ~ x, col = "gray", main = 'Population and Sample Regressions')
abline(coef(population.model)[1], coef(population.model)[2], col = "red")
abline(coef(sample.model)[1], coef(sample.model)[2], col = "blue", lty = 2)
# Legend entries must follow the lines drawn above: the population fit is the
# red solid line and the sample fit the blue dashed one (the labels used to be
# swapped).
legend("topleft", legend = c("Population", "Sample"), col = c("red", "blue"), lty = 1:2, cex = 0.8)
# Bootstrap regression: refit y ~ x on 1000 resamples (with replacement) of the
# sample data and collect the intercept and slope of every refit.
sample_coef_intercept <- c()
sample_coef_x1 <- c()
for (i in seq_len(1000)) {
  sample_d <- sample.data[sample(1:nrow(sample.data), nrow(sample.data), replace = TRUE), ]
  model_bootstrap <- lm(y ~ x, data = sample_d)
  sample_coef_intercept <- append(sample_coef_intercept, coef(model_bootstrap)[1])
  sample_coef_x1 <- append(sample_coef_x1, coef(model_bootstrap)[2])
}
# Row 1: intercepts, row 2: slopes; one column per bootstrap refit.
coefs <- rbind(sample_coef_intercept, sample_coef_x1)
# Side-by-side coefficient estimates: population fit, sample fit, and the mean
# of the bootstrap refits, rounded to 4 decimals.
means.boot <- c(mean(sample_coef_intercept), mean(sample_coef_x1))
knitr::kable(
  round(
    cbind(
      population = coef(summary(population.model))[, "Estimate"],
      sample = coef(summary(sample.model))[, "Estimate"],
      bootstrap = means.boot
    ),
    digits = 4
  ),
  format = "simple",
  caption = "Coefficients in different models"
)
# 95 % confidence intervals: model-based for the population and sample fits,
# percentile-based (2.5 % / 97.5 % quantiles) for the bootstrap coefficients.
confint(population.model)
confint(sample.model)
a <-
  cbind(
    quantile(sample_coef_intercept, probs = 0.025),
    quantile(sample_coef_intercept, probs = 0.975))
b <-
  cbind(quantile(sample_coef_x1, probs = 0.025),
        quantile(sample_coef_x1, probs = 0.975))
# Named ci.table rather than `c`, so the table no longer masks base::c(), which
# this very block keeps calling.
ci.table <-
  round(cbind(
    population = confint(population.model),
    sample = confint(sample.model),
    boot = rbind(a, b)), 4)
colnames(ci.table) <- c("2.5 %", "97.5 %",
                        "2.5 %", "97.5 %",
                        "2.5 %", "97.5 %")
# First row labels which model each pair of columns belongs to.
knitr::kable(rbind(
  c('population',
    'population',
    'sample',
    'sample',
    'bootstrap',
    'bootstrap'), ci.table))
#Predicting on new data: 95 % confidence band of the sample model over the
# observed range of x.
new.data = seq(min(x), max(x), by = 0.05)
conf_interval <-
  predict(
    sample.model,
    newdata = data.frame(x = new.data),
    interval = "confidence",
    level = 0.95)
#Plotting the results on the project step-by-step
plot(
  y ~ x,
  col = "gray",
  xlab = "x",
  ylab = "y",
  main = "Compare regressions")
# One faint red line per bootstrap refit (each column of coefs is an
# intercept/slope pair).
apply(coefs, 2, abline, col = rgb(1, 0, 0, 0.03))
abline(coef(population.model)[1], coef(population.model)[2], col = "blue")
abline(coef(sample.model)[1],
       coef(sample.model)[2],
       col = "black",
       lty = 2, lwd=3)
abline(mean(sample_coef_intercept),
       mean(sample_coef_x1),
       col = "green",
       lty = 4, lwd=3)
# Lower and upper bounds of the sample model's confidence band.
lines(new.data, conf_interval[, 2], col = "black", lty = 3, lwd=3)
lines(new.data, conf_interval[, 3], col = "black", lty = 3, lwd=3)
# One legend entry per kind of line drawn above, with matching colour and line
# type. The old legend showed "Sample" in green, but green is the bootstrap
# mean; the sample fit and its confidence band are black.
legend("topleft",
       legend = c("Bootstrap fits", "Population", "Sample",
                  "Bootstrap mean", "Sample 95% CI"),
       col = c("red", "blue", "black", "green", "black"),
       lty = c(1, 1, 2, 4, 3),
       cex = 0.8)
## Boot package
set.seed(0)
library(boot)
# Statistic for boot::boot(): R-squared of a linear model fitted to one resample.
#   formula - model formula passed on to lm()
#   data    - data frame holding the variables named in the formula
#   indices - row indices of the resample, supplied by boot()
# Returns a single number, the model's R-squared.
rsq_function <- function(formula, data, indices) {
  d <- data[indices, ]          # rows selected for this bootstrap replicate
  fit <- lm(formula, data = d)  # fit regression model
  # Use the full component name; `$r.square` only worked through partial matching.
  return(summary(fit)$r.squared)
}
# Bootstrap the R-squared of y ~ x over the population data, 2000 replicates.
reps <- boot(
  data = population.data,
  statistic = rsq_function,
  R = 2000,
  formula = y ~ x
)
# Print the bootstrap summary and draw its diagnostic plot.
print(reps)
plot(reps)
# Bias-corrected and accelerated (BCa) confidence interval.
boot.ci(reps, type = "bca")
https://kkorthauer.org/fungeno2019/methylation/vignettes/1-binomial-regression.html https://www.r-bloggers.com/2011/03/how-to-binomial-regression-models-in-r/
# Binomial regression of successes (y) and failures (cov - y) on x.
# NOTE(review): `cov` is only assigned further down (cov <- 10), so y, cov and x
# have to exist before this line can run.
model0 <- glm(formula = cbind(y, cov - y) ~ x, family = "binomial")
summary(model0)
library(dplyr)
library(broom)
library(purrr)
set.seed(10)
n <- 50
cov <- 10
B <- 1000
x <- c(rep(0,n/2), rep(1, n/2))
# functions to run one replicate for each model
# One simulated replicate under model m0: success probability 0.4 for x = 0 and
# 0.6 for x = 1, binomial counts out of `cov` trials, then a binomial GLM.
# Reads the globals n and cov; returns the tidy coefficient table of the fit
# with confidence intervals.
m0_rep <- function(){
  sim <- data.frame(x = c(rep(0, n/2), rep(1, n/2)))
  sim <- mutate(sim, p = 0.4 + 0.2*x, cov = cov)
  sim <- mutate(sim, y = rbinom(n, cov, p))
  do(sim, tidy(glm(cbind(y, cov-y) ~ x, family="binomial", data=.),
               conf.int = TRUE))
}
# One simulated replicate under model m1: each observation gets its own success
# probability, drawn from Beta(4, 6) when x = 0 and Beta(6, 4) when x = 1
# (capped at 1), then binomial counts out of `cov` trials and a binomial GLM.
# Reads the globals n and cov; returns the tidy coefficient table of the fit
# with confidence intervals.
m1_rep <- function(){
  sim <- data.frame(x = c(rep(0, n/2), rep(1, n/2)))
  sim <- mutate(sim,
                p = pmin((1-x)*rbeta(n,4,6) + x*rbeta(n,6,4), 1),
                cov = cov)
  sim <- mutate(sim, y = rbinom(n, cov, p))
  do(sim, tidy(glm(cbind(y, cov-y) ~ x, family="binomial", data=.),
               conf.int = TRUE))
}
# Run each simulation B times and stack the coefficient tables. Every replicate
# contributes two rows, so `n` numbers the replicates 1, 1, 2, 2, ..., B, B.
m0_all <- mutate(
  do.call("rbind", replicate(B, m0_rep(), simplify = FALSE)),
  model = "m0",
  n = rep(1:B, each = 2)
)
m1_all <- mutate(
  do.call("rbind", replicate(B, m1_rep(), simplify = FALSE)),
  model = "m1",
  n = rep(1:B, each = 2)
)
# ggplot() is called below, but this section had only loaded dplyr, broom and
# purrr; load ggplot2 here so the plots do not depend on an earlier session.
library(ggplot2)
# combine and pull out relevant info
all <- rbind(m0_all, m1_all)
x <- filter(all, term == "x")
# Linear predictor at x = 1: intercept + slope, summed per model and replicate.
prob1 <- all %>% group_by(model,n) %>%
  summarize(p = sum(estimate))
# Linear predictor at x = 0: the intercept alone.
prob0 <- all %>% filter(term == "(Intercept)") %>%
  mutate(p = estimate)
# exp(p) / (1 + exp(p)) converts the logit back to a probability; the dashed
# lines mark the simulated values 0.4 (x = 0) and 0.6 (x = 1).
prob0 %>% ggplot(aes(x = model, y = exp(p) / (1+exp(p)))) +
  geom_boxplot() +
  ylab("p estimate for x=0") +
  geom_hline(yintercept=0.4, linetype="dashed", size=0.8, color = "blue")
prob1 %>% ggplot(aes(x = model, y = exp(p) / (1+exp(p)))) +
  geom_boxplot() +
  ylab("p estimate for x=1") +
  geom_hline(yintercept=0.6, linetype="dashed", size=0.8, color = "blue")
library(dplyr)
library(reshape2)
library(ggplot2)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/Toxic.Guides")
# Headerless search-hit table; the columns are listed on the next line.
df <- read.delim("Miniprep_Gsearch.txt", header = FALSE, sep = "\t")
# 'query', 'target', '%id', 'alnlen', 'mism', 'gaps'
nrow(df)
# 16186553
# Number of hits per target (V2), then one row per target with its count.
df.count <- mutate(group_by(df, V2), count = n())
df.count.uniq <- unique(df.count[, c(2, 7)])
miniprep <- read.delim("20210730_10K_reads.csv", header = TRUE, sep = ",")
nrow(miniprep)
# 9998
sum(miniprep[["Miniprep_Gsearch"]])
# 16186553
bootstrap <- read.delim("20210907_bootstrap_replacement.csv", header = TRUE, sep = ",")
# 9998
# Count distributions in bins of width 100.
hist(df.count.uniq$count, breaks = seq(0, max(df.count.uniq$count) + 100, by = 100))
hist(miniprep$Miniprep_Gsearch, breaks = seq(0, max(miniprep$Miniprep_Gsearch) + 100, by = 100))
hist(bootstrap$Miniprep_Rep1, breaks = seq(0, max(bootstrap$Miniprep_Rep1) + 100, by = 100))
# Bootstrap the hit table: draw 10,000 target ids (column V2) with replacement
# 1000 times and count how often each id occurs in every draw.
# Seeded so the resampling can be reproduced.
set.seed(2021)
# Convert once, outside the loop; as.character() also keeps the ids as strings
# if read.delim() returned V2 as a factor (replicate() would otherwise collapse
# a factor to its integer codes).
target.ids <- as.character(df$V2)
bootstrap.data <- as.data.frame(replicate(1000, sample(target.ids, size=10000, replace=TRUE)), stringsAsFactors = FALSE)
column.count <- bootstrap.data %>% count(bootstrap.data$V1)
colnames(column.count) <- c("id", "V1")
# Add one count column per remaining replicate, keeping ids seen in any draw.
for (i in 2:1000) {
  new.column.count <- bootstrap.data %>% count(bootstrap.data[,i])
  colnames(new.column.count) <- c("id", paste0("V",i))
  count.join <- full_join(column.count, new.column.count, by="id")
  column.count <- count.join
}
# An id missing from a replicate was drawn zero times in it.
column.count[is.na(column.count)] <- 0
df.id <- data.frame(id = unique(target.ids), stringsAsFactors = FALSE)
boostrap.data.count <- left_join(df.id, column.count, by="id")
# Guide id and miniprep read count (columns 1 and 3 of the miniprep table).
miniprep.data <- miniprep[, c(1, 3)]
names(miniprep.data) <- c("id", "miniprep.count")
summary(miniprep.data[["miniprep.count"]])
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0 1313 1560 1619 1847 34711
# Attach the bootstrap counts and regress the first replicate on the miniprep count.
boostrap.miniprep <- left_join(x = miniprep.data, y = boostrap.data.count, by = "id")
model_test <- lm(formula = V1 ~ miniprep.count, data = boostrap.miniprep)
# Residuals:
# Min 1Q Median 3Q Max
# -2.0319 -0.4946 -0.3667 0.4909 4.5476
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -2.045e-02 2.152e-02 -0.95 0.342
# miniprep.count 3.221e-04 1.254e-05 25.68 <2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.7115 on 9995 degrees of freedom
# (1 observation deleted due to missingness)
# Multiple R-squared: 0.06188, Adjusted R-squared: 0.06179
# F-statistic: 659.3 on 1 and 9995 DF, p-value: < 2.2e-16
# Treat missing counts as zero, average columns 3-1002 into a per-guide
# bootstrap mean, and regress that mean on the miniprep count.
boostrap.miniprep[is.na(boostrap.miniprep)] <- 0
boostrap.miniprep[["bootstrap.mean"]] <- rowMeans(boostrap.miniprep[, seq(3, 1002)])
model_mean <- lm(formula = bootstrap.mean ~ miniprep.count, data = boostrap.miniprep)
# Residuals:
# Min 1Q Median 3Q Max
# -0.12331 -0.01790 -0.00013 0.01792 0.40603
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -8.543e-03 8.455e-04 -10.1 <2e-16 ***
# miniprep.count 3.150e-04 4.928e-07 639.1 <2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.02795 on 9995 degrees of freedom
# (1 observation deleted due to missingness)
# Multiple R-squared: 0.9761, Adjusted R-squared: 0.9761
# F-statistic: 4.084e+05 on 1 and 9995 DF, p-value: < 2.2e-16
# Append the intercept and slope of model_mean to the running coefficient vectors.
sample_coef_intercept <- append(sample_coef_intercept, coef(model_mean)[1])
sample_coef_x1 <- append(sample_coef_x1, coef(model_mean)[2])
coefs <- rbind(sample_coef_intercept, sample_coef_x1)
# Histograms of the miniprep counts (bin width 100) and bootstrap means (0.1).
hist(boostrap.miniprep$miniprep.count, breaks = seq(0, max(boostrap.miniprep$miniprep.count) + 100, by = 100))
hist(boostrap.miniprep$bootstrap.mean, breaks = seq(0, max(boostrap.miniprep$bootstrap.mean) + 1, by = 0.1))
library(ggplot2)
library(reshape2)
# Long format (one row per guide and column), one violin per column.
boostrap.miniprep.melt <- melt(boostrap.miniprep, id = "id")
ggplot(data = boostrap.miniprep.melt, mapping = aes(x = variable, y = value)) +
  geom_violin()
# Same plot without column 2 (the miniprep count).
boostrap.melt <- melt(boostrap.miniprep[, c(1, 3:ncol(boostrap.miniprep))], id = "id")
ggplot(data = boostrap.melt, mapping = aes(x = variable, y = value)) +
  geom_violin()
# test with the experimental conditions
# Keep column 1 plus columns 3-23 of `miniprep` and rename gRNA_name to id so
# the table can be joined with the bootstrap counts.
experimental.data <- miniprep[,c(1,3:23)]
names(experimental.data)[names(experimental.data) == 'gRNA_name'] <- 'id'
# Names of columns 2-22, i.e. every kept column except the first.
experimental.names <- names(experimental.data[,2:22])
# Row mean over every column of the bootstrap count table except column 1.
boostrap.data.count$bootstrap.mean <- rowMeans(boostrap.data.count[,2:ncol(boostrap.data.count)])
# Columns 1 and 1002 of the count table.
# NOTE(review): 1002 is presumably the bootstrap.mean column just added
# (column 1 + 1000 replicate columns + mean) - confirm the table width.
boostrap.mean <- boostrap.data.count[,c(1,1002)]
bootstrap.experimental <- left_join(experimental.data, boostrap.mean, by="id")
# Missing values are set to 0 first, so complete.cases() below keeps every row.
bootstrap.experimental[is.na(bootstrap.experimental)] <- 0
bootstrap.experimental.complete<-bootstrap.experimental[complete.cases(bootstrap.experimental),]
# Regress one experimental column on the bootstrap mean (fitted on the
# zero-filled table, not on the .complete copy).
CJ019.C1.8h_Gsearch_bootstrap <- lm(CJ019.C1.8h_Gsearch ~ bootstrap.mean, data = bootstrap.experimental)
# Residuals:
# Min 1Q Median 3Q Max
# -2833.26 -180.41 6.47 190.05 2506.13
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 59.302 9.048 6.554 5.88e-11 ***
# bootstrap.mean 3403.140 16.976 200.466 < 2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 307.1 on 9996 degrees of freedom
# Multiple R-squared: 0.8008, Adjusted R-squared: 0.8008
# F-statistic: 4.019e+04 on 1 and 9996 DF, p-value: < 2.2e-16
library(ggplot2)
library(reshape2)
# Reshape to long format (everything except column 2, keyed by id) and draw
# one violin per remaining column.
bootstrap.experimental.melt <- melt(
  bootstrap.experimental.complete[, -2],
  id = "id"
)
ggplot(bootstrap.experimental.melt, aes(variable, value)) +
  geom_violin()
# Fit one linear model per experimental column (response) against
# bootstrap.mean and collect the intercepts and slopes.
sample_coef_intercept <- NULL
sample_coef_x1 <- NULL
for (i in experimental.names) {
  # `i` holds a column NAME. The formula has to be built from that name:
  # writing `i ~ bootstrap.mean` would use the character string itself as the
  # response instead of the column it refers to.
  model_bootstrap <- lm(reformulate("bootstrap.mean", response = i), data = bootstrap.experimental.complete)
  sample_coef_intercept <- c(sample_coef_intercept, model_bootstrap$coefficients[1])
  sample_coef_x1 <- c(sample_coef_x1, model_bootstrap$coefficients[2])
}
# Row 1 = intercepts, row 2 = slopes; one column per experimental column,
# labelled with that column's name.
coefs <- rbind(sample_coef_intercept, sample_coef_x1)
colnames(coefs) <- experimental.names
Ultimate goal: look at how the error changes for different subsample sizes.
How often, given a number of sampling events, do you see guides with zero counts?
Two main steps: transformation and sequencing.
Sequence at 200-fold coverage, 1600-fold coverage,
transform at 20-50 fold coverage.
Test subsampling from 20-fold to 200-fold.
Founder effects and toxic guides.
Distribution across subsampling.
Variance of each guide for each subsampling (should get larger for smaller subsamples… prop 1 - prop 2).
Guides with zero counts.
Column 1: original reads; column 2: subsample <– proportion of reads relative to the whole.
# Fix the random seed so the bootstrap draws below are reproducible.
set.seed(2458)
library(dplyr)
library(reshape2)
library(ggplot2)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/Toxic.Guides")
# Headerless tab-separated table; columns are auto-named V1, V2, ...
df <- read.delim("Miniprep_Gsearch.txt", header=F, sep="\t")
# 'query', 'target', '%id', 'alnlen', 'mism', 'gaps'
nrow(df)
# 16186553
# Add a `count` column holding the number of rows that share each V2 value.
df.count <- df %>% group_by(V2) %>% mutate(count = n())
# One row per distinct (column 2, column 7) pair.
# NOTE(review): column 7 is presumably the `count` column added above, which
# requires the input file to have exactly six columns - confirm.
df.count.uniq <- unique(df.count[,c(2,7)])
miniprep <- read.delim("20210730_10K_reads.csv", header=T, sep=",")
nrow(miniprep)
# 9998
sum(miniprep$Miniprep_Gsearch)
# Histograms (bin width 100) of the counts derived from df and of the
# Miniprep_Gsearch column of the CSV.
hist(df.count.uniq$count,breaks=seq(0,max(df.count.uniq$count)+100,100))
hist(miniprep$Miniprep_Gsearch,breaks=seq(0,max(miniprep$Miniprep_Gsearch)+100,100))
# Over-sampling bootstrap: for each depth draw 100 replicates (columns
# V1..V100) from df$V2, sampling with replacement.
# Replicate sizes: 200,000 / 500,000 / 1,000,000 / 2,000,000 / 5,000,000.
# All draws are made before any counting, in this order, so the results depend
# only on the seed and on this sequence of calls.
bootstrap.20 <- as.data.frame(replicate(100,sample(df$V2, size=200000, replace=TRUE)))
bootstrap.50 <- as.data.frame(replicate(100,sample(df$V2, size=500000, replace=TRUE)))
bootstrap.100 <- as.data.frame(replicate(100,sample(df$V2, size=1000000, replace=TRUE)))
bootstrap.200 <- as.data.frame(replicate(100,sample(df$V2, size=2000000, replace=TRUE)))
bootstrap.500 <- as.data.frame(replicate(100,sample(df$V2, size=5000000, replace=TRUE)))

# Count how often each id occurs in every replicate column of `samples`.
# Returns one row per id seen in at least one replicate, with columns
# id, V1, V2, ...; an id missing from a replicate gets 0 for that replicate.
# dplyr::count is namespace-qualified because matrixStats also exports count().
count.bootstrap.replicates <- function(samples) {
  per.replicate <- lapply(seq_len(ncol(samples)), function(i) {
    counts <- samples %>% dplyr::count(samples[, i])
    colnames(counts) <- c("id", paste0("V", i))
    counts
  })
  joined <- Reduce(function(acc, nxt) full_join(acc, nxt, by = "id"), per.replicate)
  joined[is.na(joined)] <- 0
  joined
}

# Align a replicate count table to the full id list and append the row mean
# over all replicate columns as `bootstrap.mean`.
# ids that were never drawn in any replicate are absent from `count.table`;
# after the left join they are NA, so they are set to 0 here - otherwise the
# row mean (and max()/hist() on it) would be NA.
mean.count.per.id <- function(ids, count.table) {
  out <- left_join(ids, count.table, by = "id")
  count.cols <- setdiff(names(out), "id")
  out[count.cols][is.na(out[count.cols])] <- 0
  out$bootstrap.mean <- rowMeans(out[, count.cols, drop = FALSE])
  out
}

# Every distinct id present in the read-level table.
df.id <- data.frame(id = unique(df$V2))

bootstrap.20.count <- count.bootstrap.replicates(bootstrap.20)
boostrap.20.count.df <- mean.count.per.id(df.id, bootstrap.20.count)
hist(boostrap.20.count.df$bootstrap.mean,breaks=seq(0,max(boostrap.20.count.df$bootstrap.mean)+1,1))

bootstrap.50.count <- count.bootstrap.replicates(bootstrap.50)
boostrap.50.count.df <- mean.count.per.id(df.id, bootstrap.50.count)
hist(boostrap.50.count.df$bootstrap.mean,breaks=seq(0,max(boostrap.50.count.df$bootstrap.mean)+1,1))

bootstrap.100.count <- count.bootstrap.replicates(bootstrap.100)
boostrap.100.count.df <- mean.count.per.id(df.id, bootstrap.100.count)
hist(boostrap.100.count.df$bootstrap.mean,breaks=seq(0,max(boostrap.100.count.df$bootstrap.mean)+1,1))

bootstrap.200.count <- count.bootstrap.replicates(bootstrap.200)
boostrap.200.count.df <- mean.count.per.id(df.id, bootstrap.200.count)
hist(boostrap.200.count.df$bootstrap.mean,breaks=seq(0,max(boostrap.200.count.df$bootstrap.mean)+1,1))

bootstrap.500.count <- count.bootstrap.replicates(bootstrap.500)
boostrap.500.count.df <- mean.count.per.id(df.id, bootstrap.500.count)
hist(boostrap.500.count.df$bootstrap.mean,breaks=seq(0,max(boostrap.500.count.df$bootstrap.mean)+1,1))
# Put the per-id bootstrap means (columns 1 and 102 of each depth table) side
# by side, one depth at a time.
bootstrap.20.50 <- boostrap.20.count.df[, c(1, 102)] %>%
  full_join(boostrap.50.count.df[, c(1, 102)], by = "id")
bootstrap.20.50.100 <- bootstrap.20.50 %>%
  full_join(boostrap.100.count.df[, c(1, 102)], by = "id")
bootstrap.20.50.100.200 <- bootstrap.20.50.100 %>%
  full_join(boostrap.200.count.df[, c(1, 102)], by = "id")
bootstrap.all <- bootstrap.20.50.100.200 %>%
  full_join(boostrap.500.count.df[, c(1, 102)], by = "id")
colnames(bootstrap.all) <- c("id", paste0("bootstrap.", c("20", "50", "100", "200", "500"), "fold"))
# Columns 1 and 3 of the miniprep table, renamed to id / miniprep.
miniprep.count <- setNames(miniprep[, c(1, 3)], c("id", "miniprep"))
bootstrap.miniprep <- miniprep.count %>%
  full_join(bootstrap.all, by = "id")
library(reshape2)
library(ggplot2)
# Long format, then one density panel per column, x axis limited to 0-3000.
bootstrap.miniprep.melt <- melt(bootstrap.miniprep, id = "id")
ggplot(bootstrap.miniprep.melt, aes(x = value, color = variable)) +
  geom_density() +
  theme_classic() +
  facet_grid(variable ~ ., scales = "free") +
  xlim(0, 3000)
# calculate the number of zeros in each bootstrap
#mini.zero <- colSums(miniprep.count==0)/nrow(miniprep.count)*100
# Per-column number of cells equal to 0. Element 1 of each vector belongs to
# the id column; elements 2-101 belong to replicates V1..V100.
# NOTE(review): ids never drawn in ANY replicate are not rows of the
# bootstrap.N.count tables, so they are not counted as zeros here.
mini.zero <- colSums(miniprep.count==0)
b20.zero <- colSums(bootstrap.20.count==0)
b50.zero <- colSums(bootstrap.50.count==0)
b100.zero <- colSums(bootstrap.100.count==0)
b200.zero <- colSums(bootstrap.200.count==0)
b500.zero <- colSums(bootstrap.500.count==0)
zero.count.df <- data.frame(b20=b20.zero, b50=b50.zero, b100=b100.zero, b200=b200.zero, b500=b500.zero)
# Summaries over rows 2-101 (the replicates), one value per depth.
# colSds()/colVars() are called through the matrixStats namespace because
# nothing has attached that package at this point in the section.
zero.count.mean <- colMeans(zero.count.df[2:101,])
zero.count.sd <- matrixStats::colSds(as.matrix(zero.count.df[2:101,]))
zero.count.var <- matrixStats::colVars(as.matrix(zero.count.df[2:101,]))
mini.zero
zero.count.mean
zero.count.sd
zero.count.var
zero.count.stats <- data.frame(mean.count = zero.count.mean, sd = zero.count.sd, var = zero.count.var)
# Zero count of the miniprep column, taken from mini.zero instead of a
# hand-typed constant; a single library has no sd/var.
miniprep.stats <- data.frame(mean.count = mini.zero[["miniprep"]], sd = NA, var = NA)
rownames(miniprep.stats) <- "miniprep"
zero.stats.all <- rbind(zero.count.stats, miniprep.stats)
# NOTE(review): row.names=F drops the depth/miniprep labels, which exist only
# as row names - confirm that unlabeled rows are what is wanted.
write.table(zero.stats.all, "miniprep.bootstrap.zero.count.txt", quote=F, row.names=F)
# calculate the proportion of each sgRNA in each subsample
# then take the mean, variance, sd of each sgRNA
#install.packages("expss")
library(expss)
library(dplyr)
library(matrixStats)
# prop_col() divides every replicate column by that column's own total, giving
# the proportion of each id WITHIN each subsample. prop() would divide by the
# grand total of all 100 columns, which shrinks every proportion by the number
# of replicates and makes it incomparable with the single-column miniprep
# proportion computed further down.
# For each depth: proportions -> re-attach ids -> per-id mean/sd/var over the
# 100 replicates -> keep id plus the three statistics.
bootstrap.20.prop <- prop_col(bootstrap.20.count[,2:101])
bootstrap.20.prop.df <- cbind(bootstrap.20.count[,1], bootstrap.20.prop)
bootstrap.20.prop.stats <- bootstrap.20.prop.df %>% replace(is.na(.), 0) %>% mutate(b20.mean = rowMeans(as.matrix(bootstrap.20.prop.df[,2:101])), b20.sd = rowSds(as.matrix(bootstrap.20.prop.df[,2:101])), b20.var = rowVars(as.matrix(bootstrap.20.prop.df[,2:101])))
stats.20 <- bootstrap.20.prop.stats[,c(1,102:104)]
colnames(stats.20) <- c("id", "b20.mean", "b20.sd", "b20.var")
bootstrap.50.prop <- prop_col(bootstrap.50.count[,2:101])
bootstrap.50.prop.df <- cbind(bootstrap.50.count[,1], bootstrap.50.prop)
bootstrap.50.prop.stats <- bootstrap.50.prop.df %>% replace(is.na(.), 0) %>% mutate(b50.mean = rowMeans(as.matrix(bootstrap.50.prop.df[,2:101])), b50.sd = rowSds(as.matrix(bootstrap.50.prop.df[,2:101])), b50.var = rowVars(as.matrix(bootstrap.50.prop.df[,2:101])))
stats.50 <- bootstrap.50.prop.stats[,c(1,102:104)]
colnames(stats.50) <- c("id", "b50.mean", "b50.sd", "b50.var")
bootstrap.100.prop <- prop_col(bootstrap.100.count[,2:101])
bootstrap.100.prop.df <- cbind(bootstrap.100.count[,1], bootstrap.100.prop)
bootstrap.100.prop.stats <- bootstrap.100.prop.df %>% replace(is.na(.), 0) %>% mutate(b100.mean = rowMeans(as.matrix(bootstrap.100.prop.df[,2:101])), b100.sd = rowSds(as.matrix(bootstrap.100.prop.df[,2:101])), b100.var = rowVars(as.matrix(bootstrap.100.prop.df[,2:101])))
stats.100 <- bootstrap.100.prop.stats[,c(1,102:104)]
colnames(stats.100) <- c("id", "b100.mean", "b100.sd", "b100.var")
bootstrap.200.prop <- prop_col(bootstrap.200.count[,2:101])
bootstrap.200.prop.df <- cbind(bootstrap.200.count[,1], bootstrap.200.prop)
bootstrap.200.prop.stats <- bootstrap.200.prop.df %>% replace(is.na(.), 0) %>% mutate(b200.mean = rowMeans(as.matrix(bootstrap.200.prop.df[,2:101])), b200.sd = rowSds(as.matrix(bootstrap.200.prop.df[,2:101])), b200.var = rowVars(as.matrix(bootstrap.200.prop.df[,2:101])))
stats.200 <- bootstrap.200.prop.stats[,c(1,102:104)]
colnames(stats.200) <- c("id", "b200.mean", "b200.sd", "b200.var")
bootstrap.500.prop <- prop_col(bootstrap.500.count[,2:101])
bootstrap.500.prop.df <- cbind(bootstrap.500.count[,1], bootstrap.500.prop)
bootstrap.500.prop.stats <- bootstrap.500.prop.df %>% replace(is.na(.), 0) %>% mutate(b500.mean = rowMeans(as.matrix(bootstrap.500.prop.df[,2:101])), b500.sd = rowSds(as.matrix(bootstrap.500.prop.df[,2:101])), b500.var = rowVars(as.matrix(bootstrap.500.prop.df[,2:101])))
stats.500 <- bootstrap.500.prop.stats[,c(1,102:104)]
colnames(stats.500) <- c("id", "b500.mean", "b500.sd", "b500.var")
# Proportion of each id in the miniprep column (a single vector, so prop()
# divides by that vector's total).
miniprep.prop <- prop(miniprep.count[,2])
miniprep.prop.df <- data.frame("id"=miniprep.count[,1], "miniprep.mean"=miniprep.prop)
# Attach the per-depth statistics to the miniprep proportions, one depth at a time.
stats.miniprep.20 <- left_join(miniprep.prop.df, stats.20, by="id")
stats.miniprep.20.50 <- left_join(stats.miniprep.20, stats.50, by="id")
stats.miniprep.20.50.100 <- left_join(stats.miniprep.20.50, stats.100, by="id")
stats.miniprep.20.50.100.200 <- left_join(stats.miniprep.20.50.100, stats.200, by="id")
stats.all <- left_join(stats.miniprep.20.50.100.200, stats.500, by="id")
write.table(stats.all, "miniprep.bootstrap.stats.txt", quote=F, row.names=F, sep="\t")
# Mean columns only: miniprep.mean, b20.mean, b50.mean, b100.mean, b200.mean, b500.mean.
stats.mean <- stats.all[,c(2,3,6,9,12,15)]
# Compare the miniprep proportions with the 20-fold bootstrap means. The two
# columns are passed as separate samples; handing t.test() the two-column data
# frame would pool all values into one one-sample test against 0.
# NOTE(review): rows are matched by id, so paired=TRUE may be the intended test - confirm.
t.test(stats.mean[,1], stats.mean[,2])
–> What happens if you under-sample? Look at 1x, 0.8x, and 0.5x.
# Seed the RNG so the under-sampling draws are reproducible.
set.seed(2458)
library(dplyr)
library(reshape2)
library(ggplot2)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/Toxic.Guides")
# Headerless, tab-separated input; columns come back as V1, V2, ...
df <- read.delim("Miniprep_Gsearch.txt", header = FALSE, sep = "\t")
# 'query', 'target', '%id', 'alnlen', 'mism', 'gaps'
nrow(df)
# 16186553
# Tag every row with the number of rows sharing its V2 value, then keep one
# row per distinct (column 2, column 7) pair.
df.count <- mutate(group_by(df, V2), count = n())
df.count.uniq <- unique(df.count[, c(2, 7)])
miniprep <- read.delim("20210730_10K_reads.csv", header = TRUE, sep = ",")
nrow(miniprep)
# 9998
sum(miniprep$Miniprep_Gsearch)
# 16186553
# Histograms with bin width 100.
hist(df.count.uniq$count, breaks = seq(from = 0, to = max(df.count.uniq$count) + 100, by = 100))
hist(miniprep$Miniprep_Gsearch, breaks = seq(from = 0, to = max(miniprep$Miniprep_Gsearch) + 100, by = 100))
# Under-sampling bootstrap: 100 replicates (columns V1..V100) drawn with
# replacement from df$V2 at 10,000, 8,000 and 5,000 draws per replicate.
# All draws are made before any counting, in this order, so the results depend
# only on the seed and on this sequence of calls.
bootstrap.1 <- as.data.frame(replicate(100,sample(df$V2, size=10000, replace=TRUE)))
bootstrap.0.8 <- as.data.frame(replicate(100,sample(df$V2, size=(10000*0.8), replace=TRUE)))
bootstrap.0.5 <- as.data.frame(replicate(100,sample(df$V2, size=(10000*0.5), replace=TRUE)))

# Count how often each id occurs in every replicate column of `samples`.
# Returns one row per id seen in at least one replicate, with columns
# id, V1, V2, ...; an id missing from a replicate gets 0 for that replicate.
# dplyr::count is namespace-qualified because matrixStats also exports count().
count.bootstrap.replicates <- function(samples) {
  per.replicate <- lapply(seq_len(ncol(samples)), function(i) {
    counts <- samples %>% dplyr::count(samples[, i])
    colnames(counts) <- c("id", paste0("V", i))
    counts
  })
  joined <- Reduce(function(acc, nxt) full_join(acc, nxt, by = "id"), per.replicate)
  joined[is.na(joined)] <- 0
  joined
}

# Align a replicate count table to the full id list and append the row mean
# over all replicate columns as `bootstrap.mean`.
# ids that were never drawn in any replicate are absent from `count.table`;
# after the left join they are NA, so they are set to 0 here - otherwise the
# row mean (and max()/hist() on it) would be NA and the written table would
# contain NA instead of 0.
mean.count.per.id <- function(ids, count.table) {
  out <- left_join(ids, count.table, by = "id")
  count.cols <- setdiff(names(out), "id")
  out[count.cols][is.na(out[count.cols])] <- 0
  out$bootstrap.mean <- rowMeans(out[, count.cols, drop = FALSE])
  out
}

# Every distinct id present in the read-level table.
df.id <- data.frame(id = unique(df$V2))

bootstrap.1.count <- count.bootstrap.replicates(bootstrap.1)
boostrap.1.count.df <- mean.count.per.id(df.id, bootstrap.1.count)
hist(boostrap.1.count.df$bootstrap.mean,breaks=seq(0,max(boostrap.1.count.df$bootstrap.mean)+1,0.1))
write.table(boostrap.1.count.df, "miniprep.undersample.bootstrap.1.count.txt", quote=F, row.names=F, sep="\t")

bootstrap.0.8.count <- count.bootstrap.replicates(bootstrap.0.8)
boostrap.0.8.count.df <- mean.count.per.id(df.id, bootstrap.0.8.count)
hist(boostrap.0.8.count.df$bootstrap.mean,breaks=seq(0,max(boostrap.0.8.count.df$bootstrap.mean)+1,0.1))
write.table(boostrap.0.8.count.df, "miniprep.undersample.bootstrap.0.8.count.txt", quote=F, row.names=F, sep="\t")

bootstrap.0.5.count <- count.bootstrap.replicates(bootstrap.0.5)
boostrap.0.5.count.df <- mean.count.per.id(df.id, bootstrap.0.5.count)
hist(boostrap.0.5.count.df$bootstrap.mean,breaks=seq(0,max(boostrap.0.5.count.df$bootstrap.mean)+1,0.1))
write.table(boostrap.0.5.count.df, "miniprep.undersample.bootstrap.0.5.count.txt", quote=F, row.names=F, sep="\t")
# Put the per-id bootstrap means (columns 1 and 102 of each table) side by side.
bootstrap.1.0.8 <- boostrap.1.count.df[, c(1, 102)] %>%
  full_join(boostrap.0.8.count.df[, c(1, 102)], by = "id")
bootstrap.all <- bootstrap.1.0.8 %>%
  full_join(boostrap.0.5.count.df[, c(1, 102)], by = "id")
colnames(bootstrap.all) <- c("id", paste0("bootstrap.", c("1", "0.8", "0.5"), "fold"))
# Columns 1 and 3 of the miniprep table, renamed to id / miniprep.
miniprep.count <- setNames(miniprep[, c(1, 3)], c("id", "miniprep"))
bootstrap.miniprep <- miniprep.count %>%
  full_join(bootstrap.all, by = "id")
library(reshape2)
library(ggplot2)
bootstrap.miniprep.melt <- melt(bootstrap.miniprep, id = "id")
# Drop the miniprep rows so only the three bootstrap columns are plotted.
bootstrap.miniprep.melt.nomini <- subset(bootstrap.miniprep.melt, variable != "miniprep")
# One density panel per bootstrap column, x axis limited to 0-1.
ggplot(bootstrap.miniprep.melt.nomini, aes(x = value, color = variable)) +
  geom_density() +
  theme_classic() +
  facet_grid(variable ~ ., scales = "free") +
  xlim(0, 1)
# calculate the number of zeros in each bootstrap
# matrixStats supplies colSds() and colVars() used below.
library("matrixStats")
# Per-column number of cells equal to 0. Element 1 of each vector belongs to
# the id column; elements 2-101 belong to replicates V1..V100.
# NOTE(review): ids never drawn in ANY replicate are not rows of the
# bootstrap.N.count tables, so they are not counted as zeros here.
mini.zero <- colSums(miniprep.count==0)
b1.zero <- colSums(bootstrap.1.count==0)
b0.8.zero <- colSums(bootstrap.0.8.count==0)
b0.5.zero <- colSums(bootstrap.0.5.count==0)
zero.count.df <- data.frame(b1=b1.zero, b0.8=b0.8.zero, b0.5=b0.5.zero)
# Summaries over rows 2-101 (the replicates), one value per sampling depth.
zero.count.mean <- colMeans(zero.count.df[2:101,])
zero.count.sd <- colSds(as.matrix(zero.count.df[2:101,]))
zero.count.var <- colVars(as.matrix(zero.count.df[2:101,]))
mini.zero
# id miniprep
# 0 1
zero.count.mean
# b1 b0.8 b0.5
#12249.94 13491.02 15617.89
zero.count.sd
#[1] 33.74498 28.00685 19.42294
zero.count.var
#[1] 1138.7236 784.3834 377.2504
zero.count.stats <- data.frame(mean.count = zero.count.mean, sd = zero.count.sd, var = zero.count.var)
# The miniprep row uses the value printed for mini.zero above (1 zero); a
# single library has no sd/var.
miniprep.stats <- data.frame(mean.count = 1, sd = NA, var = NA)
rownames(miniprep.stats) <- "miniprep"
zero.stats.all <- rbind(zero.count.stats, miniprep.stats)
# NOTE(review): row.names=F drops the b1/b0.8/b0.5/miniprep labels, which exist
# only as row names - confirm that unlabeled rows are what is wanted.
write.table(zero.stats.all, "miniprep.undersample.bootstrap.zero.count.txt", quote=F, row.names=F)
# calculate the proportion of each sgRNA in each subsample
# then take the mean, variance, sd of each sgRNA
library(expss)
library(dplyr)
library(matrixStats)
# prop_col() divides every replicate column by that column's own total, giving
# the proportion of each id WITHIN each subsample. prop() would divide by the
# grand total of all 100 columns, which shrinks every proportion by the number
# of replicates and makes it incomparable with the single-column miniprep
# proportion computed further down.
# For each depth: proportions -> re-attach ids -> per-id mean/sd/var over the
# 100 replicates -> keep id plus the three statistics.
bootstrap.1.prop <- prop_col(bootstrap.1.count[,2:101])
bootstrap.1.prop.df <- cbind(bootstrap.1.count[,1], bootstrap.1.prop)
bootstrap.1.prop.stats <- bootstrap.1.prop.df %>% replace(is.na(.), 0) %>% mutate(b1.mean = rowMeans(as.matrix(bootstrap.1.prop.df[,2:101])), b1.sd = rowSds(as.matrix(bootstrap.1.prop.df[,2:101])), b1.var = rowVars(as.matrix(bootstrap.1.prop.df[,2:101])))
stats.1 <- bootstrap.1.prop.stats[,c(1,102:104)]
colnames(stats.1) <- c("id", "b1.mean", "b1.sd", "b1.var")
bootstrap.0.8.prop <- prop_col(bootstrap.0.8.count[,2:101])
bootstrap.0.8.prop.df <- cbind(bootstrap.0.8.count[,1], bootstrap.0.8.prop)
bootstrap.0.8.prop.stats <- bootstrap.0.8.prop.df %>% replace(is.na(.), 0) %>% mutate(b0.8.mean = rowMeans(as.matrix(bootstrap.0.8.prop.df[,2:101])), b0.8.sd = rowSds(as.matrix(bootstrap.0.8.prop.df[,2:101])), b0.8.var = rowVars(as.matrix(bootstrap.0.8.prop.df[,2:101])))
stats0.8 <- bootstrap.0.8.prop.stats[,c(1,102:104)]
colnames(stats0.8) <- c("id", "b0.8.mean", "b0.8.sd", "b0.8.var")
bootstrap.0.5.prop <- prop_col(bootstrap.0.5.count[,2:101])
bootstrap.0.5.prop.df <- cbind(bootstrap.0.5.count[,1], bootstrap.0.5.prop)
bootstrap.0.5.prop.stats <- bootstrap.0.5.prop.df %>% replace(is.na(.), 0) %>% mutate(b0.5.mean = rowMeans(as.matrix(bootstrap.0.5.prop.df[,2:101])), b0.5.sd = rowSds(as.matrix(bootstrap.0.5.prop.df[,2:101])), b0.5.var = rowVars(as.matrix(bootstrap.0.5.prop.df[,2:101])))
stats0.5 <- bootstrap.0.5.prop.stats[,c(1,102:104)]
colnames(stats0.5) <- c("id", "b0.5.mean", "b0.5.sd", "b0.5.var")
# Proportion of each id in the miniprep column (a single vector, so prop()
# divides by that vector's total).
miniprep.prop <- prop(miniprep.count[,2])
miniprep.prop.df <- data.frame("id"=miniprep.count[,1], "miniprep.mean"=miniprep.prop)
# Attach the per-depth statistics to the miniprep proportions, one depth at a time.
stats.miniprep.1 <- left_join(miniprep.prop.df, stats.1, by="id")
stats.miniprep.1.0.8 <- left_join(stats.miniprep.1, stats0.8, by="id")
stats.all <- left_join(stats.miniprep.1.0.8, stats0.5, by="id")
write.table(stats.all, "miniprep.undersample.bootstrap.stats.txt", quote=F, row.names=F, sep="\t")
# Shell: source the conda loader script, then activate the shared "test" environment.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Build the putida guide table (id, score, sequence) and the subset with
# cut.score below -3.
# setwd() needs the DIRECTORY that holds the input files; pointing it at the
# CSV file itself makes setwd() fail and the relative reads below miss.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/putida")
id <- read.delim("Lib1_Cas9_library_database.csv", header=T, sep=",", stringsAsFactors = F)
data <- read.delim("deseq2_lib_vs_delta_tf.csv", header=T, sep=",", stringsAsFactors = F)
library(tidyverse)
library(dplyr)
# All runs of upper-case letters found in each gRNA value; str_extract_all()
# returns a list with one character vector per row.
id$sgRNAID <- str_extract_all(id$gRNA, "[A-Z]+")
# NOTE(review): this renames all three columns, so the extracted upper-case
# runs end up in the column called nucleotide.sequence - confirm that the
# library file has exactly two columns and that this naming is intended.
colnames(id) <- c("gRNA", "seq", "nucleotide.sequence")
# Attach columns 1 and 3 of `id` to the DESeq2 table by gRNA.
data.id <- left_join(data, id[,c(1,3)], by="gRNA")
# Keep columns 1, 3 and 8 and give them the names used by later scripts.
df <- data.id[,c(1,3,8)]
colnames(df) <- c("sgRNAID", "cut.score", "nucleotide.sequence")
# Coerce to plain character/numeric columns before writing.
df.mat <- data.frame(sgRNAID = as.character(df$sgRNAID), cut.score = as.numeric(df$cut.score), nucleotide.sequence = as.character(df$nucleotide.sequence))
write.table(df.mat, "putida.delta.txt", quote=F, row.names=F, sep="\t")
# Rows with cut.score below -3 are written out as the "toxic" set.
toxic <- subset(df.mat, df.mat$cut.score < -3)
write.table(toxic, "putida.toxic.txt", quote=F, row.names=F, sep="\t")
# scp /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/ExploratoryDataForModelGeneration/putida/putida.toxic.txt noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/
# Toxic-guide table -> FASTA: skip the header row, column 1 becomes the record
# name and column 3 the sequence.
awk 'NR > 1 { printf ">%s\n%s\n", $1, $3 }' putida.toxic.txt > putida.toxic.fasta
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida
# Shorthand for the blastn binary and the genome database used by every run below.
blastn_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/ncbi-blast-2.11.0+/bin/blastn
genome_db=GCF_000412675.1_ASM41267v1_genomic.fna
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/ncbi-blast-2.11.0+/bin/makeblastdb -in GCF_000412675.1_ASM41267v1_genomic.fna -dbtype nucl
# Toxic guides against the genome, tabular output, no e-value option.
"${blastn_bin}" -query putida.toxic.fasta -db "${genome_db}" -out putida.toxic.gRNA.blast.tab -outfmt 6 -task blastn -num_threads 10
# 614 putida.toxic.fasta
# Rewrite the hits as 4-column records (column 2, smaller of columns 9/10,
# larger of columns 9/10, column 1). Rows where column 9 > column 10 come
# first; rows where the two columns are equal are dropped.
awk 'BEGIN { OFS = "\t" } $9 > $10 { print $2, $10, $9, $1 }' putida.toxic.gRNA.blast.tab > tmp1.bed
awk 'BEGIN { OFS = "\t" } $10 > $9 { print $2, $9, $10, $1 }' putida.toxic.gRNA.blast.tab > tmp2.bed
# The combined records replace the original tabular BLAST output.
cat tmp1.bed tmp2.bed > putida.toxic.gRNA.blast.tab
# 21429 putida.toxic.gRNA.blast.tab
# Same query at three e-value cutoffs; each run overwrites putida.toxic.gRNA.blast2.tab.
"${blastn_bin}" -query putida.toxic.fasta -db "${genome_db}" -out putida.toxic.gRNA.blast2.tab -outfmt 6 -evalue 0.001 -task blastn -num_threads 10
# 67 putida.toxic.gRNA.blast2.tab
"${blastn_bin}" -query putida.toxic.fasta -db "${genome_db}" -out putida.toxic.gRNA.blast2.tab -outfmt 6 -evalue 0.01 -task blastn -num_threads 10
# 126 putida.toxic.gRNA.blast2.tab
"${blastn_bin}" -query putida.toxic.fasta -db "${genome_db}" -out putida.toxic.gRNA.blast2.tab -outfmt 6 -evalue 0.05 -task blastn -num_threads 10
# 171 putida.toxic.gRNA.blast2.tab
# Full guide set: once with -evalue 0.001, once without an e-value option.
"${blastn_bin}" -query putida.fasta -db "${genome_db}" -out putida.gRNA.blast.tab -outfmt 6 -evalue 0.001 -task blastn -num_threads 10
# 299998 putida.fasta
# 28971 putida.gRNA.blast.tab
"${blastn_bin}" -query putida.fasta -db "${genome_db}" -out putida.gRNA.blast2.tab -outfmt 6 -task blastn -num_threads 10
# 11053646 putida.gRNA.blast2.tab
# R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
# Toxic guides: number of rows per value of column V4 in the reformatted hit table.
guides <- read.delim("putida.toxic.txt", header = TRUE, sep = "\t")
blast <- read.delim("putida.toxic.gRNA.blast.tab", header = FALSE, sep = "\t")
blast.count <- count(group_by(blast, V4))
nrow(guides)
# 307
nrow(blast)
# 21429
nrow(blast.count)
# 307
summary(blast.count$n)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.0 20.0 48.0 69.8 91.0 570.0
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
# All guides: number of rows per value of column V1 in the tabular BLAST output.
guides <- read.delim("putida.txt", header = TRUE, sep = "\t")
blast <- read.delim("putida.gRNA.blast2.tab", header = FALSE, sep = "\t")
blast.count <- count(group_by(blast, V1))
nrow(guides)
# 149999
nrow(blast)
# 11053646
nrow(blast.count)
# 149625
summary(blast.count$n)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.00 26.00 53.00 73.88 99.00 1190.00
# Shell: switch to the emboss environment, which provides revseq.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate emboss
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida
# Write the reverse complement of every toxic-guide sequence to a new FASTA file.
revseq putida.toxic.fasta -reverse -notag -complement -outseq putida.toxic.reversecomplement.fasta
# Back to the "test" environment for the BLAST runs.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# BLAST the reverse-complemented guides, then the original guides, against the
# genome at -evalue 0.05; the trailing comments record 171 for each run.
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/ncbi-blast-2.11.0+/bin/blastn -query putida.toxic.reversecomplement.fasta -db GCF_000412675.1_ASM41267v1_genomic.fna -out putida.toxic.reversecomplement.gRNA.blast.05.tab -outfmt 6 -evalue 0.05 -task blastn -num_threads 10
# 171
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/ncbi-blast-2.11.0+/bin/blastn -query putida.toxic.fasta -db GCF_000412675.1_ASM41267v1_genomic.fna -out putida.toxic.gRNA.blast.05.tab -outfmt 6 -evalue 0.05 -task blastn -num_threads 10
# 171
# R
library(dplyr)
#setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/SEED/Toxic.Guides")
guides <- read.delim("putida.txt", header = TRUE, sep = "\t")
toxic <- read.delim("putida.toxic.txt", header = TRUE, sep = "\t")
nrow(guides)
# 149999
nrow(toxic)
# 307
# Guides whose sgRNAID does not appear in the toxic table.
no.toxic <- subset(guides, !(sgRNAID %in% toxic$sgRNAID))
nrow(no.toxic)
# 149692
# Position of the first "ATG" in each sequence (-1 when there is none).
no.toxic.ATG <- mutate(no.toxic, ATG.pos = regexpr("ATG", nucleotide.sequence))
no.toxic.ATG.present <- subset(no.toxic.ATG, ATG.pos != -1)
nrow(no.toxic.ATG.present)
# 34556 / 149692 = 0.2308473
# Same for the toxic guides.
toxic.ATG <- mutate(toxic, ATG.pos = regexpr("ATG", nucleotide.sequence))
toxic.ATG.present <- subset(toxic.ATG, ATG.pos != -1)
nrow(toxic.ATG.present)
# 198 / 307 = 0.6449511
# Where in the sequence the first ATG falls, for each group.
hist(no.toxic.ATG.present$ATG.pos)
hist(toxic.ATG.present$ATG.pos)
# view images in terminal #### NOT WORKING
# source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# imgcat file.jpg
# salloc -A SYB105 -N 2 -t 4:00:00 -p gpu
# Shell: load conda, activate the shared "test" environment, then start R.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(tidyr)
# Assemble one table per guide: cut score, sequence, structure value, and the
# two nucleotide-table columns that are named temp and gc below.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
structure <- read.delim("putida.gRNA.ViennaRNA.output.value.id.txt", header=T, sep="\t", stringsAsFactors = F)
nuc <- read.delim("putida.nucleotide_counts_sgRNA_temp.txt", header=T, sep="\t", stringsAsFactors = F)
score <- read.delim("putida.sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
# Distinct (column 5, column 7, column 6) rows of the coordinate table.
score.df <- unique(score[,c(5,7,6)])
colnames(score.df) <- c("sgRNAID", "cut.score", "nucleotide.sequence")
# structure, gc, temp
# One single-column data frame per feature. (The plain-vector versions of
# these three objects were overwritten immediately and have been removed.)
structure.df <- data.frame(structure[,2])
gc.df <- data.frame(nuc[,7])
temp.df <- data.frame(nuc[,8])
# Constant `scale` label plus the guide id (column 1 of each source table),
# used as join keys.
structure.df$scale <- "sgRNA.raw"
gc.df$scale <- "sgRNA.raw"
temp.df$scale <- "sgRNA.raw"
structure.df$sgRNAID <- structure[,1]
gc.df$sgRNAID <- nuc[,1]
temp.df$sgRNAID <- nuc[,1]
# Join order is structure, temp, gc; the column names assigned at the end rely on it.
structure.temp <- left_join(structure.df, temp.df, by=c("sgRNAID", "scale"))
structure.temp.gc <- left_join(structure.temp, gc.df, by=c("sgRNAID", "scale"))
score.structure.temp.gc <- left_join(score.df, structure.temp.gc, by=c("sgRNAID"))
colnames(score.structure.temp.gc) <- c("sgRNAID", "cut.score", "seq", "sgRNA.structure", "scale", "sgRNA.temp", "sgRNA.gc")
## add one-hot encoding of sequence
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/Chuai2018")
# Space-separated tables; the two "ind" files have a header row, the four
# "dep" files do not.
onehot.ind1 <- read.delim("putida_ind1.txt", header=T, sep=" ")
onehot.ind2 <- read.delim("putida_ind2.txt", header=T, sep=" ")
onehot.dep1 <- read.delim("putida_dep1.txt", header=F, sep=" ")
onehot.dep2 <- read.delim("putida_dep2.txt", header=F, sep=" ")
onehot.dep3 <- read.delim("putida_dep3.txt", header=F, sep=" ")
onehot.dep4 <- read.delim("putida_dep4.txt", header=F, sep=" ")
# The first column of every headerless table is used as the guide id.
colnames(onehot.dep1)[1] <- "sgRNAID"
colnames(onehot.dep2)[1] <- "sgRNAID"
colnames(onehot.dep3)[1] <- "sgRNAID"
colnames(onehot.dep4)[1] <- "sgRNAID"
onehot.ind <- full_join(onehot.ind1, onehot.ind2, by="sgRNAID")
# Drop the LAST column of each "dep" table before joining.
# seq_len(ncol(x) - 1) states that directly; `1:ncol(x)-1` evaluates to
# 0:(ncol(x)-1) and only gives the same columns because index 0 is ignored.
# NOTE(review): presumably the last column is an empty field caused by a
# trailing separator - confirm against the input files.
onehot.dep12 <- full_join(onehot.dep1[,seq_len(ncol(onehot.dep1) - 1)], onehot.dep2[,seq_len(ncol(onehot.dep2) - 1)], by="sgRNAID")
onehot.dep123 <- full_join(onehot.dep12, onehot.dep3[,seq_len(ncol(onehot.dep3) - 1)], by="sgRNAID")
onehot.dep <- full_join(onehot.dep123, onehot.dep4[,seq_len(ncol(onehot.dep4) - 1)], by="sgRNAID")
onehot <- full_join(onehot.ind, onehot.dep, by="sgRNAID")
# Same `scale` label as the structure/temp/gc table so both keys match in the join.
onehot$scale <- "sgRNA.raw"
data.onehot <- left_join(score.structure.temp.gc, onehot, by=c("sgRNAID", "scale"))
# Long format without column 3 (the sequence): one row per guide/feature pair.
df.melt <- melt(data.onehot[,c(1,2,4:ncol(data.onehot))], id=c("cut.score", "scale", "sgRNAID"))
df <- na.omit(df.melt)
# Fuse feature name and scale label into a single feature.scale key.
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# Keep only rows whose value converts to a number.
df.id$value <- as.numeric(df.id$value)
df.id <- df.id[!(is.na(df.id$value) | df.id$value==""), ]
colnames(df.id) <- c("cut.score", "feature.scale", "sgRNAID", "value")
# NOTE(review): the working directory is still .../seed/Chuai2018 here, so the
# table is written there rather than next to the putida inputs - confirm.
write.table(df.id, "putida.structure.temp.gc.onehot1to4.txt", quote=F, row.names=F, sep="\t")
# pam (distance and nucleotide)
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
# sgRNA.pam <- read.table("putida.sgRNA.closestPAM.bed", header=T, sep="\t", stringsAsFactors = F)
# sgRNA.pam.sub <- sgRNA.pam[,c(1,4,5)]
# colnames(sgRNA.pam.sub) <- c("sgRNAID", "pam.code", "pam.distance")
# sgRNA.pam.onehot <- sgRNA.pam.sub %>% mutate(PAM.A = ifelse(pam.code == "AGG" | pam.code == "CCT", 1, 0), PAM.C = ifelse(pam.code == "CGG" | pam.code == "CCG", 1, 0), PAM.T = ifelse(pam.code == "TGG" | pam.code == "CCA", 1, 0), PAM.G = ifelse(pam.code == "GGG" | pam.code == "CCC", 1, 0))
# sgRNA.pam.df <- sgRNA.pam.onehot[,c(1,3:7)]
# sgRNA.pam.id <- sgRNA.pam.df
#
# score <- read.delim("putida.sgRNA.coord.txt", header=T, sep="\t", stringsAsFactors = F)
# score.df <- score[,c(5,7)]
# colnames(score.df) <- c("sgRNAID", "cut.score")
#
# score.location <- left_join(score.df, sgRNA.pam.id, by="sgRNAID")
# score.location$scale <- 0
#
# df.melt <- melt(score.location, id=c("cut.score", "scale", "sgRNAID"))
# df <- na.omit(df.melt)
# colnames(df) <- c("cut.score", "scale", "sgRNAID", "variable", "value")
#
# df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# df.pam.dcast <- df.id %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
#
# df <- read.delim("putida.structure.temp.gc.onehot1to4.txt", header=T, sep="\t")
# df.onehot.dcast <- df %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
#
# df.onehot.pam <- left_join(df.onehot.dcast, df.pam.dcast, by=c("sgRNAID"))
#
# df.onehot.pam.na <- na.omit(df.onehot.pam)
# location relative to gene
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
sgRNA.genes <- read.table("putida.sgRNA.gene.closest.bed", header=F, sep="\t", stringsAsFactors = F)
# Column 4 is used as the guide id and column 14 as the distance to the gene.
sgRNA.genes.df <- sgRNA.genes[,c(4,14)]
colnames(sgRNA.genes.df) <- c("sgRNAID", "gene.distance")
sgRNA.genes.id <- sgRNA.genes.df
score.location <- left_join(score.df, sgRNA.genes.id, by=c("sgRNAID"))
score.location$scale <- 0
# Long format, drop missing values, then fuse feature name and scale into one key.
df.melt <- melt(score.location, id=c("cut.score", "scale", "sgRNAID"))
df <- na.omit(df.melt)
colnames(df) <- c("cut.score", "scale", "sgRNAID", "variable", "value")
df.id <- df %>% unite(feature.scale, c(variable, scale), sep = "")
# Back to wide format: one row per guide, averaging duplicate entries.
df.location.dcast <- df.id %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.location.dcast.na <- na.omit(df.location.dcast)
# The one-hot + PAM table is only produced by the commented-out PAM section
# earlier in this file. Fail with a clear message when it has not been built
# instead of a bare "object not found" inside the join.
if (!exists("df.onehot.pam.na")) {
  stop("df.onehot.pam.na is not defined: build the one-hot/PAM feature table before creating putida.raw.matrix.txt")
}
df.pam.location <- inner_join(df.location.dcast.na, df.onehot.pam.na, by=c("sgRNAID"))
nrow(df.pam.location)
# 16748
# Hard-coded column selection that leaves out columns 4 and 5916.
# NOTE(review): presumably these are the duplicated cut.score columns created
# by the joins - confirm if the feature set changes.
df.final <- df.pam.location[,c(1:3,5:5915,5917:5921)]
ncol(df.final)
# 5919
write.table(df.final, "putida.raw.matrix.txt", quote=F, row.names=F, sep="\t")
# add new DNA/RNA features to data table
# Session setup for the Monomer/Basepair/Dimer/Trimer/Tetramer steps that follow:
# load the conda environment, start R and attach the packages those steps call.
# salloc -A SYB105 -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
# unite() (used by the Dimer, Trimer and Tetramer steps) comes from tidyr, which
# this fresh R session did not attach
library(tidyr)
# Monomer
# Looks up, for every sequence position of every sgRNA, the values listed for that
# base in HL.Bond.Monomer.txt, then writes the result in wide form (one row per
# sgRNA) and in long/melted form.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/")
tensor <- read.delim("HL.Bond.Monomer.txt", header=T, sep="\t", stringsAsFactors = F)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/")
seq <- read.delim("putida.sequence.txt", header=T, sep=" ", stringsAsFactors = F)
# first column of the table holds the feature names (tensor.features is not used again in this block)
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
tensor.df <- tensor[,2:ncol(tensor)]
# transpose: the table's remaining columns become rows, the features become columns
tensor.t <- as.data.frame(t(tensor.df))
# join key: the table's original column names, matched against sequence values below
tensor.t$base <- names(tensor[,2:ncol(tensor)])
rownames(seq) <- seq[,1]
# long format: one row per sgRNA and sequence column
seq.melt <- melt(seq, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# left join: bases without a match in the table get NA feature values
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# wide format: one column per position/feature combination
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "putida.quantum.monomer.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "putida.quantum.monomer.melt.txt", quote=F, row.names=F, sep="\t")
# Basepair
# Same procedure as the Monomer step, but the lookup table is HL.Bond.Basepair.txt
# and the outputs are the putida.quantum.basepair*.txt files.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/")
tensor <- read.delim("HL.Bond.Basepair.txt", header=T, sep="\t", stringsAsFactors = F)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/")
seq <- read.delim("putida.sequence.txt", header=T, sep=" ", stringsAsFactors = F)
# first column of the table holds the feature names (tensor.features is not used again in this block)
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
tensor.df <- tensor[,2:ncol(tensor)]
# transpose: the table's remaining columns become rows, the features become columns
tensor.t <- as.data.frame(t(tensor.df))
# join key: the table's original column names, matched against sequence values below
tensor.t$base <- names(tensor[,2:ncol(tensor)])
rownames(seq) <- seq[,1]
# long format: one row per sgRNA and sequence column
seq.melt <- melt(seq, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# left join: sequence values without a match in the table get NA feature values
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# wide format: one column per position/feature combination
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "putida.quantum.basepair.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "putida.quantum.basepair.melt.txt", quote=F, row.names=F, sep="\t")
# Dimer
# Builds overlapping 2-base strings from adjacent position columns, looks each one
# up in HL.Bond.Dimer.txt and writes wide and long/melted outputs.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/")
tensor <- read.delim("HL.Bond.Dimer.txt", header=T, sep="\t", stringsAsFactors = F)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/")
seq <- read.delim("putida.sequence.txt", header=T, sep=" ", stringsAsFactors = F)
# each unite() pastes two adjacent position columns together (sep="") into a column
# named after the first of the two; only the last call (p19:p20) uses remove=T
seq.dimer <- seq %>% unite("p1", p1:p2, remove=F, sep= "") %>% unite("p2", p2:p3, remove=F, sep= "") %>% unite("p3", p3:p4, remove=F, sep= "") %>% unite("p4", p4:p5, remove=F, sep= "") %>% unite("p5", p5:p6, remove=F, sep= "") %>% unite("p6", p6:p7, remove=F, sep= "") %>% unite("p7", p7:p8, remove=F, sep= "") %>% unite("p8", p8:p9, remove=F, sep= "") %>% unite("p9", p9:p10, remove=F, sep= "") %>% unite("p10", p10:p11, remove=F, sep= "") %>% unite("p11", p11:p12, remove=F, sep= "") %>% unite("p12", p12:p13, remove=F, sep= "") %>% unite("p13", p13:p14, remove=F, sep= "") %>% unite("p14", p14:p15, remove=F, sep= "") %>% unite("p15", p15:p16, remove=F, sep= "") %>% unite("p16", p16:p17, remove=F, sep= "") %>% unite("p17", p17:p18, remove=F, sep= "") %>% unite("p18", p18:p19, remove=F, sep= "") %>% unite("p19", p19:p20, remove=T, sep= "")
# first column of the table holds the feature names (tensor.features is not used again in this block)
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
tensor.df <- tensor[,2:ncol(tensor)]
# transpose: the table's remaining columns become rows, the features become columns
tensor.t <- as.data.frame(t(tensor.df))
# join key: the table's original column names, matched against the 2-base strings below
tensor.t$base <- names(tensor[,2:ncol(tensor)])
rownames(seq.dimer) <- seq.dimer[,1]
# keep the first 20 columns — presumably sgRNAID plus the 19 dimer columns; TODO confirm
seq.df <- seq.dimer[,1:20]
seq.melt <- melt(seq.df, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# left join: strings without a match in the table get NA feature values
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# wide format: one column per position/feature combination
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "putida.quantum.dimer.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "putida.quantum.dimer.melt.txt", quote=F, row.names=F, sep="\t")
# Trimer
# Builds overlapping 3-base strings from adjacent position columns, looks each one
# up in HL.Bond.Trimer.txt and writes wide and long/melted outputs.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/")
tensor <- read.delim("HL.Bond.Trimer.txt", header=T, sep="\t", stringsAsFactors = F)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/")
seq <- read.delim("putida.sequence.txt", header=T, sep=" ", stringsAsFactors = F)
# each unite() pastes three adjacent position columns together (sep="") into a
# column named after the first of the three; every call uses remove=F
seq.trimer <- seq %>% unite("p1", p1:p3, remove=F, sep= "") %>% unite("p2", p2:p4, remove=F, sep= "") %>% unite("p3", p3:p5, remove=F, sep= "") %>% unite("p4", p4:p6, remove=F, sep= "") %>% unite("p5", p5:p7, remove=F, sep= "") %>% unite("p6", p6:p8, remove=F, sep= "") %>% unite("p7", p7:p9, remove=F, sep= "") %>% unite("p8", p8:p10, remove=F, sep= "") %>% unite("p9", p9:p11, remove=F, sep= "") %>% unite("p10", p10:p12, remove=F, sep= "") %>% unite("p11", p11:p13, remove=F, sep= "") %>% unite("p12", p12:p14, remove=F, sep= "") %>% unite("p13", p13:p15, remove=F, sep= "") %>% unite("p14", p14:p16, remove=F, sep= "") %>% unite("p15", p15:p17, remove=F, sep= "") %>% unite("p16", p16:p18, remove=F, sep= "") %>% unite("p17", p17:p19, remove=F, sep= "") %>% unite("p18", p18:p20, remove=F, sep= "")
# first column of the table holds the feature names (tensor.features is not used again in this block)
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
tensor.df <- tensor[,2:ncol(tensor)]
# transpose: the table's remaining columns become rows, the features become columns
tensor.t <- as.data.frame(t(tensor.df))
# join key: the table's original column names, matched against the 3-base strings below
tensor.t$base <- names(tensor[,2:ncol(tensor)])
rownames(seq.trimer) <- seq.trimer[,1]
# keep the first 19 columns — presumably sgRNAID plus the 18 trimer columns; TODO confirm
seq.df <- seq.trimer[,1:19]
seq.melt <- melt(seq.df, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# left join: strings without a match in the table get NA feature values
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# wide format: one column per position/feature combination
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "putida.quantum.trimer.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "putida.quantum.trimer.melt.txt", quote=F, row.names=F, sep="\t")
# Tetramer
# Builds overlapping 4-base strings from adjacent position columns, looks each one
# up in HL.Bond.Tetramer.txt and writes wide and long/melted outputs.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/")
tensor <- read.delim("HL.Bond.Tetramer.txt", header=T, sep="\t", stringsAsFactors = F)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/")
seq <- read.delim("putida.sequence.txt", header=T, sep=" ", stringsAsFactors = F)
# each unite() pastes four adjacent position columns together (sep="") into a
# column named after the first of the four; every call uses remove=F
seq.tetramer <- seq %>% unite("p1", p1:p4, remove=F, sep= "") %>% unite("p2", p2:p5, remove=F, sep= "") %>% unite("p3", p3:p6, remove=F, sep= "") %>% unite("p4", p4:p7, remove=F, sep= "") %>% unite("p5", p5:p8, remove=F, sep= "") %>% unite("p6", p6:p9, remove=F, sep= "") %>% unite("p7", p7:p10, remove=F, sep= "") %>% unite("p8", p8:p11, remove=F, sep= "") %>% unite("p9", p9:p12, remove=F, sep= "") %>% unite("p10", p10:p13, remove=F, sep= "") %>% unite("p11", p11:p14, remove=F, sep= "") %>% unite("p12", p12:p15, remove=F, sep= "") %>% unite("p13", p13:p16, remove=F, sep= "") %>% unite("p14", p14:p17, remove=F, sep= "") %>% unite("p15", p15:p18, remove=F, sep= "") %>% unite("p16", p16:p19, remove=F, sep= "") %>% unite("p17", p17:p20, remove=F, sep= "")
# first column of the table holds the feature names (tensor.features is not used again in this block)
tensor.features <- tensor[,1]
rownames(tensor) <- tensor[,1]
tensor.df <- tensor[,2:ncol(tensor)]
# transpose: the table's remaining columns become rows, the features become columns
tensor.t <- as.data.frame(t(tensor.df))
# join key: the table's original column names, matched against the 4-base strings below
tensor.t$base <- names(tensor[,2:ncol(tensor)])
rownames(seq.tetramer) <- seq.tetramer[,1]
# keep the first 18 columns — presumably sgRNAID plus the 17 tetramer columns; TODO confirm
seq.df <- seq.tetramer[,1:18]
seq.melt <- melt(seq.df, id="sgRNAID")
colnames(seq.melt) <- c("sgRNAID", "position", "base")
# left join: strings without a match in the table get NA feature values
seq.tensor <- left_join(seq.melt, tensor.t, by="base")
seq.tensor.melt <- melt(seq.tensor, id=c("sgRNAID", "position", "base"))
# wide format: one column per position/feature combination
seq.tensor.dcast <- dcast(seq.tensor.melt, sgRNAID ~ position + variable, value.var="value")
write.table(seq.tensor.dcast, "putida.quantum.tetramer.txt", quote=F, row.names=F, sep="\t")
write.table(seq.tensor.melt, "putida.quantum.tetramer.melt.txt", quote=F, row.names=F, sep="\t")
# Stack the five long/melted tables (monomer, basepair, dimer, trimer, tetramer,
# in that order) into one table and write it out.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/")
# all five inputs are read with identical options
read.melt <- function(path) read.delim(path, header=T, sep="\t", stringsAsFactors = F)
monomer <- read.melt("putida.quantum.monomer.melt.txt")
basepair <- read.melt("putida.quantum.basepair.melt.txt")
dimer <- read.melt("putida.quantum.dimer.melt.txt")
trimer <- read.melt("putida.quantum.trimer.melt.txt")
tetramer <- read.melt("putida.quantum.tetramer.melt.txt")
# one row-bind over the ordered list instead of a chain of pairwise rbind() calls
monomer.basepair.dimer.trimer.tetramer <- do.call(rbind, list(monomer, basepair, dimer, trimer, tetramer))
write.table(monomer.basepair.dimer.trimer.tetramer, "putida.quantum.melt.txt", quote=F, row.names=F, sep="\t")
# New session: reshape the combined long table (putida.quantum.melt.txt) to one row
# per sgRNA and join it onto the raw matrix, writing putida.finalquantum.txt.
# salloc -A SYB105 -N 2 -t 4:00:00 -p gpu
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
library(reshape2)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
df <- read.delim("putida.raw.matrix.txt", header=T, sep="\t", stringsAsFactors = F)
# quantum chemical tensors
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
tensor <- read.delim("putida.quantum.melt.txt", header=T, sep="\t")
# missing values are replaced with 0 rather than dropped
tensor[is.na(tensor)] <- 0
# constant tag that becomes the suffix of every feature name built below
tensor$scale <- "raw"
# feature name = position + variable + scale pasted together with no separator
tensor.id <- tensor %>% unite(feature.scale, c(position, variable, scale), sep = "")
tensor.id$value <- as.numeric(tensor.id$value)
# anything that failed numeric conversion is also set to 0
tensor.id[is.na(tensor.id)] <- 0
# first two columns of the raw matrix; column 2 is labelled cut.score further down
df.score <- unique(df[,c(1,2)])
tensor.score <- inner_join(tensor.id, df.score, by="sgRNAID")
# reorder by position to the layout named on the next line
tensor.score.order <- tensor.score[,c(5,2,1,4)]
colnames(tensor.score.order) <- c("cut.score", "feature.scale", "sgRNAID", "value")
# wide format: one row per sgRNAID/cut.score pair, one column per feature; repeated cells are averaged
df.dcast <- tensor.score.order %>% dcast(sgRNAID + cut.score ~ feature.scale, value.var = "value", fun.aggregate=mean, na.rm=TRUE)
df.dcast.na <- na.omit(df.dcast)
nrow(df.dcast.na)
# 148591
# both tables carry a score column, so the join result holds suffixed duplicates
# (a later step renames cut.score.x)
df.location <- inner_join(df, df.dcast.na, by=c("sgRNAID"))
write.table(df.location, "putida.finalquantum.txt", quote=F, row.names=F, sep="\t")
Continuous y-vector: cut.score used as a continuous response
# New session: build the iRF feature and score files for the continuous response
# from putida.finalquantum.txt.
# salloc -A SYB105 -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
df <- read.delim("putida.finalquantum.txt", header=T, sep="\t", stringsAsFactors = F)
names(df)[names(df) == 'cut.score.x'] <- 'cut.score'
# positional selection that skips columns 5045 and 5047 — presumably duplicated
# ID/score columns left by the join that produced this file; TODO confirm
df <- df[,c(1:5044, 5046, 5048:5363)]
# drop the column reported as containing NA by the check at the end of this block
df.cut <- df %>% select(-grep("V3324sgRNA.raw", names(df)))
df.na <- na.omit(df.cut)
# 148591
# Convert every non-ID column to numeric one column at a time. Calling
# as.numeric(as.character(...)) on the whole data frame deparses each column into
# a single string and yields a short vector of NAs, which breaks the cbind() and
# the column slicing below.
df.num <- mutate_all(df.na[,2:ncol(df.na)], function(x) as.numeric(as.character(x)))
df.all <- cbind(df.na[,1], df.num)
colnames(df.all)[1] <- "sgRNAID"
# features: ID plus every column after the score (column 2)
write.table(df.all[,c(1,3:ncol(df.all))], "putida.finalquantum.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.all[,c(1,3:ncol(df.all))], "putida.finalquantum.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.all[,3:ncol(df.all)], "putida.finalquantum.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# score: ID plus column 2
write.table(df.all[,1:2], "putida.finalquantum.score.txt", quote=F, row.names=F, sep="\t")
write.table(df.all[,1:2], "putida.finalquantum.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(data.frame("cut.score" = df.all[,2]), "putida.finalquantum.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# list the columns that contain NA
# NOTE(review): `score` is not assigned anywhere in this session — confirm which
# object the first check was meant for
colnames(score)[apply(score, 2, anyNA)]
colnames(df)[apply(df, 2, anyNA)]
# V3324sgRNA.raw
# Andes
# Set up the iRF run "putida.all" from the continuous-response feature/score files,
# then submit the generated job scripts on Summit.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
module load python/3.7-anaconda3
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all
# positional arguments: features file, then score file; presumably this writes the
# Submits/ scripts used below — TODO confirm against the builder script
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName putida.all --bypass --targetNodeSize 50 --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/putida.finalquantum.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/putida.finalquantum.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all/Submits/submit_full_putida.all_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all/Submits/submit_train_putida.all_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all/Submits/submit_test_putida.all_0.sh
# Andes
# Post-process the finished "putida.all" iRF run and list the highest-weighted
# features for cut.score.
module load python/3.7-anaconda3
# run from the putida.all run directory, the same directory the setup, bsub and RIT
# steps of this run use (this step previously changed into iRF.run/putida.finalquantum)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/YNames.txt putida.all
# 0.2497100288038237
# column 3 sorted as general numeric, descending; head shows the first ten lines
sort -k3rg topVarEdges/cut.score_top95.txt | head
# sgRNA.structuresgRNA.raw cut.score 0.23350755818800148
# sgRNA.gcsgRNA.raw cut.score 0.04289250555917792
# sgRNA.tempsgRNA.raw cut.score 0.03905222514980143
# V4087sgRNA.raw cut.score 0.025792547076915376 <- p16.GGCC
# p11tetramer.Hbond.energyraw cut.score 0.019625679832632137
# GGsgRNA.raw cut.score 0.01851639846123979
# V4343sgRNA.raw cut.score 0.015418165378914536 <- p17.GGCC
# p7tetramer.Hbond.energyraw cut.score 0.013642788627073781
# p13tetramer.Hbond.energyraw cut.score 0.013434957932850747
# p17tetramer.Hlgap.eVEraw cut.score 0.01259884826856768
# pearson correlation
# Correlates the observed cut.score with the predicted values for the
# fold9 / Set4 test files of the putida.all run (cor() defaults to Pearson).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("putida.all_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.5668159
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J RIT.run
#SBATCH -N 2
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
# Batch script: run runRIT.sh for cut.score on the putida.all iRF run.
# Stop the job if the run directory is missing so runRIT.sh never executes in the
# wrong working directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all/cut.score || exit 1
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/runRIT.sh cut.score putida.all
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.all/cut.score/RIT.run
# p1tetramer.Hlgap.eVEraw cut.score 0.02958235529594759 -0.00011528815662705526 639.302 0.5014918061262607
# p17tetramer.Hlgap.eVEraw cut.score 0.029521575025357568 -0.00031165835211497425 693.769 0.511701944094686
# p3tetramer.Hlgap.eVEraw cut.score 0.024694134828854918 0.00013664641246905086 500.582 0.5022176217151879
# p6tetramer.Hbond.energyraw cut.score 0.021058833320975318 -0.0002491734107466877 465.856 0.5063487707875959
# gene.distance0 cut.score 0.020622002455367876 5.208266653497892e-05 323.87 0.4947275548231648
# p15tetramer.Hlgap.eVEraw cut.score 0.01881725111137344 -0.0002563608564939106 385.849 0.5116186664539905
# sgRNA.structuresgRNA.raw cut.score 0.016888607129485665 7.0503091510368e-05 261.581 0.4932105013572439
# p2tetramer.Hlgap.eVEraw cut.score 0.016755386104739187 0.00021649093737096997 303.203 0.4882808983826795
# p14tetramer.Hbond.stackingraw cut.score 0.016662685260386053 0.00015213922597939064 318.569 0.4892237053978665
# p12tetramer.Hbond.energyraw cut.score 0.01617906670446831 -0.00010579613262548503 354.032 0.5017452908165951
All putida data with a binary y-vector: cut.score = 1 for toxic guides, 0 for non-toxic guides
# Build the iRF feature and score files for the binary response: guides listed in
# putida.toxic.txt get cut.score 1, all other guides get 0.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida")
df <- read.delim("putida.finalquantum.txt", header=T, sep="\t", stringsAsFactors = F)
toxic <- read.delim("putida.toxic.txt", header=T, sep="\t")
names(df)[names(df) == 'cut.score.x'] <- 'cut.score'
# positional selection that skips columns 5045 and 5047 — presumably duplicated
# ID/score columns left by the join that produced this file; TODO confirm
df <- df[,c(1:5044, 5046, 5048:5363)]
# convert every non-ID column to numeric, one column at a time
df.num <- mutate_all(df[,2:ncol(df)], function(x) as.numeric(as.character(x)))
df.all <- cbind(df[,1], df.num)
colnames(df.all)[1] <- "sgRNAID"
# look the column up in the data frame it is dropped from; the indices used to be
# taken from names(df), which only worked because both tables share column order
df.cut <- df.all %>% select(-grep("V3324sgRNA.raw", names(df.all)))
df.na <- na.omit(df.cut)
# split by membership in the toxic list and overwrite the score with the class label
df.toxic <- subset(df.na, df.na$sgRNAID %in% toxic$sgRNAID)
df.nontoxic <- subset(df.na, !(df.na$sgRNAID %in% toxic$sgRNAID))
df.toxic$cut.score <- 1
df.nontoxic$cut.score <- 0
# df.all is reused from here on for the relabelled table (toxic rows first)
df.all <- rbind(df.toxic, df.nontoxic)
# features: ID plus every column after the score (column 2)
write.table(df.all[,c(1,3:ncol(df.all))], "putida.toxic.features.txt", quote=F, row.names=F, sep="\t")
write.table(df.all[,c(1,3:ncol(df.all))], "putida.toxic.features_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(df.all[,3:ncol(df.all)], "putida.toxic.features_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# score: ID plus column 2
write.table(df.all[,1:2], "putida.toxic.score.txt", quote=F, row.names=F, sep="\t")
write.table(df.all[,1:2], "putida.toxic.score_overlap.txt", quote=F, row.names=F, sep="\t")
write.table(data.frame("cut.score" = df.all[,2]), "putida.toxic.score_overlap_noSampleIDs.txt", quote=F, row.names=F, sep="\t")
# run python scripts on Andes
# run job submissions on Summit
# Builder script: /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py
# [python iRF_LOOP_SetUp_CrossLayer.py --DataFile --YFile --System Summit --NodesPer 1 --TotalNodes 10 --RunTime 2 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName iRF.XX --bypass --Prediction]
# Set up the iRF run "putida.toxic" from the binary-response feature/score files,
# then submit the generated job scripts on Summit.
# Andes
module load python/3.7-anaconda3
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic
# positional arguments: features file, then score file
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_LOOP_SetUp_CrossLayer.py --System Summit --NodesPer 1 --TotalNodes 50 --RunTime 90 --Account SYB105 --NumTrees 1000 --NumIterations 5 --RunName putida.toxic --bypass --targetNodeSize 50 --Prediction /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/putida.toxic.features.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/putida.toxic.score.txt
# Summit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic
module load python/3.7.0-anaconda3-5.3.0
# full
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic/Submits/submit_full_putida.toxic_0.sh
# train
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic/Submits/submit_train_putida.toxic_0.sh
# once the train submissions are done run the test submissions
# test
bsub /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic/Submits/submit_test_putida.toxic_0.sh
# Andes
# Post-process the finished "putida.toxic" iRF run and list the highest-scoring
# features from one fold's importance file.
module load python/3.7-anaconda3
# interactive edit: YNames.txt holds the response name shown on the next line
vim /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/YNames.txt
# cut.score
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic
python /gpfs/alpine/syb105/proj-shared/Projects/iRF/iRF_postProcessing.py --Iterations 5 --Prediction --PredAccuracy MAE,MAEA,MSE,R2 --varTot 95 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/YNames.txt putida.toxic
# 0.2723134136615347
# column 2 sorted as general numeric, descending; head shows the first ten lines
sort -k2rg cut.score/foldRuns/fold9/Runs/Set4/putida.toxic_cut.score.importance4 | head
# V2.xsgRNA.raw: 34.8683 <- p1.A
# p1monomer.HLgap.eVraw: 30.6637
# V4.ysgRNA.raw: 19.3178 <- p1.AT
# V2351sgRNA.raw: 19.1476 <- p10.ATGC
# V3256sgRNA.raw: 3.54151 <- p13.TGCT
# V212sgRNA.raw: 2.25208 <- p1.GCAT
# V2744sgRNA.raw: 2.12511 <- p11.TGCT
# gene.distance0: 2.01697
# p17tetramer.Hlgap.eVEraw: 1.43922
# p12tetramer.Hbond.stackingraw: 1.28718
# pearson correlation
# Correlates the observed cut.score with the predicted values for the
# fold9 / Set4 test files of the putida.toxic run (cor() defaults to Pearson).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic/cut.score/foldRuns/fold9/Runs/Set4")
pred <- read.delim("putida.toxic_Set4_test.prediction", header=T, sep="\t")
y <- read.delim("set4_Y_test_noSampleIDs.txt", header=T, sep="\t")
cor(y$cut.score, pred$Predictions.)
# 0.6229057
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J RIT.run
#SBATCH -N 2
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
# Batch script: run runRIT.sh for cut.score on the putida.toxic iRF run.
# Stop the job if the run directory is missing so runRIT.sh never executes in the
# wrong working directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic/cut.score || exit 1
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/runRIT.sh cut.score putida.toxic
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/seed/putida/iRF.run/putida.toxic/cut.score/RIT.run
p1monomer.HLgap.eVraw cut.score 0.2262725463170956 0.00011095610163166765 4112.394 0.014187120146086606
V2.xsgRNA.raw cut.score 0.18221582829439129 0.008037880616833358 251.438 0.08628023109683436
V4.ysgRNA.raw cut.score 0.12122628447866456 0.0006542737130496269 10505.528 -0.0004121364379226049
V2351sgRNA.raw cut.score 0.11863568252089206 0.0007797269336901252 148591.0 -0.007223177552553901
V212sgRNA.raw cut.score 0.012916193868251604 4.202193884067253e-05 55249.525 0.00022434593561855305
gene.distance0 cut.score 0.010703691914690066 -6.428745812097893e-06 26038.084 0.0009157877421875984
V2744sgRNA.raw cut.score 0.010141012769831806 0.00023734383919545257 142890.388 0.0016291624795197277
V3256sgRNA.raw cut.score 0.00991871854090609 4.186525196620564e-05 60310.434 0.00037842830673800033
p17tetramer.Hlgap.eVEraw cut.score 0.0075886254379308165 -3.789702618813022e-05 28117.193 0.003439541453099477
TGsgRNA.raw cut.score 0.007266773366701968 2.069220903107592e-05 115784.491 0.000500198983057414
p1.monomer.HL-gap cut.score 0.2262725463170956 0.00011095610163166765
p1.A cut.score 0.18221582829439129 0.008037880616833358
p1.AT cut.score 0.12122628447866456 0.0006542737130496269
p10.ATGC cut.score 0.11863568252089206 0.0007797269336901252
p1.GCAT cut.score 0.012916193868251604 4.202193884067253e-05
gene.distance cut.score 0.010703691914690066 -6.428745812097893e-06
p11.TGCT cut.score 0.010141012769831806 0.00023734383919545257
p13.TGCT cut.score 0.00991871854090609 4.186525196620564e-05
p17.tetramer.HL-gap cut.score 0.0075886254379308165 -3.789702618813022e-05
TG cut.score 0.007266773366701968 2.069220903107592e-05
A at position 1 with ATGC at positions 10:13 (A-ATGC)
subsetter(data, 'A', 1): A at pos1 & ATGC at pos10; AT at pos1 & TGCT at pos11
-> All are "toxic" (depleted)
What if it is shifted? What if it is flipped? -> non-toxic examples