shhh <- suppressPackageStartupMessages
shhh(library(data.table))
shhh(library(dplyr))
shhh(library(tidyr))
shhh(library(Mfuzz))
shhh(library(VennDiagram))
shhh(library(seqinr))
shhh(library(xlsx))
shhh(library(colortools))
shhh(library(plotrix))
shhh(library(ggplot2))
shhh(library(RColorBrewer))
shhh(library(ggtree))
shhh(library(ggsci))
shhh(library(factoextra))
shhh(library(scales))
shhh(library(knitr))
shhh(library(gridExtra))
freq.table=function(x){
total=x$Nv_gene %>% unique() %>% length()
l=list()
for(i in unique(x$code)){
sub=x[code==i]
num.genes=unique(sub$Nv_gene) %>% length()
freq=num.genes/total
l[[i]]=data.table(code_name=i,code_count=num.genes,frequency=freq,total=total)
}
return(rbindlist(l))
}
vlepo.enrich=function(trial,omic){
fischer_p=c()
test.list=list()
ref.list=list()
for(i in trial$code_name){
test=trial[trial$code_name==i,c('code_count','total')] %>% as.numeric()
ref=omic[omic$code_name==i,c('code_count','total')] %>% as.numeric()
p.value=fisher.test(matrix(c(test[1],ref[1],test[2],ref[2]),nrow=2,ncol=2),alternative="greater")$p.value ## 3extract P value from fischer test
fischer_p=c(fischer_p,p.value)
names(test)=c('test_N','test_total')
test.list[[i]]=as.data.frame(t(test))
names(ref)=c('ref_N','ref_total')
ref.list[[i]]=as.data.frame(t(ref))
}
test.add=rbindlist(test.list)
ref.add=rbindlist(ref.list)
trial$fischP=fischer_p
trial$fdr=p.adjust(trial$fischP,method='fdr')
combined=cbind(test.add,ref.add,trial)
enriched= combined %>% arrange(fdr) %>% filter(fdr<1e-02) %>% filter(test_N>10) %>% select(-frequency,-code_count) %>%
mutate(test_freq=test_N/test_total, ref_freq=ref_N/ref_total) %>%
select(code_name,fischP,fdr,everything()) %>% data.table()
return(enriched)
}
gene.subset=function(geneset,database){
return(database[Nv_gene %in% geneset])
}
oto=fread('./inputs/Nv_trinity_Dm_1_to_1.csv')
dm.key=fread('./inputs/Dm_master_key_by_gene.csv')
Nv_trinity_oto=merge(oto,dm.key,by='Dm_FBgn') %>% select(Nv_gene,name)
colnames(Nv_trinity_oto)[2]='annotation'
uni.blast=fread('./inputs/Nv_Uniref_annotation_clean.csv')
uni.blast=uni.blast[!(Nv_gene %in% Nv_trinity_oto$Nv_gene)]
annot=rbind(uni.blast,Nv_trinity_oto)
annot$Nv_gene=gsub("TRINITY_","",annot$Nv_gene)
trans.raw=fread('./inputs/NezVir_Kallisto_TPM_raw.tsv')
bacterial_genes=readLines('./inputs/Nv_bacterial_genes.txt')
unigenes=readLines('./inputs/NezVir_unigenes.txt')
trans.raw$Nv_gene=gsub("TRINITY_","",trans.raw$Nv_gene)
#unigenes=readLines('./raw_omics/Unigene_list.txt')
unigenes=gsub("TRINITY_","",unigenes)
trans.raw=trans.raw %>% filter(!(Nv_gene %in% bacterial_genes)) %>% data.table()
trans.sum= trans.raw %>% rowwise() %>% mutate(M1_mean=mean(c(M1_1,M1_2,M1_3,M1_4)),M1_sd=sd(c(M1_1,M1_2,M1_3,M1_4))) %>%
mutate(M2_mean=mean(c(M2_1,M2_2,M2_3,M2_4)),M2_sd=sd(c(M2_1,M2_2,M2_3,M2_4))) %>%
mutate(M3_mean=mean(c(M3_1,M3_2,M3_3,M3_4)),M3_sd=sd(c(M3_1,M3_2,M3_3,M3_4))) %>%
mutate(M4H_mean=mean(c(M4H_1,M4H_2,M4H_3,M4H_4)),M4H_sd=sd(c(M4H_1,M4H_2,M4H_3,M4H_4))) %>%
mutate(C_mean=mean(c(C1,C2,C3,C4)),C_sd=sd(c(C1,C2,C3,C4))) %>%
filter(Nv_gene %in% unigenes) %>% select(Nv_gene,M1_mean,M2_mean,M3_mean,M4H_mean,M1_sd,M2_sd,M3_sd,M4H_sd,C_mean,C_sd) %>%
merge(annot,by='Nv_gene',all.x=T) %>%
data.table()
#write.xlsx(trans.sum,'./final_figures_tables/TableS3_Transcript_TPM_values.xlsx',sheetName = 'Sheet1')
kable(head(trans.sum),digits=2)
| Nv_gene | M1_mean | M2_mean | M3_mean | M4H_mean | M1_sd | M2_sd | M3_sd | M4H_sd | C_mean | C_sd | annotation |
|---|---|---|---|---|---|---|---|---|---|---|---|
| DN100000_c0_g1 | 0.14 | 2.36 | 1.08 | 0.14 | 0.14 | 2.93 | 1.05 | 0.19 | 19.64 | 5.00 | DNA ligase 1-like isoform X1 _Halyomorpha_1.06e-59 |
| DN100001_c0_g1 | 0.00 | 0.02 | 0.02 | 0.21 | 0.00 | 0.04 | 0.04 | 0.18 | 1.72 | 0.28 | uncharacterized protein LOC106683759 isoform X1 _Halyomorpha_3.6e-73 |
| DN100002_c0_g1 | 39.82 | 51.18 | 53.71 | 42.03 | 3.74 | 5.48 | 3.75 | 0.70 | 62.35 | 8.64 | CG9393 |
| DN100003_c0_g1 | 0.19 | 0.10 | 0.14 | 0.10 | 0.17 | 0.13 | 0.19 | 0.14 | 0.08 | 0.11 | complement factor D _Ursus_1.2e-14 |
| DN100003_c0_g2 | 0.07 | 0.05 | 0.11 | 0.20 | 0.09 | 0.06 | 0.18 | 0.11 | 0.03 | 0.07 | complement factor D _Ursus_1.03e-14 |
| DN100005_c0_g2 | 0.03 | 0.01 | 0.01 | 0.01 | 0.03 | 0.02 | 0.01 | 0.03 | 1.68 | 0.30 | Protein apterous _Neoptera_0 |
proteomics.list=lapply(list.files('./inputs/proteomics/specific_compartment',full.names = T),readLines)
names(proteomics.list)=c('M1','M2','M3','M4H')
prot.sum=trans.sum %>% select(Nv_gene) %>%
mutate(M1_present=ifelse(Nv_gene %in% proteomics.list$M1,'Present','Absent')) %>%
mutate(M2_present=ifelse(Nv_gene %in% proteomics.list$M2,'Present','Absent')) %>%
mutate(M3_present=ifelse(Nv_gene %in% proteomics.list$M3,'Present','Absent')) %>%
mutate(M4H_present=ifelse(Nv_gene %in% proteomics.list$M4,'Present','Absent')) %>%
merge(annot,by='Nv_gene',all.x=T) %>%
data.table()
write.xlsx(prot.sum,'./final_figures_tables/TableS4_Proteomics_values.xlsx',sheetName = 'Sheet1')
kable(head(prot.sum),digits=2)
| Nv_gene | M1_present | M2_present | M3_present | M4H_present | annotation |
|---|---|---|---|---|---|
| DN100000_c0_g1 | Absent | Absent | Absent | Absent | DNA ligase 1-like isoform X1 _Halyomorpha_1.06e-59 |
| DN100001_c0_g1 | Absent | Absent | Absent | Absent | uncharacterized protein LOC106683759 isoform X1 _Halyomorpha_3.6e-73 |
| DN100002_c0_g1 | Present | Absent | Absent | Absent | CG9393 |
| DN100003_c0_g1 | Absent | Absent | Absent | Absent | complement factor D _Ursus_1.2e-14 |
| DN100003_c0_g2 | Absent | Absent | Absent | Absent | complement factor D _Ursus_1.03e-14 |
| DN100005_c0_g2 | Absent | Absent | Absent | Absent | Protein apterous _Neoptera_0 |
dir.create('./Venn_diagrams')
## Warning in dir.create("./Venn_diagrams"): './Venn_diagrams' already exists
##Transcriptomics
l=list()
for(i in c('M1_mean','M4H_mean','M2_mean','M3_mean')){
a=trans.sum[,c('Nv_gene',i),with=FALSE]
colnames(a)=c('Nv_gene','expression')
table=a %>% filter(expression>1) %>% data.table()
l[[i]]=table$Nv_gene
}
trans.ven=venn.diagram(l,filename=NULL,main='Midgut Comparison',cex=1.5,fill=c('coral','cadetblue','gold','lightgreen'))
ggsave(trans.ven,file='./final_figures_tables/Figure2A_Transcriptomics_venn_comparison.svg',device='svg')
## Saving 7 x 5 in image
## Proteomics
#proteomics.list=lapply(list.files('./inputs/proteomics/specific_compartment',full.names = T),readLines)
#proteomics.list=list(proteomics.list[[1]],proteomics.list[[4]],proteomics.list[[2]],proteomics.list[[3]])
proteomics.list=list(prot.sum[M1_present=='Present']$Nv_gene,prot.sum[M4H_present=='Present']$Nv_gene,prot.sum[M2_present=='Present']$Nv_gene,prot.sum[M3_present=='Present']$Nv_gene)
names(proteomics.list)=c('M1_mean','M4H_mean','M2_mean','M3_mean')
prot.ven=venn.diagram(proteomics.list,filename=NULL,main='Midgut Comparison',cex=1.5,fill=c('coral','cadetblue','gold','lightgreen'))
ggsave(prot.ven,file='./final_figures_tables/Figure2A_Proteomics_venn_comparison.svg',device='svg')
## Saving 7 x 5 in image
#file.remove(list.files('./Venn_diagrams/',full.names = T)[grepl('.log',list.files('./Venn_diagrams/'))])
file.remove(list.files()[grepl('.log',list.files())])
## [1] TRUE TRUE
## fraction of transcriptomes
pan.gut=Reduce(intersect,l)
total.gut=Reduce(union,l)
print('The fraction of transcripts detected in all compartments is')
## [1] "The fraction of transcripts detected in all compartments is"
length(pan.gut)/length(total.gut)
## [1] 0.6835136
## fraction of proteomes
pan.gut=Reduce(intersect,proteomics.list)
total.gut=Reduce(union,proteomics.list)
print('The fraction of proteins detected in all compartments is')
## [1] "The fraction of proteins detected in all compartments is"
length(pan.gut)/length(total.gut)
## [1] 0.4301288
full.gut=trans.raw %>% select(-Nv_gene)
raw.gut2=full.gut %>% as.matrix() %>% t()
groups=c(rep('C',4),rep('M1',4),rep('M2',4),rep('M3',4),rep('M4',4))
PCA1=prcomp(raw.gut2)
pc=fviz_pca_ind(PCA1,
col.ind = groups,
palette = c("red","blue",'green','black','purple','gold','orange'),
repel = TRUE,
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",
title='Nezara midgut Samples - PCA'# Avoid text overlapping
)
ggsave(filename = './final_figures_tables/Figure3_PCA.svg',plot=pc,device='svg')
## Saving 7 x 5 in image
print(pc)
### Make expression dataset
trans.matrix=as.matrix(select(trans.sum,Nv_gene,M1_mean,M2_mean,M3_mean,M4H_mean),rownames = 'Nv_gene')
gut.exp=ExpressionSet(assayData=trans.matrix)
gut.exp.f=filter.std(gut.exp,min.std=0,visu=F)
## 933 genes excluded.
gut.exp.s <- standardise(gut.exp.f)
## Run Mfuzz
m1 <- mestimate(gut.exp.s)
c.8 <- mfuzz(gut.exp.s,c=8,m=m1)
#save(gut.exp.f,file='./Mfuzz/gut.exp.f_GRAPH')
#save(c.8,file='./Mfuzz/c.8_GRAPH')
## create graph
pdf('./final_figures_tables/Figure4_fuzzy.pdf')
fuzzy.plot=mfuzz.plot2(gut.exp.s,cl=c.8,mfrow=c(2,4),min.mem=.6,time.labels = c('M1','M2','M3','M4'),colo='fancy',xlab='Gut Compartment',x11=F)
print(fuzzy.plot)
## NULL
dev.off()
## png
## 2
#ggsave(filename = './final_figures_tables/Figure4_fuzzy.svg',plot=fuzzy.plot,device='svg')
## divide up clusters
key=data.table(M2=411,M4=3273,M34=439,M1=1031,M3=505,Decreasing=22,Increase=794,M23=293) %>% t()
cluster=data.table(best.clust=c.8[[3]],Nv_gene=names(c.8[[3]]))
membership=c.8[[4]] %>% data.table()
for(i in 1:length(colnames(membership))){colnames(membership)[i]=paste0('A',colnames(membership)[i])}
membership=membership %>% rowwise() %>% mutate(maximum=max(A1,A2,A3,A4,A5,A6,A7,A8)) %>% data.table()
membership=cbind(cluster,membership)
fin.membership=membership %>% filter(maximum>.6) %>% arrange(desc(best.clust)) %>% data.table()
mem.tab=fin.membership$best.clust %>% table()
ml=list()
for(i in 1:8){
n=dim(fin.membership[best.clust==i])[1]
ml[[i]]=data.table(num=i,name=names(key[,][key[,]>n-20 & key[,]<n+20]))
}
final.fuzzy.key=rbindlist(ml)
cl_name=c()
for(i in fin.membership$best.clust){cl_name=c(cl_name,final.fuzzy.key[num==i]$name)}
fin.membership$cluster_name=cl_name
## write clusters to .csv
for(i in unique(fin.membership$cluster_name)){
fil=fin.membership %>% filter(cluster_name==i)
writeLines(fil$Nv_gene,paste0('./Mfuzz/Cluster_',as.character(i),'_Gene_List.txt'))
}
dir.create('./Top500')
## Warning in dir.create("./Top500"): './Top500' already exists
top500=function(x){
a=trans.sum %>% arrange(desc(get(x))) %>% head(500) %>% data.table()
#fwrite(a,paste0('/home/shanedenecke/Dropbox/wp2_omics/Nezara_midgut_atlas/Top500/',x,'_Top500.csv'),row.names = F)
#writeLines(a$Nv_gene,paste0('/home/shanedenecke/Dropbox/wp2_omics/Nezara_midgut_atlas/Top500/',x,'_Top500.txt'))
return(a$Nv_gene)
}
c.write=function(x){
a=trans.sum[Nv_gene %in% x] %>% select(Nv_gene,annotation)
fwrite(a,paste0('./Top500/',x,'_Top500.csv'),row.names = F)
writeLines(a$Nv_gene,paste0('./Top500/',x,'_Top500.txt'))
}
m1=top500("M1_mean")
m2=top500("M2_mean")
m3=top500("M3_mean")
m4=top500('M4H_mean')
c=top500('C_mean')
com.list=list(M1=m1,M2=m2,M3=m3,M4=m4,Car=c)
common=Reduce(intersect,com.list)
good.list=lapply(com.list,function(x) x[!(x %in% common)])[1:4]
for(i in names(good.list)){
a=good.list[[i]]
b=trans.sum[Nv_gene %in% a] %>% select(Nv_gene,annotation)
fwrite(b,paste0('./Top500/',i,'_Top500.csv'),row.names = F)
writeLines(b$Nv_gene,paste0('./Top500/',i,'_Top500.txt'))
}
dir.create('Bulk_gene_analysis')
## Warning in dir.create("Bulk_gene_analysis"): 'Bulk_gene_analysis' already
## exists
### IMPORT DATA
ipscan.filter=fread('./inputs/NezVir_Trinity_Unigene_IPscan.tsv',col.names = c('Nv_gene','database','code','description'))[Nv_gene %in% unigenes]
pfam.key=fread('./inputs/Pfam_key.tsv',header=F,col.names = c('code_name','annotation'))
goterms=fread('./inputs/Nv_IPScan_GO_terms.tsv')[Nv_gene %in% unigenes]
goterms.key=goterms[,c(2,3)]
#### FILTER DATA
pfam=ipscan.filter[database=='Pfam' & Nv_gene %in% unigenes] %>% unique.data.frame() ## subset only PFAM from IPscan
colnames(goterms.key)=c('code_name','annotation')
pfam.freq=pfam %>% freq.table() %>% data.table() ###CALCULATE GENOME WIDE FREQUENCIES
goterms.freq=goterms %>% freq.table() %>% data.table() ###CALCULATE GENOME WIDE FREQUENCIES
#### GROUP ANNOTATION
comp.fuzz.files=list.files('./Mfuzz',full.names = T)[grepl("txt$",list.files('./Mfuzz'))]
top500.files=list.files('./Top500',full.names = T)[grepl("txt$",list.files('./Top500'))]
proteomes=list.files('./inputs/proteomics/specific_compartment/',full.names = T)[grepl(".txt$",list.files('./proteomics/specific_compartment/'))]
files=c(comp.fuzz.files,top500.files,proteomes)
short=c()
for(i in files){short=c(short,strsplit(i,'\\/')[[1]][length(strsplit(i,'\\/')[[1]])])}
comparisons.list=list()
for(i in files){comparisons.list[[i]]=readLines(i)}
### Output all genes used in the enrichment comparisons in a single table with annotations
annotation.table=list()
for(i in 1:length(comparisons.list)){
comp=unlist(strsplit(names(comparisons.list[i]),split='/'))[3]
genes=comparisons.list[[i]]
sub=trans.sum[Nv_gene %in% genes] %>% select(Nv_gene,annotation)
sub$comparison=comp
annotation.table[[i]]=sub
}
fwrite(rbindlist(annotation.table),'./final_figures_tables/TableS6_Gene_Groups.xlsx')
##make frequency tables
gene.subset=function(geneset,database){
return(database[Nv_gene %in% geneset])
}
pfam.specific=lapply(comparisons.list,gene.subset,pfam)
pfam.specific.freq=list()
for(i in names(pfam.specific)){pfam.specific.freq[[i]]=freq.table(pfam.specific[[i]])}
goterms.specific=lapply(comparisons.list,gene.subset,goterms)
goterms.specific.freq=list()
for(i in names(goterms.specific)){goterms.specific.freq[[i]]=freq.table(goterms.specific[[i]])}
##caluclate enrichment
pfam.enrich=lapply(pfam.specific.freq,vlepo.enrich,pfam.freq)
goterms.enrich=lapply(goterms.specific.freq,vlepo.enrich,goterms.freq)
##anotate PFAM file
annotate_enrichment=function(enrich.table,key){
l=list()
for(i in names(enrich.table)){
enrich=enrich.table[[i]]
annot=merge(enrich,key,by='code_name',all.y=F,sort=F)
#separate(annot,annotation,into=c('term','category'),sep=',')
annot$group=i
l[[i]]=(annot[!duplicated(annot)])
}
return(l)
}
pfam.annotate=annotate_enrichment(pfam.enrich,pfam.key) %>% rbindlist() #%>% fwrite('./Bulk_gene_analysis/PFAM_enrichment.csv')
goterms.annotate=annotate_enrichment(goterms.enrich,goterms.key) %>% rbindlist() #%>% fwrite('./Bulk_gene_analysis/GO_enrichment.csv')
write.xlsx(rbind(pfam.annotate,goterms.annotate),'./final_figures_tables/Table2_Associated_GO_terms.xlsx',sheetName = 'Sheet1')
#pfam.annotate=annotate_enrichment(pfam.enrich,pfam.key)
#for(i in short){fwrite(pfam.annotate[[grep(i,names(pfam.annotate))]],paste0('./Bulk_gene_analysis/',i,'PFAM_enrichment.csv'))}
#goterms.annotate=annotate_enrichment(goterms.enrich,goterms.key)
#for(i in short){fwrite(goterms.annotate[[grep(i,names(goterms.annotate))]],paste0('./Bulk_gene_analysis/',i,'goterms_enrichment.csv'))}
### Get annotations
#annotations=lapply(comparisons.list,function(x) clean.annot[Nv_gene %in% x])
#for(i in short){fwrite(annotations[[grep(i,names(annotations))]],paste0('./Bulk_gene_analysis/',i,'_individual_gene_annotation.csv'),row.names = F)}
### ALL RUN ON SERVER
cd /data2/shane/Documents/Nezara_midgut_atlas/Nv_P450/
mkdir P450_search
mkdir ./P450_search/draft_outputs
for i in $(ls ./ref/model_proteomes/* | grep -E "*unigene.faa$")
do
### get basename and make directory
a=$(basename $i)
mkdir ./P450_search/$a
cd ./P450_search/$a
############### BLAST
/data2/shane/Applications/custom/recip_blast.sh /data2/shane/Documents/Nezara_midgut_atlas/Nv_P450/ref/curated_P450s/P450_homepage_combined.faa ../../ref/model_proteomes/$a 1e-3 40 16
########################################## Length and Pfam filters
mkdir filter
################ Lenght filter
#/data2/shane/Applications/custom/fasta_seq_len_filter.py ./blast/recip_blast.faa 350 650 > ./filter/$a'_P450_length.faa'
#echo "Numer of genes After length filter: ______________" $(grep ">" ./filter/$a'_P450_length.faa' | wc -l) >> summary.txt
cat ./blast/recip_blast.faa > ./filter/$a'_P450_length.faa'
###############Xref with IPscan
/home/pioannidis/Programs/interproscan-5.30-69.0/interproscan.sh -appl pfam -i ./filter/$a'_P450_length.faa' -o ./filter/$a'_ipscan.tsv' -f TSV
grep "PF00067" ./filter/$a'_ipscan.tsv' | cut -f 1 | sort -u | /data2/shane/Applications/custom/unigene_fa_sub.sh ./filter/$a'_P450_length.faa' - > ./filter/$a'_length_ipscan.faa'
echo "Number of genes with After PFAM filter:______________" $(grep ">" ./filter/$a'_length_ipscan.faa' | wc -l) >> summary.txt
### Align and copy to draft outputs
mafft --threadtb 2 ./filter/$a'_length_ipscan.faa' > ./filter/$a'_length_ipscan.faa.aln'
cp ./filter/$a'_length_ipscan.faa' ../draft_outputs/$a'_length_ipscan.faa'
cp ./filter/$a'_length_ipscan.faa.aln' ../draft_outputs/$a'_length_ipscan.faa.aln'
cd /data2/shane/Documents/Nezara_midgut_atlas/Nv_P450
done
for i in ./P450_search/draft_outputs/*.aln; do echo $i; grep ">" $i | wc -l; done
## manually check for lack of conserved P450 motif
mkdir Nv_final
/data2/shane/Applications/custom/fasta_remove.py ./P450_search/draft_outputs/NezVir_unigene.faa_length_ipscan.faa ./ref/Manual_remove.txt > ./Nv_final/NezVir_P450_final.faa
/data2/shane/meme/bin/meme /data2/shane/Documents/Nezara_midgut_atlas/Nv_P450/ref/meme/Combined_P450_Manual.faa -mod zoops -nmotifs 2 -o /data2/shane/Documents/Nezara_midgut_atlas/Nv_P450/ref/meme/P450_motif -protein -minw 5-maxw 10
### Align and Tree
mkdir align_tree
cp ./ref/for_tree/DroMel_AcyPis_P450_Manual.faa ./align_tree/DroMel_AcyPis_P450_Manual.faa
cp ./ref/for_tree/NezVir_P450_named.faa ./align_tree/NezVir_P450_named.fa
cat ./align_tree/NezVir_P450_named.fa ./align_tree/DroMel_AcyPis_P450_Manual.faa > ./align_tree/Nv_combined_.faa
a=./align_tree/Nv_combined_.faa
mafft --threadtb 24 $a > $a'.aln'
/home/pioannidis/Programs/trimAl/source/trimal -in $a'.aln' -out $a'.aln.trimm'
/data2/shane/Applications/custom/fasta_2_phylip.sh $a'.aln.trimm' > $a'.aln.trimm.phy'
b=$(echo $(basename $a) | cut -d '_' -f 1,2)
raxfile=/data2/shane/Documents/Nezara_midgut_atlas/Nv_P450/align_tree/Nv_combined_.faa.aln.trimm.phy
raxdir=/data2/shane/Documents/Nezara_midgut_atlas/Nv_P450/align_tree/
/data2/shane/Applications/raxml/raxmlHPC-PTHREADS-AVX -f a -x 12345 -p 12345 -N 500 -T 40 -m PROTGAMMAAUTO -s $raxfile -n $b'.tre' -w $raxdir
raw.p450=fread('./Nv_P450/Nov_Nelson_P450_update/Nv_P450_joined.csv',header=T) %>% select(Nv_gene,name,clan,length,sequence)
singles=raw.p450[!grepl('__',Nv_gene)]
doubles=raw.p450[grepl('__',Nv_gene)]
single.exp=merge(singles,trans.sum,by='Nv_gene') %>% select(-annotation)
l=list()
for(i in doubles$Nv_gene){
a=unlist(strsplit(i,split='__'))
fiv=trans.raw[Nv_gene==a[1]]
thr=trans.raw[Nv_gene==a[2]]
tot=rbind(fiv,thr) %>% select(-Nv_gene)
mns=apply(tot,2,mean) %>% t() %>% as.data.table()
mns$Nv_gene=i
mns$name=doubles[Nv_gene==i]$name
mns$sequence=doubles[Nv_gene==i]$sequence
mns$length=doubles[Nv_gene==i]$length
mns$clan=doubles[Nv_gene==i]$clan
l[[i]]=mns
}
double.exp=rbindlist(l) %>% rowwise() %>% mutate(M1_mean=mean(c(M1_1,M1_2,M1_3,M1_4)),M1_sd=sd(c(M1_1,M1_2,M1_3,M1_4))) %>%
mutate(M2_mean=mean(c(M2_1,M2_2,M2_3,M2_4)),M2_sd=sd(c(M2_1,M2_2,M2_3,M2_4))) %>%
mutate(M3_mean=mean(c(M3_1,M3_2,M3_3,M3_4)),M3_sd=sd(c(M3_1,M3_2,M3_3,M3_4))) %>%
mutate(M4H_mean=mean(c(M4H_1,M4H_2,M4H_3,M4H_4)),M4H_sd=sd(c(M4H_1,M4H_2,M4H_3,M4H_4))) %>%
mutate(C_mean=mean(c(C1,C2,C3,C4)),C_sd=sd(c(C1,C2,C3,C4))) %>%
select(Nv_gene,name,clan,length,M1_mean,M2_mean,M3_mean,M4H_mean,M1_sd,M2_sd,M3_sd,M4H_sd,C_mean,C_sd,sequence) %>%
data.table()
p450.exp.full=rbind(double.exp,single.exp)
write.xlsx(p450.exp.full,'./final_figures_tables/TableS8_Nv_P450_expression_table.xlsx',sheetName = 'Sheet1',row.names = F)
write.fasta(sequences=as.list(p450.exp.full$sequence),names=paste0('NezVir_',p450.exp.full$name,'__',p450.exp.full$Nv_gene),file.out = './final_figures_tables/FileS1_Nv_P450_Named.faa')
cd /data2/shane/Documents/Nezara_midgut_atlas/Nv_P450/Tree
nohup /data2/shane/Applications/custom/align_and_tree.sh Nv_P450_November.faa 24 &
tree=read.tree('./Nv_P450/align_tree/RAxML_bipartitions.Nv_Dm_P450_November.tre')
no.obs=length(tree$tip.label)
text.size=min(250/no.obs,3)
tree$tip.label=gsub('^CYP','NezVir_CYP',tree$tip.label)
tree$tip.label=gsub('Cyp','CYP',tree$tip.label)
tree$tip.label=gsub('_DN.+$','',tree$tip.label)
tree$tip.label=gsub('fragment','f',tree$tip.label)
tree$tip.label=gsub('term_','',tree$tip.label)
tree$tip.label=gsub('_$','',tree$tip.label)
unispec=c('DroMel','NezVir')
names(unispec)=c('blue','darkgreen')
cols=c()
for(j in tree$tip.label){cols=c(cols,names(unispec)[sapply(unispec,grepl,j)])}
xmax=max(tree$edge.length)*7
tree$col=cols
gp=ggtree(tree,size=2,branch.length = .1,layout='circular')
gp=gp+geom_tiplab(size=5,fontface='bold',align=T,color=cols,aes(angle=angle))
## Warning: Duplicated aesthetics after name standardisation: size
## Warning: Ignoring unknown parameters: fontface
#gp=gp+geom_strip('NezVir_CYP6MZ_C_f', 'DroMel_CYP28d1', barsize=2, color='red', label="associated taxa", offset.text=.1)
gp=gp+theme_tree()
gp=gp+ lims(x=c(0, 8))
#gp=gp+geom_hilight_encircle(node=3,fill="steelblue")
#gp=gp+geom_strip('DroMel_Cyp6g1','DroMel_Cyp28d1',barsize=2, color='blue', label = "another label", offset.text=.1)
gp=gp+geom_strip(1,167,offset=2.5,barsize=2, color='red', label = "CYP 4 Clan", offset.text=.3,fontsize=10,angle=-45)
gp=gp+geom_strip(138,158,offset=2.5,barsize=2, color='purple', label = "CYP 2 Clan", offset.text=.3,fontsize=10,angle=30)
gp=gp+geom_strip(128,159,offset=2.5,barsize=2, color='orange', label = "Mitochrondrial", offset.text=.5,fontsize=10,angle=60)
gp=gp+geom_strip(64,120,offset=2.5,barsize=2, color='bisque4', label = "CYP 3 Clan", offset.text=.5,fontsize=10)
print(gp)
ggsave('./final_figures_tables/Figure5_P450_Tree.svg',gp,device='svg',width=22,height=20)
slc.key=fread('./inputs/NezVirFinal_SLC_table.csv',col.names = c('Nv_gene','name'))
clean.slc=slc.key %>% separate(name,into=c('A','B','C','D','E','F')) %>% unite(col='SLC_family',A,B) %>% select(-C,-D,-E,-F) %>% filter(SLC_family %in% c('SLC_2','SLC_5','SLC_50','SLC_6','SLC_7','SLC_15','SLC_36')) %>% merge(trans.sum,by='Nv_gene') %>% select(-matches("sd")) %>% select(-annotation) %>% arrange(SLC_family) %>% data.table()
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 425 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
write.xlsx2(clean.slc,'./final_figures_tables/TableS9_Nv_Transporter_Expression_table.xlsx',row.names = F)
countif=function(vec,value){
return(length(vec[which(vec>value)]))
}
shane.transpose=function(dt,newcol){
new.rows=colnames(dt)[2:length(colnames(dt))]
a=transpose(dt)
colnames(a)=as.character(a[1,])
a=a[-1,]
b=mutate(a,newcol=new.rows) %>% select(newcol,everything())
return(b)
}
slc.sum=clean.slc %>% filter(M1_mean>1| M2_mean>1 | M3_mean>1 | M4H_mean>1) %>% group_by(SLC_family) %>% summarise(M1_avg=mean(M1_mean),M2_avg=mean(M2_mean),M3_avg=mean(M3_mean),M4H_avg=mean(M4H_mean),C_avg=mean(C_mean),
M1_total=sum(M1_mean), M2_total=sum(M2_mean), M3_total=sum(M3_mean), M4H_total=sum(M4H_mean),C_total=sum(C_mean),
M1_over5=countif(M1_mean,5), M2_over5=countif(M2_mean,5), M3_over5=countif(M3_mean,5), M4H_over5=countif(M4H_mean,5), C_over5=countif(C_mean,5), M1_over50=countif(M1_mean,50), M2_over50=countif(M2_mean,50), M3_over50=countif(M3_mean,50), M4H_over50=countif(M4H_mean,50), C_over50=countif(C_mean,50)) %>% data.table() %>% shane.transpose()
colnames(slc.sum)[1]="Measuremnet"
write.xlsx2(slc.sum,'./final_figures_tables/Table_3_Transporter_expression.xlsx',row.names = F)
fwrite(slc.sum,'./final_figures_tables/Table_3_Transporter_expression.csv')
rat.cat=function(x){
if(x<1){
return("blue")
}else if(x>1 & x<5){
return("black")
}else if(x>5){ return("red")
}
}
########## CREATE FIGURE 5
plot=clean.slc %>% rowwise() %>% mutate(anterior_avg=mean(c(M1_mean,M2_mean,M3_mean))) %>% filter(SLC_family=='SLC_36' | SLC_family=='SLC_7') %>% select(Nv_gene,SLC_family,M4H_mean,anterior_avg) %>% filter(M4H_mean>1 | anterior_avg >1) %>%
mutate(ratio=M4H_mean/anterior_avg) %>% gather(key='tissue',value='expression',M4H_mean,anterior_avg) %>%
mutate(color=as.factor(sapply(ratio,rat.cat))) %>%
#mutate(color=color.scale(log_transform(plot$ratio),extremes = c('steelblue','orangered'))) %>%
data.table()
log_transform=function(x){
a=log(as.numeric(x)+2,base=2)
return(a)
}
se=function(x){sd(x)/sqrt(x)}
plot$logexpression=log_transform(plot$expression)
plot$SLC_family=gsub('SLC_7','APC',plot$SLC_family)
plot$SLC_family=gsub('SLC_36','AAAP',plot$SLC_family)
#plot$color <- color.scale(log_transform(plot$ratio),extremes = c('steelblue','orangered'))
#plot$color <- as.color(plot$ratio_cat)
#cr
gp=ggplot(plot,aes(x=tissue,y=logexpression,fill=tissue))
gp=gp+facet_wrap(facets=vars(SLC_family),ncol=2,scales='fixed')
gp=gp+stat_boxplot(geom ='errorbar',width=.5)
gp=gp+geom_boxplot()
gp=gp+scale_fill_manual(values=c('grey95','grey50'))
gp=gp+labs(y='Transcript Expression Level (Log2 of TPM) \n',x='\n Gut Region')
gp=gp+scale_x_discrete(labels=c('Anterior Midgut Mean','M4 Mean'))
#gp=gp+coord_trans(y="log2")
gp=gp+theme_bw()
gp=gp+ theme(text=element_text(face="bold",family="serif"),panel.grid=element_blank(),axis.ticks.x=element_line(),
panel.border=element_rect(colour="black",fill=NA),strip.text=element_text(size=18),strip.background=element_rect("grey95"),axis.title=element_text(size=16),legend.position = "none",panel.background = element_blank())
print(gp)
ggsave(file='./final_figures_tables/Figure6_Amino_acid_M4_enrichment.svg',plot=gp,device='svg')
## Saving 7 x 5 in image