### Set workding directory
cd ~/Dropbox/Shane/omics_projects/Cactus_RNA_seq_new
### Set number of threads
THREADS=12
#### Set other file path variables
GFF=~/Dropbox/Shane/omics_projects/omics_ref/Aedes_Aegypti/Vectorbase/AedAeg_Vectorbase_annotation53.gff
GENOME=~/Dropbox/Shane/omics_projects/omics_ref/Aedes_Aegypti/Vectorbase/AedAeg_Vectorbase_Genome53.fna
TRANS=~/Dropbox/Shane/omics_projects/omics_ref/Aedes_Aegypti/Vectorbase/AedAeg_Vectorbase_Transcripts53_unigene.fna
HISAT_INDEX=../omics_ref/Aedes_Aegypti/Vectorbase/AedAeg_Vectorbase_Genome53_hisat2_index
## Create directory structure
mkdir -p FastQ_raw
mkdir -p Counts_output
mkdir -p Assembly
mkdir -p trimmed_fastq
mkdir -p mapping
mkdir -p ref
## Create counts_output subdirectories
mkdir -p Counts_output/FeatureCounts
### Mapping
mkdir -p mapping/mapping_rates
mkdir -p mapping/BAM_files
### Unzip Fasta files
for i in ./Zipped_fastQ/*.gz; do unpigz -p $THREADS -k $i; done
mv ./Zipped_fastQ/*.fq ./FastQ_raw/
### Generate a samples list
### Build HiSat2 index
hisat2-build -q -p 8 $GENOME $HISAT_INDEX
## Run for loop to perform kallisto and Hisat2 read mapping
cat ./ref/Unique_samples.txt | while read samp; do
### Create variables for names
both=$(ls ./FastQ_raw/*.fq | grep -E "/"$samp)
left=$(ls ./FastQ_raw/*.fq | grep "/"$samp | head -n 1)
right=$(ls ./FastQ_raw/*.fq | grep "/"$samp | tail -n 1)
leftbase=$(basename $left | perl -pe 's/.fq//g')
rightbase=$(basename $right | perl -pe 's/.fq//g')
### Trim fastQ reads
fastp -g --length_required 20 --detect_adapter_for_pe --thread $THREADS --in1 './FastQ_raw/'$leftbase'.fq' --in2 './FastQ_raw/'$rightbase'.fq' --out1 ./trimmed_fastq/$leftbase'_trimmed.fastq' --out2 ./trimmed_fastq/$rightbase'_trimmed.fastq'
rm fastp.json
rm fastp.html
### Set Trim variables
left_trim=$(ls ./trimmed_fastq/* | grep "/"$samp | head -n 1)
right_trim=$(ls ./trimmed_fastq/* | grep "/"$samp | tail -n 1)
### Map reads to genome with HiSat2
hisat2 --dta-cufflinks --max-intronlen 60000 -p $THREADS -x $HISAT_INDEX -1 $left_trim -2 $right_trim \
-S ./mapping/$samp.sam > ./mapping/mapping_rates/mapping_rate_$samp.txt 2>&1
samtools view -b --threads $THREADS ./mapping/$samp.sam > ./mapping/$samp.bam
samtools sort -m 2G --threads $THREADS ./mapping/$samp.bam -o ./mapping/$samp.sorted.bam
mv ./mapping/$samp.sorted.bam ./mapping/BAM_files/$samp.sorted.bam
rm ./mapping/$samp.sam; rm ./mapping/$samp.bam
done
#### Quanitfy reads with featureCounts
featureCounts -p -F 'GTF' -M --fraction -T $THREADS -t exon -g 'gene_id' -a $GFF -o ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_RawCounts.tsv ./mapping/BAM_files/*.sorted.bam > ./Counts_output/FeatureCounts/AedAeg_FeatureCounts_mappingRate.txt 2>&1
#counts_to_tpm.R ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_RawCounts.tsv > ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_TPM.tsv
Rscript /mnt/c/Users/desktopadmin/Dropbox/Shane/Applications/Custom_Applications/counts_to_tpm.R ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_RawCounts.tsv > ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_TPM.tsv
dir.create('./Transcriptomic_outputs',showWarnings = F)
dir.create('./Transcriptomic_outputs/Volcano_plots',showWarnings = F)
dir.create('./Transcriptomic_outputs/DE_tables',showWarnings = F)
dir.create('./Transcriptomic_outputs/PCAs',showWarnings = F)
dir.create('./Transcriptomic_outputs/Expression_tables',showWarnings = F)
dir.create('./Transcriptomic_outputs/Kallisto',showWarnings = F)
dir.create('./GSEA/GSEA_tables/Protein_subsets',showWarnings = F)
dir.create('./GSEA/GSEA_tables/Transcriptome_subsets',showWarnings = F)
dir.create('./GSEA/GSEA_plots/Transcriptome',showWarnings = F)
dir.create('./GSEA/GSEA_plots/Proteome',showWarnings = F)
samples=c('W_CACT','W_LACZ','T_CACT','T_LACZ')
######################## Import functions ########################
freq.table=function(x){
#x=go.full
total=x$Geneid %>% unique() %>% length()
l=list()
for(i in unique(x$GO_code)){
sub=x[GO_code==i]
num.genes=unique(sub$Geneid) %>% length()
freq=num.genes/total
l[[i]]=data.table(GO_code=i,code_count=num.genes,frequency=freq,total=total)
}
return(rbindlist(l))
}
vlepo.enrich=function(trial,omic){
#trial=go.de.freq[[1]]
#omic=goterms.freq
if(nrow(trial)>0){
fischer_p=c()
test.list=list()
ref.list=list()
for(i in trial$GO_code){
test=trial[trial$GO_code==i,c('code_count','total')] %>% as.numeric()
ref=omic[omic$GO_code==i,c('code_count','total')] %>% as.numeric()
p.value=fisher.test(matrix(c(test[1],ref[1],test[2],ref[2]),nrow=2,ncol=2),alternative="greater")$p.value ## 3extract P value from fischer test
fischer_p=c(fischer_p,p.value)
names(test)=c('test_N','test_total')
test.list[[i]]=as.data.frame(t(test))
names(ref)=c('ref_N','ref_total')
ref.list[[i]]=as.data.frame(t(ref))
}
test.add=rbindlist(test.list)
ref.add=rbindlist(ref.list)
trial$fischP=fischer_p
trial$fdr=p.adjust(trial$fischP,method='fdr')
combined=cbind(test.add,ref.add,trial)
enriched= combined %>% arrange(fdr) %>% filter(fischP<.05) %>% filter(test_N>5) %>% select(-frequency,-code_count) %>%
mutate(test_freq=test_N/test_total, ref_freq=ref_N/ref_total) %>%
select(GO_code,fischP,fdr,everything()) %>% data.table()
return(enriched)
}
}
fc.rate.clean=function(sumfile){
sumfile[grepl('Process BAM file|Successfully assigned alignment',sumfile)] %>%
gsub('.sorted.bam','',.) %>%
gsub('\\|\\| Process BAM file (.+)\\.\\.\\..+$','\\1',.) %>%
gsub('^.+Successfully assigned alignments.+\\((.+)\\%.+$','\\1',.) %>% matrix(ncol=2,byrow=T) %>%
data.table() %>% dplyr::rename(Sample=V1,featureCounts_rate=V2)
}
rep.mean=function(data,x){
sample=select(data,matches(x))
sample=apply(sample,2,as.numeric)
return(rowMeans(sample))
}
rep.stderr=function(data,x){
sample=select(data,matches(x))
sample=apply(sample,2,as.numeric)
return(apply(sample,1,stderr))
}
stderr=function(x){sd(x)/sqrt(length(x))}
edger.de=function(count.table,cols){#,fdr=.05,fc=2){
idcol=colnames(count.table)[grepl('ID|id',colnames(count.table))][1]
count.table.2=count.table
rownames(count.table.2)=count.table[[idcol]]
count.table.2[[idcol]]=NULL
group=factor(cols)
delist=DGEList(counts=count.table.2,group=group)
### filter
keep=filterByExpr(delist)
delist=delist[keep, , keep.lib.sizes=FALSE]
## Add normalization
delist=calcNormFactors(delist)
delist = estimateCommonDisp(delist)
delist = estimateTagwiseDisp(delist)
#ps.countsWhole=delist$pseudo.counts %>% data.table()
#ps.countsWhole$Geneid=count.table[row.index][[idcol]]
#ps.countsTubule=delist$pseudo.counts %>% data.table()
#ps.countsTubule$Geneid=count.table[row.index][[idcol]]
#m=inner_join(ps.countsTubule,ps.countsWhole,by='Geneid')
#fwrite(arrange(m,Geneid),'./Transcriptomic_outputs/Pseudo_counts_Cactus.csv')
##perform test
et = exactTest(delist, pair=unique(cols))
tTags = topTags(et,n=NULL)
output=tTags$table
row.index=rownames(output) %>% as.numeric()
genes=count.table[row.index][[idcol]]
output[[idcol]]=genes
allgenes=select(count.table,all_of(idcol))
er.out=data.table(sampleA=unique(cols)[1], sampleB=unique(cols)[2], output) %>%
merge(allgenes,by=idcol,all=T) %>%
#filter(FDR<fdr) %>%
#filter(abs(logFC)>fc) %>%
arrange(FDR)
return(er.out)
}
# ######################## Annotation ########################
##### Annotate genes
protein.annot=fread('./ref/annotations/AedAeg_Vectorbase_transcript_annotation_clean.csv',header=F) %>% rename(Protein_ID=V1,Annotation=V2)
immune.annot=fread('./ref/annotations/ImmuneGene_Aedes_aegypti_Sept2021.csv') %>% select(Geneid,Gene_name) %>% rename(Immune_name=Gene_name)
abc.aa=fread('./ref/annotations/Final_ABC_table.tsv') %>% mutate(Geneid=gsub('(AAEL[0-9]+).+$','\\1',code)) %>% select(Geneid,fam) %>% rename(Transporter_family=fam)
slc.aa=fread('./ref/annotations/AedAeg_SLC_dictionary.csv') %>% mutate(Geneid=gsub('(AAEL[0-9]+).+$','\\1',code)) %>% select(Geneid,family) %>% rename(Transporter_family=family)
transporter.aa=rbindlist(list(abc.aa,slc.aa))
gene.annot=protein.annot %>% mutate(Geneid=gsub('(AAEL[0-9]+).+$','\\1',Protein_ID)) %>%
select(Geneid,Annotation) %>% unique.data.frame() %>%
merge(transporter.aa,by='Geneid',all.x=T) %>%
merge(immune.annot,by='Geneid',all.x=T)
fwrite(gene.annot,'./ref/Aedes_gene_annotation.csv')
#kable(gene.annot)
### Import featurecounts
aedaeg.feature.tpm=fread('./Counts_output/FeatureCounts/FeatureCounts_AedAeg_TPM.tsv') %>%
setNames(gsub('.sorted.bam','_TPM_fc',names(.))) #%>% mutate(Geneid=paste0(Geneid,'-p1')) %>% sra.label()
aedaeg.feature.counts=fread('./Counts_output/FeatureCounts/FeatureCounts_AedAeg_RawCounts.tsv') %>%
setNames(gsub('./mapping/BAM_files/(.+).sorted.bam','\\1_count_fc',names(.))) #%>% mutate(Geneid=paste0(Geneid,'-p1')) %>% sra.label()
### Estimate featureCount mapping efficiency
aedaeg.feature.summary=readLines('./Counts_output/FeatureCounts/AedAeg_FeatureCounts_mappingRate.txt') %>% fc.rate.clean() #%>% mutate(SRA=paste0(SRA,'_aedaeg'))
#### HiSAT-2 Mapping rate
l=list()
for(i in list.files('./mapping/mapping_rates/',full.names = T)){
sample=gsub('.+mapping_rate_(.+).txt$','\\1',i)
map.rate=readLines(i)[grepl('overall alignment rate',readLines(i))] %>% gsub('% overall alignment rate','',.) %>% as.numeric()
total=readLines(i)[grepl(' reads; of these:',readLines(i))] %>% gsub(' reads; of these:','',.) %>% as.numeric()
l[[i]]=data.table(Sample=sample,Hisat2_rate=map.rate,Total_reads=total)
}
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA
hisat.map.rate=rbindlist(l)
overall.map=merge(hisat.map.rate,aedaeg.feature.summary,by='Sample')
fwrite(overall.map,'./Transcriptomic_outputs/Expression_tables/FeatureCounts_Statistics.csv')
#kable(overall.map)
#### Get averages for TPM
aedaeg.feature.tpm$W_LACZ_mean=rep.mean(aedaeg.feature.tpm,'W_LACZ')
aedaeg.feature.tpm$W_CACT_mean=rep.mean(aedaeg.feature.tpm,'W_CACT')
aedaeg.feature.tpm$T_LACZ_mean=rep.mean(aedaeg.feature.tpm,'T_LACZ')
aedaeg.feature.tpm$T_CACT_mean=rep.mean(aedaeg.feature.tpm,'T_CACT')
aedaeg.feature.tpm$W_LACZ_se=rep.stderr(aedaeg.feature.tpm,'W_LACZ')
aedaeg.feature.tpm$W_CACT_se=rep.stderr(aedaeg.feature.tpm,'W_CACT')
aedaeg.feature.tpm$T_LACZ_se=rep.stderr(aedaeg.feature.tpm,'T_LACZ')
aedaeg.feature.tpm$T_CACT_se=rep.stderr(aedaeg.feature.tpm,'T_CACT')
#### Make full table
fc.full.tpm=aedaeg.feature.tpm %>% select(-Length) %>% left_join(gene.annot,by='Geneid')
#### Summarize_data
fc.sum=select(fc.full.tpm,Geneid,Annotation,Transporter_family,Immune_name,contains('mean'),contains('se'))
fwrite(fc.full.tpm,'./Transcriptomic_outputs/Expression_tables/FeatureCounts_TPM_full.csv')
fwrite(fc.sum,'./Transcriptomic_outputs/Expression_tables/FeatureCounts_TPM_sum.csv')
#kable(fc.sum)
######################################## Differential Expression ######################################
fc.counts=aedaeg.feature.counts %>% select(-Chr,-Start,-End,-Strand,-Length)
cactusWhole.compare=select(fc.counts,Geneid,matches("^W"))
cactusTubule.compare=select(fc.counts,Geneid,matches("^T"))
### Run DE gene comparisons
fc.de.cactusWhole=edger.de(count.table=cactusWhole.compare,cols=c(rep("LACZ",3),rep("CactusWhole",3))) %>%
left_join(fc.sum,by='Geneid') %>%
arrange(FDR) %>%
mutate(logFC=logFC*-1)
fc.de.cactusTubule=edger.de(count.table=cactusTubule.compare,cols=c(rep("LACZ",3),rep("CactusTubule",3))) %>%
left_join(fc.sum,by='Geneid') %>%
arrange(FDR) %>%
mutate(logFC=logFC*-1)
fwrite(fc.de.cactusTubule,'./Transcriptomic_outputs/DE_tables/Differntial_expression_Tubule_Counts.csv')
fwrite(fc.de.cactusWhole,'./Transcriptomic_outputs/DE_tables/Differntial_expression_Whole_Counts.csv')
### Filter for significance based on Thresholds
fc.de.cactusWhole.sig=fc.de.cactusWhole %>% filter(FDR<.05) %>% filter(abs(logFC)>2)
fc.de.cactusWhole.sigUp=fc.de.cactusWhole %>% filter(logFC>2) %>% filter(FDR<.05)
fc.de.cactusWhole.sigDown=fc.de.cactusWhole %>% filter(logFC<(-2)) %>% filter(FDR<.05)
fwrite(fc.de.cactusWhole,'./Transcriptomic_outputs/DE_tables/Cactus_Whole.csv')
fwrite(fc.de.cactusWhole.sigUp,'./Transcriptomic_outputs/DE_tables/Cactus_Whole_up.csv')
fwrite(fc.de.cactusWhole.sigDown,'./Transcriptomic_outputs/DE_tables/Cactus_Whole_down.csv')
fc.de.cactusTubule.sig=fc.de.cactusTubule %>% filter(FDR<.05) %>% filter(abs(logFC)>2)
fc.de.cactusTubule.sigUp=fc.de.cactusTubule %>% filter(logFC>2) %>% filter(FDR<.05)
fc.de.cactusTubule.sigDown=fc.de.cactusTubule %>% filter(logFC<(-2)) %>% filter(FDR<.05)
fwrite(fc.de.cactusTubule.sigUp,'./Transcriptomic_outputs/DE_tables/Cactus_Tubules_up.csv')
fwrite(fc.de.cactusTubule.sigDown,'./Transcriptomic_outputs/DE_tables/Cactus_Tubules_down.csv')
######################################## Cross Compare DE genes ######################################
#### Upregulated Genes
l=list(fc.de.cactusTubule.sigUp$Geneid,fc.de.cactusWhole.sigUp$Geneid)
names(l)=c('Tubule Upregulated','Whole Upregulated')
gp=ggVennDiagram(l,label_size = 10,set_size=8)
gp=gp+scale_color_manual(values=c('black','black'))
gp=gp+ scale_fill_gradient(low="grey80",high = "grey80")
gp=gp+theme(legend.position='none')
ggsave(gp,file='./Transcriptomic_outputs/DE_tables/CactusUp.pdf',device='pdf',width=12,height=6)
commonly.up=fc.de.cactusTubule.sigUp %>% filter(Geneid %in% intersect(l[[1]],l[[2]]))
fwrite(commonly.up,'./Transcriptomic_outputs/DE_tables/Commonly_upregulated_Cactus_genes.csv')
#### Downregulated Genes
l2=list(fc.de.cactusTubule.sigDown$Geneid,fc.de.cactusWhole.sigDown$Geneid)
names(l2)=c('Tubule Downregulated','Whole Downregulated')
gp=ggVennDiagram(l2,label_size = 10,set_size=10)
gp=gp+scale_color_manual(values=c('black','black'))
gp=gp+ scale_fill_gradient(low="grey80",high = "grey80")
gp=gp+theme(legend.position='none')
print(gp)

ggsave(gp,file='./Transcriptomic_outputs/DE_tables/CactusUp.pdf',device='pdf',width=10,height=6)
commonly.down=fc.de.cactusTubule.sigDown %>% filter(Geneid %in% intersect(l2[[1]],l2[[2]]))
fwrite(commonly.down,'./Transcriptomic_outputs/DE_tables/Commonly_downregulated_Cactus_genes.csv')
hemo.prot=readLines('./GSEA/GSEA_lists/Hemolymph_CactusKD_UP_Wistar.grp')
onlytub=setdiff(fc.de.cactusTubule.sigUp$Geneid,fc.de.cactusWhole.sigUp$Geneid)
onlybod=setdiff(fc.de.cactusWhole.sigUp$Geneid,fc.de.cactusTubule.sigUp$Geneid)
tub.specific=gene.annot[Geneid %in% intersect(onlytub,hemo.prot)]
gene.annot[Geneid %in% intersect(onlybod,hemo.prot)]
## Geneid
## 1: AAEL003626
## 2: AAEL006131
## 3: AAEL006739
## 4: AAEL011455
## 5: AAEL012694
## 6: AAEL018131
## 7: AAEL021257
## 8: AAEL023637
## 9: AAEL025802
## 10: AAEL027790
## 11: AAEL027829
## Annotation
## 1: sodium/chloride dependent amino acid transporter
## 2: GD_N domain-containing protein [Source:UniProtKB/TrEMBL;Acc:A0A1S4FCR3]
## 3: adult cuticle protein, putative
## 4: C-Type Lectin (CTLMA12) - mannose binding
## 5: juvenile hormone-inducible protein, putative
## 6: unspecified product
## 7: unspecified product
## 8: unspecified product
## 9: unspecified product
## 10: unspecified product
## 11: unspecified product
## Transporter_family Immune_name
## 1: SLC_6 <NA>
## 2: <NA> <NA>
## 3: <NA> <NA>
## 4: <NA> CTLMA12
## 5: <NA> <NA>
## 6: <NA> <NA>
## 7: <NA> <NA>
## 8: <NA> <NA>
## 9: <NA> <NA>
## 10: <NA> <NA>
## 11: <NA> <NA>
fwrite(tub.specific,'./Transcriptomic_outputs/DE_tables/Tubule_specific_in_hemolymph.csv')
#### Volcano plot for Cactus Whole
fc.de.cactusWhole$Threshold = 'Not Significant'
fc.de.cactusWhole[FDR<.05 & abs(logFC)>2]$Threshold='Differentially Expressed'
fc.de.cactusWhole$Threshold=as.factor(fc.de.cactusWhole$Threshold)
fc.de.cactusWhole=fc.de.cactusWhole[logFC!=0]
fc.de.cactusWhole$Threshold=factor(fc.de.cactusWhole$Threshold,levels=c('Not Significant','Differentially Expressed','REL1 Enriched'))
gp=ggplot(data=fc.de.cactusWhole, aes(x=logFC, y =-log10(FDR), color=Threshold))
gp=gp+geom_point(alpha=.7, size=1.75)
gp=gp+xlim(c(-12,12))
gp=gp+ggtitle('Cactus Whole Body Volcano Plot')
gp=gp+scale_colour_manual(values=c('black','red','green'))
gp=gp+xlab("\nDownregulated in Cactus-KD <-------- LogFC --------> Upregulated in Cactus-KD") + ylab("-log10 FDR\n")
gp=gp+theme_bw()
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
axis.title=element_text(size=12),axis.text.x=element_text(size=10),
plot.title = element_text(hjust = 0.5,size=20))
print(gp)

ggsave(plot=gp,filename='./Transcriptomic_outputs/Volcano_plots/Cactus_Whole_Volcano_plot.pdf',device='pdf',height=5,width=10)
fc.de.cactusTubule$Threshold = 'Not Significant'
fc.de.cactusTubule[FDR<.05 & abs(logFC)>2]$Threshold='Differentially Expressed'
fc.de.cactusTubule$Threshold=as.factor(fc.de.cactusTubule$Threshold)
fc.de.cactusTubule=fc.de.cactusTubule[logFC!=0]
fc.de.cactusTubule$Threshold=factor(fc.de.cactusTubule$Threshold,levels=c('Not Significant','Differentially Expressed','REL1 Enriched'))
gp=ggplot(data=fc.de.cactusTubule, aes(x=logFC, y =-log10(FDR), color=Threshold))
gp=gp+geom_point(alpha=.7, size=1.75)
gp=gp+xlim(c(-16,16))
gp=gp+ggtitle('Cactus Tubule Volcano Plot')
gp=gp+scale_colour_manual(values=c('black','red','green'))
gp=gp+xlab("\nDownregulated in Cactus-KD <-------- LogFC --------> Upregulated in Cactus-KD") + ylab("-log10 FDR\n")
gp=gp+theme_bw()
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
axis.title=element_text(size=12),axis.text.x=element_text(size=10),
plot.title = element_text(hjust = 0.5,size=20))
print(gp)

ggsave(plot=gp,filename='./Transcriptomic_outputs/Volcano_plots/Cactus_Tubule_Volcano_plot.pdf',device='pdf',height=5,width=10)
sum.pca=select(fc.counts,-Geneid) %>% filter(complete.cases(.)) %>% t()
groups=c(rep('W_CACT',3),rep('W_LACZ',3),rep('T_CACT',3),rep('T_LACZ',3))
PCA1=prcomp(sum.pca)
pca2=fviz_pca_ind(PCA1,
col.ind = groups,
geom='point',
repel = TRUE,
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",# Avoid text overlapping
title=''
)
pca2=pca2+theme_bw()
pca2=pca2+scale_fill_manual(values=c('red1','blue','black','forestgreen'))
pca2=pca2+scale_color_manual(values=c('red1','blue','black','forestgreen'))
pca2=pca2+theme(
panel.grid=element_blank(),
rect=element_rect(fill=NULL),
legend.text=element_text(size=14),
legend.title=element_text(size=18,face='bold'),
plot.title = element_text(size=20,face='bold',hjust = 0.5))
print(pca2)

ggsave(plot=pca2,filename='./Transcriptomic_outputs/PCAs/Summary_PCA.pdf',device='pdf',height=6,width=12)
######################################## GO term analysis ########################################
go.terms=fread('./ref/Aedes_go_terms.txt',sep='\t',skip=1) %>% select(V2,V5) %>% rename(Geneid=V2,GO_code=V5) %>% unique.data.frame()
go.key=fread('./ref/GO_key_clean.txt')
go.full=left_join(go.terms,go.key)
## Joining, by = "GO_code"
goterms.freq=go.full %>% freq.table()
### extract GO terms and annotations for each gene
de.list=list(fc.de.cactusTubule.sigUp$Geneid,
fc.de.cactusWhole.sigUp$Geneid,
fc.de.cactusTubule.sigDown$Geneid,
fc.de.cactusWhole.sigDown$Geneid)
de.annot=lapply(de.list, function(x) fc.full.tpm[Geneid %in% x] %>% select(Geneid,Annotation))
go.de=lapply(de.list, function(x) go.full[Geneid %in% x]) ### Extract GO terms from Pearce reference for each gene in each de list
names(go.de)=c('Tubule_up','Whole_up','Tubule_down','Whole_down')
### Get frequency table for each GO term in each DE list
go.de.freq=list() ### set up empty list
for(i in names(go.de)){go.de.freq[[i]]=freq.table(go.de[[i]])}
#### perform enrichment analysis
goterms.enrich=lapply(go.de.freq,vlepo.enrich,goterms.freq)
######################################## Annotate GO enrichment ########################################
l=list()
for(i in names(goterms.enrich)){
enrich=goterms.enrich[[i]]
if(is.null(enrich)){next} ### if there are no enriched go terms then skip
annot=merge(enrich,go.key,by='GO_code',all.y=F,sort=F)
l[[i]]=data.table(annot[!duplicated(annot)],category=i)
}
######################################## Filter and Write GO enrichment ########################################
significant.go=rbindlist(l) %>% #### All GO terms
filter(fdr<5e-3) %>% ### FDR threshold
filter(GO_cat!='cellular_component') %>% ### Set category if you want
#select(GO_code,fdr,GO_name,category) %>%
rename(`GO code`=GO_code,`False Discovery Rate`=fdr,`GO Name`=GO_name,Comparison=category)
fwrite(significant.go,'./Transcriptomic_outputs/DE_tables/ALL_ENRICHED_GO.csv')
#### Import DE gene files
fc.de.cactusTubule=fread('./Transcriptomic_outputs/DE_tables/Differntial_expression_Tubule_Counts.csv') %>%
.[complete.cases(.)] %>% arrange(desc(logFC)) %>%
select(Geneid,logFC,Annotation)
sutopa.prot=fread('./Sutopa_proteomes/Sup_Table1_proteins.csv') %>%
select(`Majority protein IDs`,`Log2 ratio dsCactus/dsLacZ`) %>%
rename(Geneid=`Majority protein IDs`,logFC=`Log2 ratio dsCactus/dsLacZ`) %>%
mutate(Geneid=gsub('(^AAEL[0-9]+)-P.+$','\\1',Geneid)) %>%
filter(!duplicated(Geneid)) %>%
filter(grepl('AA',Geneid)) %>%
arrange(desc(logFC))
fc.de.cactusTubule$Geneid=factor(fc.de.cactusTubule$Geneid,levels=fc.de.cactusTubule$Geneid)
gp=ggplot(fc.de.cactusTubule,aes(x=Geneid,y=logFC))
gp=gp+geom_bar(stat='identity')
gp=gp+labs(y="Downregulated in LacZ-KD <--- LogFC ---> Upregulated in Cactus-KD\n",x='')
gp=gp+geom_vline(xintercept = nrow(fc.de.cactusTubule)/2,size=.5,linetype=2)
gp=gp+theme_bw()
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
axis.title=element_text(size=10),axis.text.x=element_blank(),
legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)

ggsave(plot=gp,filename='./GSEA/GSEA_plots/Transcriptomic_FC_distribution.pdf',width=12,height=6)
sutopa.prot$Geneid=factor(sutopa.prot$Geneid,levels=sutopa.prot$Geneid)
gp=ggplot(sutopa.prot,aes(x=Geneid,y=logFC))
gp=gp+geom_bar(stat='identity')
gp=gp+labs(y="Downregulated in LacZ-KD <--- LogFC ---> Upregulated in Cactus-KD\n",x='')
gp=gp+geom_vline(xintercept = nrow(sutopa.prot)/2,size=.5,linetype=2)
gp=gp+theme_bw()
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
axis.title=element_text(size=10),axis.text.x=element_blank(),
legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)

ggsave(plot=gp,filename='./GSEA/GSEA_plots/Proteomic_FC_distribution.pdf',width=12,height=6)
#### Import GSEA gene sets into list
gl=list()
for(i in list.files('GSEA/GSEA_lists/',full.names = T)[grepl('grp',list.files('GSEA/GSEA_lists'))]){
a=readLines(i,warn = F)
nam=a[1] %>% gsub('>','',.) %>% gsub(' ','_',.)
gl[[nam]]=a[-1]
}
### Make ranks files
trans.ranks=fc.de.cactusTubule$logFC
names(trans.ranks)=fc.de.cactusTubule$Geneid
prot.ranks=sutopa.prot$logFC
names(prot.ranks)=sutopa.prot$Geneid
#### Prepare GSEA tables
fg.trans=fgsea(gl, trans.ranks,minSize=5,eps=0) %>% arrange(padj) %>% select(pathway,pval,padj,log2err,ES,size)
colnames(fg.trans)=c('Gene Set','P-value','Adjusted P-value','Log2Error','Enrichment Score','Sample Size')
fg.prot=fgsea(gl, prot.ranks,minSize=5,eps=0) %>% arrange(padj) %>% select(pathway,pval,padj,log2err,ES,size)
colnames(fg.prot)=c('Gene Set','P-value','Adjusted P-value','Log2Error','Enrichment Score','Sample Size')
fwrite(fg.trans,'./GSEA/GSEA_tables/GSEA_table_transcript.csv')
fwrite(fg.prot,'./GSEA/GSEA_tables/GSEA_table_protein.csv')
for(i in names(gl)){
#### Make subset table
prot.table=sutopa.prot[Geneid %in% gl[[i]]] %>% merge(gene.annot)
trans.table=fc.de.cactusTubule[Geneid %in% gl[[i]]] %>% merge(gene.annot)
fwrite(prot.table,paste0('./GSEA/GSEA_tables/Protein_subsets/',i,'_GSEA_protein_table_subset.csv'))
fwrite(trans.table,paste0('./GSEA/GSEA_tables/Transcriptome_subsets/',i,'_GSEA_trans_table_subset.csv'))
trans.overlap=intersect(gl[[i]],names(trans.ranks))
if(length(trans.overlap)==0){
next
}else{
gp=plotEnrichment(gl[[i]], trans.ranks)
gp$layers[[5]]$aes_params$colour='blue'
gp$layers[[5]]$aes_params$size=1.5
gp=gp+labs(x='Cactus-KD Up <- Rank -> Cactus-KD Down',y='Enrichment Score')
gp=gp+ggtitle(i)
gp=gp+theme_bw()
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
axis.title=element_text(size=12),axis.text.x=element_text(size=10),
legend.position = 'none',plot.title = element_text(hjust = 0.5))
#print(gp)
ggsave(plot=gp,filename=paste0('./GSEA/GSEA_plots/Transcriptome/',i,'_GSEA_plot_transcriptomics.pdf'),device='pdf')
assign(paste(i,'_plot',sep=''),gp)
}
prot.overlap=intersect(gl[[i]],names(prot.ranks))
if(length(prot.overlap)==0){
next
}else{
gp=plotEnrichment(pathway=gl[[i]], stats = prot.ranks)
gp$layers[[5]]$aes_params$colour='blue'
gp$layers[[5]]$aes_params$size=1.5
gp=gp+labs(x='Upregulated in Cactus-KD <---- Rank ----> Downregulated in Cactus-KD',y='Enrichment Score')
gp=gp+ggtitle(i)
gp=gp+theme_bw()
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
axis.title=element_text(size=14),axis.text.x=element_text(size=10),
legend.position = 'none',plot.title = element_text(hjust = 0.5))
#print(gp)
ggsave(plot=gp,filename=paste0('./GSEA/GSEA_plots/Proteome/',i,'_GSEA_plot_proteomics.pdf'),device='pdf')
}
}
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
grids=list(Brugia_malayi_upregulated_plot,
REL1_up_plot,
REL2_up_plot,
wMelPop_plot)
grids2=grid.arrange(grobs = grids, ncol = 2)

ggsave(plot=grids2,filename = './GSEA/GSEA_plots/Grid_arrange_previous_papers.pdf',width=16,height=8)
grids=list(CTL_plot,
CLIPs_plot,
SRPN_plot,
vATPase_plot,
PPOs_plot,
AMPs_plot)
grids2=grid.arrange(grobs = grids, ncol = 3)

ggsave(plot=grids2,filename = './GSEA/GSEA_plots/Grid_arrange_families.pdf',width=16,height=8)
print(grids2)
## TableGrob (2 x 3) "arrange": 6 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (2-2,1-1) arrange gtable[layout]
## 5 5 (2-2,2-2) arrange gtable[layout]
## 6 6 (2-2,3-3) arrange gtable[layout]