Cactus Transcriptomic Analysis

### Set workding directory
cd ~/Dropbox/Shane/omics_projects/Cactus_RNA_seq_new

### Set number of threads
THREADS=12

#### Set other file path variables
GFF=~/Dropbox/Shane/omics_projects/omics_ref/Aedes_Aegypti/Vectorbase/AedAeg_Vectorbase_annotation53.gff
GENOME=~/Dropbox/Shane/omics_projects/omics_ref/Aedes_Aegypti/Vectorbase/AedAeg_Vectorbase_Genome53.fna
TRANS=~/Dropbox/Shane/omics_projects/omics_ref/Aedes_Aegypti/Vectorbase/AedAeg_Vectorbase_Transcripts53_unigene.fna
HISAT_INDEX=../omics_ref/Aedes_Aegypti/Vectorbase/AedAeg_Vectorbase_Genome53_hisat2_index

## Create directory structure
mkdir -p FastQ_raw
mkdir -p Counts_output
mkdir -p Assembly
mkdir -p trimmed_fastq
mkdir -p mapping
mkdir -p ref


## Create counts_output subdirectories 
mkdir -p Counts_output/FeatureCounts

### Mapping
mkdir -p mapping/mapping_rates
mkdir -p mapping/BAM_files


### Unzip Fasta files
for i in ./Zipped_fastQ/*.gz; do unpigz -p $THREADS -k $i; done
mv ./Zipped_fastQ/*.fq ./FastQ_raw/

### Generate a samples list

### Build HiSat2 index
hisat2-build -q -p 8 $GENOME $HISAT_INDEX

## Run for loop to perform kallisto and Hisat2 read mapping
cat ./ref/Unique_samples.txt | while read samp; do

  ### Create variables for names
  both=$(ls ./FastQ_raw/*.fq | grep -E "/"$samp)
  left=$(ls ./FastQ_raw/*.fq | grep "/"$samp | head -n 1)
  right=$(ls ./FastQ_raw/*.fq | grep "/"$samp | tail -n 1)
  leftbase=$(basename $left | perl -pe 's/.fq//g')
  rightbase=$(basename $right | perl -pe 's/.fq//g')
  
  ### Trim fastQ reads
  fastp -g --length_required 20 --detect_adapter_for_pe --thread $THREADS --in1 './FastQ_raw/'$leftbase'.fq' --in2 './FastQ_raw/'$rightbase'.fq' --out1 ./trimmed_fastq/$leftbase'_trimmed.fastq' --out2 ./trimmed_fastq/$rightbase'_trimmed.fastq'
  rm fastp.json
  rm fastp.html    
    
  ### Set Trim variables
  left_trim=$(ls ./trimmed_fastq/* | grep "/"$samp | head -n 1)
  right_trim=$(ls ./trimmed_fastq/* | grep "/"$samp | tail -n 1)
  
  ### Map reads to genome with HiSat2
  hisat2 --dta-cufflinks --max-intronlen 60000 -p $THREADS -x $HISAT_INDEX -1 $left_trim -2 $right_trim \
  -S ./mapping/$samp.sam > ./mapping/mapping_rates/mapping_rate_$samp.txt 2>&1
  samtools view -b --threads $THREADS ./mapping/$samp.sam > ./mapping/$samp.bam
  samtools sort -m 2G --threads $THREADS ./mapping/$samp.bam -o ./mapping/$samp.sorted.bam

  mv  ./mapping/$samp.sorted.bam ./mapping/BAM_files/$samp.sorted.bam
  rm  ./mapping/$samp.sam; rm ./mapping/$samp.bam
done



#### Quanitfy reads with featureCounts
featureCounts -p -F 'GTF' -M --fraction -T $THREADS -t exon -g 'gene_id' -a $GFF -o ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_RawCounts.tsv ./mapping/BAM_files/*.sorted.bam > ./Counts_output/FeatureCounts/AedAeg_FeatureCounts_mappingRate.txt 2>&1
#counts_to_tpm.R ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_RawCounts.tsv > ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_TPM.tsv
Rscript /mnt/c/Users/desktopadmin/Dropbox/Shane/Applications/Custom_Applications/counts_to_tpm.R ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_RawCounts.tsv > ./Counts_output/FeatureCounts/FeatureCounts_AedAeg_TPM.tsv

dir.create('./Transcriptomic_outputs',showWarnings = F)
dir.create('./Transcriptomic_outputs/Volcano_plots',showWarnings = F)
dir.create('./Transcriptomic_outputs/DE_tables',showWarnings = F)
dir.create('./Transcriptomic_outputs/PCAs',showWarnings = F)
dir.create('./Transcriptomic_outputs/Expression_tables',showWarnings = F)
dir.create('./Transcriptomic_outputs/Kallisto',showWarnings = F)
dir.create('./GSEA/GSEA_tables/Protein_subsets',showWarnings = F)
dir.create('./GSEA/GSEA_tables/Transcriptome_subsets',showWarnings = F)
dir.create('./GSEA/GSEA_plots/Transcriptome',showWarnings = F)
dir.create('./GSEA/GSEA_plots/Proteome',showWarnings = F)

samples=c('W_CACT','W_LACZ','T_CACT','T_LACZ')

######################## Import functions ######################## 

freq.table=function(x){
  #x=go.full
  total=x$Geneid %>% unique() %>% length()
  l=list()
  for(i in unique(x$GO_code)){
    sub=x[GO_code==i]
    num.genes=unique(sub$Geneid) %>% length()
    freq=num.genes/total
    l[[i]]=data.table(GO_code=i,code_count=num.genes,frequency=freq,total=total)
  }
  return(rbindlist(l))
} 



vlepo.enrich=function(trial,omic){
  #trial=go.de.freq[[1]]
  #omic=goterms.freq
  if(nrow(trial)>0){
    
    fischer_p=c()
    test.list=list()
    ref.list=list()
    for(i in trial$GO_code){
      test=trial[trial$GO_code==i,c('code_count','total')] %>% as.numeric()
      ref=omic[omic$GO_code==i,c('code_count','total')] %>% as.numeric()
      p.value=fisher.test(matrix(c(test[1],ref[1],test[2],ref[2]),nrow=2,ncol=2),alternative="greater")$p.value ## 3extract P value from fischer test
      fischer_p=c(fischer_p,p.value)
      
      names(test)=c('test_N','test_total')
      test.list[[i]]=as.data.frame(t(test))
      
      names(ref)=c('ref_N','ref_total')
      ref.list[[i]]=as.data.frame(t(ref))
    }
    
    test.add=rbindlist(test.list)
    ref.add=rbindlist(ref.list)
    trial$fischP=fischer_p 
    trial$fdr=p.adjust(trial$fischP,method='fdr')
    combined=cbind(test.add,ref.add,trial)
    enriched= combined %>% arrange(fdr) %>% filter(fischP<.05) %>% filter(test_N>5) %>% select(-frequency,-code_count) %>% 
      mutate(test_freq=test_N/test_total, ref_freq=ref_N/ref_total) %>% 
      select(GO_code,fischP,fdr,everything()) %>% data.table()
    return(enriched)
  }
}





fc.rate.clean=function(sumfile){
  sumfile[grepl('Process BAM file|Successfully assigned alignment',sumfile)] %>%
    gsub('.sorted.bam','',.) %>%
    gsub('\\|\\| Process BAM file (.+)\\.\\.\\..+$','\\1',.) %>% 
    gsub('^.+Successfully assigned alignments.+\\((.+)\\%.+$','\\1',.) %>% matrix(ncol=2,byrow=T) %>%
    data.table() %>% dplyr::rename(Sample=V1,featureCounts_rate=V2)
}

rep.mean=function(data,x){
  sample=select(data,matches(x))
  sample=apply(sample,2,as.numeric)
  return(rowMeans(sample))
}

rep.stderr=function(data,x){
  sample=select(data,matches(x))
  sample=apply(sample,2,as.numeric)
  return(apply(sample,1,stderr))
}

stderr=function(x){sd(x)/sqrt(length(x))}

edger.de=function(count.table,cols){#,fdr=.05,fc=2){
  
  idcol=colnames(count.table)[grepl('ID|id',colnames(count.table))][1]
  
  count.table.2=count.table
  rownames(count.table.2)=count.table[[idcol]]
  count.table.2[[idcol]]=NULL
  
  group=factor(cols)
  delist=DGEList(counts=count.table.2,group=group)
  
  ### filter
  keep=filterByExpr(delist)
  delist=delist[keep, , keep.lib.sizes=FALSE]
  
  
  ## Add normalization
  delist=calcNormFactors(delist)
  delist = estimateCommonDisp(delist)
  delist = estimateTagwiseDisp(delist)
  
  #ps.countsWhole=delist$pseudo.counts %>% data.table()
  #ps.countsWhole$Geneid=count.table[row.index][[idcol]]
  
  #ps.countsTubule=delist$pseudo.counts %>% data.table()
  #ps.countsTubule$Geneid=count.table[row.index][[idcol]]
  
  #m=inner_join(ps.countsTubule,ps.countsWhole,by='Geneid')
  
  #fwrite(arrange(m,Geneid),'./Transcriptomic_outputs/Pseudo_counts_Cactus.csv')
  
  ##perform test
  et = exactTest(delist, pair=unique(cols))
  tTags = topTags(et,n=NULL)
  
  output=tTags$table 
  row.index=rownames(output) %>% as.numeric()
  genes=count.table[row.index][[idcol]]
  output[[idcol]]=genes
  allgenes=select(count.table,all_of(idcol))
  
  er.out=data.table(sampleA=unique(cols)[1], sampleB=unique(cols)[2], output) %>% 
    merge(allgenes,by=idcol,all=T) %>% 
    #filter(FDR<fdr) %>%  
    #filter(abs(logFC)>fc)  %>%
    arrange(FDR)
  
  return(er.out)
  
}

# ########################  Annotation ######################## 

##### Annotate genes
protein.annot=fread('./ref/annotations/AedAeg_Vectorbase_transcript_annotation_clean.csv',header=F) %>% rename(Protein_ID=V1,Annotation=V2)
immune.annot=fread('./ref/annotations/ImmuneGene_Aedes_aegypti_Sept2021.csv') %>% select(Geneid,Gene_name) %>% rename(Immune_name=Gene_name)
abc.aa=fread('./ref/annotations/Final_ABC_table.tsv') %>% mutate(Geneid=gsub('(AAEL[0-9]+).+$','\\1',code)) %>% select(Geneid,fam) %>% rename(Transporter_family=fam)
slc.aa=fread('./ref/annotations/AedAeg_SLC_dictionary.csv') %>% mutate(Geneid=gsub('(AAEL[0-9]+).+$','\\1',code)) %>% select(Geneid,family) %>% rename(Transporter_family=family)
transporter.aa=rbindlist(list(abc.aa,slc.aa))
gene.annot=protein.annot %>% mutate(Geneid=gsub('(AAEL[0-9]+).+$','\\1',Protein_ID)) %>% 
   select(Geneid,Annotation) %>% unique.data.frame() %>% 
   merge(transporter.aa,by='Geneid',all.x=T) %>%
   merge(immune.annot,by='Geneid',all.x=T)

fwrite(gene.annot,'./ref/Aedes_gene_annotation.csv')

#kable(gene.annot)

### Import featurecounts
aedaeg.feature.tpm=fread('./Counts_output/FeatureCounts/FeatureCounts_AedAeg_TPM.tsv') %>%
  setNames(gsub('.sorted.bam','_TPM_fc',names(.))) #%>% mutate(Geneid=paste0(Geneid,'-p1')) %>% sra.label()
aedaeg.feature.counts=fread('./Counts_output/FeatureCounts/FeatureCounts_AedAeg_RawCounts.tsv')  %>%
  setNames(gsub('./mapping/BAM_files/(.+).sorted.bam','\\1_count_fc',names(.))) #%>% mutate(Geneid=paste0(Geneid,'-p1')) %>% sra.label()


### Estimate featureCount mapping efficiency
aedaeg.feature.summary=readLines('./Counts_output/FeatureCounts/AedAeg_FeatureCounts_mappingRate.txt') %>% fc.rate.clean() #%>% mutate(SRA=paste0(SRA,'_aedaeg'))
#### HiSAT-2 Mapping rate
l=list()
for(i in list.files('./mapping/mapping_rates/',full.names = T)){
  sample=gsub('.+mapping_rate_(.+).txt$','\\1',i)
  map.rate=readLines(i)[grepl('overall alignment rate',readLines(i))] %>% gsub('% overall alignment rate','',.) %>% as.numeric()
  total=readLines(i)[grepl(' reads; of these:',readLines(i))] %>% gsub(' reads; of these:','',.) %>% as.numeric()
  l[[i]]=data.table(Sample=sample,Hisat2_rate=map.rate,Total_reads=total)
}

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 2 has 0 rows but longest item has 1; filled with NA

## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names =
## check.names, : Item 3 has 0 rows but longest item has 1; filled with NA

hisat.map.rate=rbindlist(l)

overall.map=merge(hisat.map.rate,aedaeg.feature.summary,by='Sample') 
fwrite(overall.map,'./Transcriptomic_outputs/Expression_tables/FeatureCounts_Statistics.csv')

#kable(overall.map)

#### Get averages for TPM
aedaeg.feature.tpm$W_LACZ_mean=rep.mean(aedaeg.feature.tpm,'W_LACZ')
aedaeg.feature.tpm$W_CACT_mean=rep.mean(aedaeg.feature.tpm,'W_CACT')
aedaeg.feature.tpm$T_LACZ_mean=rep.mean(aedaeg.feature.tpm,'T_LACZ')
aedaeg.feature.tpm$T_CACT_mean=rep.mean(aedaeg.feature.tpm,'T_CACT')

aedaeg.feature.tpm$W_LACZ_se=rep.stderr(aedaeg.feature.tpm,'W_LACZ')
aedaeg.feature.tpm$W_CACT_se=rep.stderr(aedaeg.feature.tpm,'W_CACT')
aedaeg.feature.tpm$T_LACZ_se=rep.stderr(aedaeg.feature.tpm,'T_LACZ')
aedaeg.feature.tpm$T_CACT_se=rep.stderr(aedaeg.feature.tpm,'T_CACT')

#### Make full table
fc.full.tpm=aedaeg.feature.tpm %>% select(-Length) %>% left_join(gene.annot,by='Geneid')

#### Summarize_data
fc.sum=select(fc.full.tpm,Geneid,Annotation,Transporter_family,Immune_name,contains('mean'),contains('se'))

fwrite(fc.full.tpm,'./Transcriptomic_outputs/Expression_tables/FeatureCounts_TPM_full.csv')
fwrite(fc.sum,'./Transcriptomic_outputs/Expression_tables/FeatureCounts_TPM_sum.csv')

#kable(fc.sum)

######################################## Differential Expression ######################################
fc.counts=aedaeg.feature.counts %>% select(-Chr,-Start,-End,-Strand,-Length) 

cactusWhole.compare=select(fc.counts,Geneid,matches("^W"))
cactusTubule.compare=select(fc.counts,Geneid,matches("^T"))

### Run DE gene comparisons
fc.de.cactusWhole=edger.de(count.table=cactusWhole.compare,cols=c(rep("LACZ",3),rep("CactusWhole",3))) %>%
  left_join(fc.sum,by='Geneid') %>% 
  arrange(FDR) %>% 
  mutate(logFC=logFC*-1)

fc.de.cactusTubule=edger.de(count.table=cactusTubule.compare,cols=c(rep("LACZ",3),rep("CactusTubule",3))) %>%
  left_join(fc.sum,by='Geneid') %>% 
  arrange(FDR) %>% 
  mutate(logFC=logFC*-1)

fwrite(fc.de.cactusTubule,'./Transcriptomic_outputs/DE_tables/Differntial_expression_Tubule_Counts.csv')
fwrite(fc.de.cactusWhole,'./Transcriptomic_outputs/DE_tables/Differntial_expression_Whole_Counts.csv')


### Filter for significance based on Thresholds
fc.de.cactusWhole.sig=fc.de.cactusWhole %>% filter(FDR<.05) %>% filter(abs(logFC)>2)
fc.de.cactusWhole.sigUp=fc.de.cactusWhole %>% filter(logFC>2) %>% filter(FDR<.05)
fc.de.cactusWhole.sigDown=fc.de.cactusWhole %>% filter(logFC<(-2)) %>% filter(FDR<.05)
fwrite(fc.de.cactusWhole,'./Transcriptomic_outputs/DE_tables/Cactus_Whole.csv')
fwrite(fc.de.cactusWhole.sigUp,'./Transcriptomic_outputs/DE_tables/Cactus_Whole_up.csv')
fwrite(fc.de.cactusWhole.sigDown,'./Transcriptomic_outputs/DE_tables/Cactus_Whole_down.csv')

fc.de.cactusTubule.sig=fc.de.cactusTubule %>% filter(FDR<.05) %>% filter(abs(logFC)>2)
fc.de.cactusTubule.sigUp=fc.de.cactusTubule %>% filter(logFC>2) %>% filter(FDR<.05)
fc.de.cactusTubule.sigDown=fc.de.cactusTubule %>% filter(logFC<(-2)) %>% filter(FDR<.05)
fwrite(fc.de.cactusTubule.sigUp,'./Transcriptomic_outputs/DE_tables/Cactus_Tubules_up.csv')
fwrite(fc.de.cactusTubule.sigDown,'./Transcriptomic_outputs/DE_tables/Cactus_Tubules_down.csv')

######################################## Cross Compare DE genes ######################################
#### Upregulated Genes
l=list(fc.de.cactusTubule.sigUp$Geneid,fc.de.cactusWhole.sigUp$Geneid)
names(l)=c('Tubule Upregulated','Whole Upregulated')


gp=ggVennDiagram(l,label_size = 10,set_size=8)
gp=gp+scale_color_manual(values=c('black','black'))
gp=gp+ scale_fill_gradient(low="grey80",high = "grey80")
gp=gp+theme(legend.position='none')
ggsave(gp,file='./Transcriptomic_outputs/DE_tables/CactusUp.pdf',device='pdf',width=12,height=6)

commonly.up=fc.de.cactusTubule.sigUp %>% filter(Geneid %in% intersect(l[[1]],l[[2]]))
fwrite(commonly.up,'./Transcriptomic_outputs/DE_tables/Commonly_upregulated_Cactus_genes.csv')

#### Downregulated Genes
l2=list(fc.de.cactusTubule.sigDown$Geneid,fc.de.cactusWhole.sigDown$Geneid)
names(l2)=c('Tubule Downregulated','Whole Downregulated')

gp=ggVennDiagram(l2,label_size = 10,set_size=10)
gp=gp+scale_color_manual(values=c('black','black'))
gp=gp+ scale_fill_gradient(low="grey80",high = "grey80")
gp=gp+theme(legend.position='none')

print(gp)

ggsave(gp,file='./Transcriptomic_outputs/DE_tables/CactusUp.pdf',device='pdf',width=10,height=6)

commonly.down=fc.de.cactusTubule.sigDown %>% filter(Geneid %in% intersect(l2[[1]],l2[[2]]))
fwrite(commonly.down,'./Transcriptomic_outputs/DE_tables/Commonly_downregulated_Cactus_genes.csv')

hemo.prot=readLines('./GSEA/GSEA_lists/Hemolymph_CactusKD_UP_Wistar.grp')

onlytub=setdiff(fc.de.cactusTubule.sigUp$Geneid,fc.de.cactusWhole.sigUp$Geneid)
onlybod=setdiff(fc.de.cactusWhole.sigUp$Geneid,fc.de.cactusTubule.sigUp$Geneid)

tub.specific=gene.annot[Geneid %in% intersect(onlytub,hemo.prot)]
gene.annot[Geneid %in% intersect(onlybod,hemo.prot)]

##         Geneid
##  1: AAEL003626
##  2: AAEL006131
##  3: AAEL006739
##  4: AAEL011455
##  5: AAEL012694
##  6: AAEL018131
##  7: AAEL021257
##  8: AAEL023637
##  9: AAEL025802
## 10: AAEL027790
## 11: AAEL027829
##                                                                  Annotation
##  1:                        sodium/chloride dependent amino acid transporter
##  2: GD_N domain-containing protein [Source:UniProtKB/TrEMBL;Acc:A0A1S4FCR3]
##  3:                                         adult cuticle protein, putative
##  4:                               C-Type Lectin (CTLMA12) - mannose binding
##  5:                            juvenile hormone-inducible protein, putative
##  6:                                                     unspecified product
##  7:                                                     unspecified product
##  8:                                                     unspecified product
##  9:                                                     unspecified product
## 10:                                                     unspecified product
## 11:                                                     unspecified product
##     Transporter_family Immune_name
##  1:              SLC_6        <NA>
##  2:               <NA>        <NA>
##  3:               <NA>        <NA>
##  4:               <NA>     CTLMA12
##  5:               <NA>        <NA>
##  6:               <NA>        <NA>
##  7:               <NA>        <NA>
##  8:               <NA>        <NA>
##  9:               <NA>        <NA>
## 10:               <NA>        <NA>
## 11:               <NA>        <NA>

fwrite(tub.specific,'./Transcriptomic_outputs/DE_tables/Tubule_specific_in_hemolymph.csv')

#### Volcano plot for Cactus Whole
fc.de.cactusWhole$Threshold = 'Not Significant'
fc.de.cactusWhole[FDR<.05 & abs(logFC)>2]$Threshold='Differentially Expressed'
fc.de.cactusWhole$Threshold=as.factor(fc.de.cactusWhole$Threshold)
fc.de.cactusWhole=fc.de.cactusWhole[logFC!=0]
fc.de.cactusWhole$Threshold=factor(fc.de.cactusWhole$Threshold,levels=c('Not Significant','Differentially Expressed','REL1 Enriched'))

gp=ggplot(data=fc.de.cactusWhole, aes(x=logFC, y =-log10(FDR), color=Threshold)) 
gp=gp+geom_point(alpha=.7, size=1.75)
gp=gp+xlim(c(-12,12))
gp=gp+ggtitle('Cactus Whole Body Volcano Plot')
gp=gp+scale_colour_manual(values=c('black','red','green'))
gp=gp+xlab("\nDownregulated in Cactus-KD   <--------  LogFC   -------->   Upregulated in Cactus-KD") + ylab("-log10 FDR\n") 
gp=gp+theme_bw() 
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
            axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
            axis.title=element_text(size=12),axis.text.x=element_text(size=10),
            plot.title = element_text(hjust = 0.5,size=20))
print(gp)

ggsave(plot=gp,filename='./Transcriptomic_outputs/Volcano_plots/Cactus_Whole_Volcano_plot.pdf',device='pdf',height=5,width=10)

fc.de.cactusTubule$Threshold = 'Not Significant'
fc.de.cactusTubule[FDR<.05 & abs(logFC)>2]$Threshold='Differentially Expressed'
fc.de.cactusTubule$Threshold=as.factor(fc.de.cactusTubule$Threshold)
fc.de.cactusTubule=fc.de.cactusTubule[logFC!=0]
fc.de.cactusTubule$Threshold=factor(fc.de.cactusTubule$Threshold,levels=c('Not Significant','Differentially Expressed','REL1 Enriched'))

gp=ggplot(data=fc.de.cactusTubule, aes(x=logFC, y =-log10(FDR), color=Threshold)) 
gp=gp+geom_point(alpha=.7, size=1.75)
gp=gp+xlim(c(-16,16))
gp=gp+ggtitle('Cactus Tubule Volcano Plot')
gp=gp+scale_colour_manual(values=c('black','red','green'))
gp=gp+xlab("\nDownregulated in Cactus-KD   <--------  LogFC   -------->   Upregulated in Cactus-KD") + ylab("-log10 FDR\n") 
gp=gp+theme_bw() 
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
            axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
            axis.title=element_text(size=12),axis.text.x=element_text(size=10),
            plot.title = element_text(hjust = 0.5,size=20))
print(gp)

ggsave(plot=gp,filename='./Transcriptomic_outputs/Volcano_plots/Cactus_Tubule_Volcano_plot.pdf',device='pdf',height=5,width=10)

sum.pca=select(fc.counts,-Geneid) %>% filter(complete.cases(.)) %>% t()
groups=c(rep('W_CACT',3),rep('W_LACZ',3),rep('T_CACT',3),rep('T_LACZ',3))

PCA1=prcomp(sum.pca)

pca2=fviz_pca_ind(PCA1,
                  col.ind = groups, 
                  geom='point',
                    repel = TRUE,
                  addEllipses = TRUE, # Concentration ellipses
                  ellipse.type = "confidence",
                  legend.title = "Groups",# Avoid text overlapping
                  title=''
                  
)

pca2=pca2+theme_bw()
pca2=pca2+scale_fill_manual(values=c('red1','blue','black','forestgreen'))
pca2=pca2+scale_color_manual(values=c('red1','blue','black','forestgreen'))
pca2=pca2+theme(
  panel.grid=element_blank(),
  rect=element_rect(fill=NULL),
  legend.text=element_text(size=14),
  legend.title=element_text(size=18,face='bold'),
  plot.title = element_text(size=20,face='bold',hjust = 0.5))
print(pca2)

ggsave(plot=pca2,filename='./Transcriptomic_outputs/PCAs/Summary_PCA.pdf',device='pdf',height=6,width=12)

######################################## GO term analysis ######################################## 
go.terms=fread('./ref/Aedes_go_terms.txt',sep='\t',skip=1) %>% select(V2,V5) %>% rename(Geneid=V2,GO_code=V5) %>% unique.data.frame()
go.key=fread('./ref/GO_key_clean.txt')
go.full=left_join(go.terms,go.key)

## Joining, by = "GO_code"

goterms.freq=go.full %>% freq.table()



### extract GO terms and annotations for each gene 
de.list=list(fc.de.cactusTubule.sigUp$Geneid,
             fc.de.cactusWhole.sigUp$Geneid,
             fc.de.cactusTubule.sigDown$Geneid,
             fc.de.cactusWhole.sigDown$Geneid)
de.annot=lapply(de.list, function(x) fc.full.tpm[Geneid %in% x] %>% select(Geneid,Annotation))
go.de=lapply(de.list, function(x) go.full[Geneid %in% x]) ### Extract GO terms from Pearce reference for each gene in each de list
names(go.de)=c('Tubule_up','Whole_up','Tubule_down','Whole_down')

### Get frequency table for each GO term in each DE list
go.de.freq=list() ### set up empty list
for(i in names(go.de)){go.de.freq[[i]]=freq.table(go.de[[i]])}

#### perform enrichment analysis
goterms.enrich=lapply(go.de.freq,vlepo.enrich,goterms.freq)

######################################## Annotate GO enrichment ######################################## 
l=list()
for(i in names(goterms.enrich)){
  enrich=goterms.enrich[[i]]
  if(is.null(enrich)){next} ### if there are no enriched go terms then skip
  annot=merge(enrich,go.key,by='GO_code',all.y=F,sort=F)
  l[[i]]=data.table(annot[!duplicated(annot)],category=i)
}

######################################## Filter and Write GO enrichment ######################################## 
significant.go=rbindlist(l) %>% #### All GO terms 
  filter(fdr<5e-3)  %>% ### FDR threshold
  filter(GO_cat!='cellular_component')  %>% ### Set category if you want
  #select(GO_code,fdr,GO_name,category) %>% 
  rename(`GO code`=GO_code,`False Discovery Rate`=fdr,`GO Name`=GO_name,Comparison=category)

fwrite(significant.go,'./Transcriptomic_outputs/DE_tables/ALL_ENRICHED_GO.csv')

#### Import DE gene files
fc.de.cactusTubule=fread('./Transcriptomic_outputs/DE_tables/Differntial_expression_Tubule_Counts.csv') %>% 
  .[complete.cases(.)] %>% arrange(desc(logFC)) %>%
  select(Geneid,logFC,Annotation)

sutopa.prot=fread('./Sutopa_proteomes/Sup_Table1_proteins.csv') %>% 
  select(`Majority protein IDs`,`Log2 ratio dsCactus/dsLacZ`) %>% 
  rename(Geneid=`Majority protein IDs`,logFC=`Log2 ratio dsCactus/dsLacZ`) %>%
  mutate(Geneid=gsub('(^AAEL[0-9]+)-P.+$','\\1',Geneid)) %>%
  filter(!duplicated(Geneid)) %>%
  filter(grepl('AA',Geneid)) %>%
  arrange(desc(logFC))

fc.de.cactusTubule$Geneid=factor(fc.de.cactusTubule$Geneid,levels=fc.de.cactusTubule$Geneid)

gp=ggplot(fc.de.cactusTubule,aes(x=Geneid,y=logFC))
gp=gp+geom_bar(stat='identity')
gp=gp+labs(y="Downregulated in LacZ-KD   <---  LogFC   --->   Upregulated in Cactus-KD\n",x='')
gp=gp+geom_vline(xintercept = nrow(fc.de.cactusTubule)/2,size=.5,linetype=2)
gp=gp+theme_bw()
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
            axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
            axis.title=element_text(size=10),axis.text.x=element_blank(),
            legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)

ggsave(plot=gp,filename='./GSEA/GSEA_plots/Transcriptomic_FC_distribution.pdf',width=12,height=6)

sutopa.prot$Geneid=factor(sutopa.prot$Geneid,levels=sutopa.prot$Geneid)
gp=ggplot(sutopa.prot,aes(x=Geneid,y=logFC))
gp=gp+geom_bar(stat='identity')
gp=gp+labs(y="Downregulated in LacZ-KD   <---  LogFC   --->   Upregulated in Cactus-KD\n",x='')
gp=gp+geom_vline(xintercept = nrow(sutopa.prot)/2,size=.5,linetype=2)
gp=gp+theme_bw()
gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
            axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
            axis.title=element_text(size=10),axis.text.x=element_blank(),
            legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)

ggsave(plot=gp,filename='./GSEA/GSEA_plots/Proteomic_FC_distribution.pdf',width=12,height=6)

#### Import GSEA gene sets into list
gl=list()
for(i in list.files('GSEA/GSEA_lists/',full.names = T)[grepl('grp',list.files('GSEA/GSEA_lists'))]){
  a=readLines(i,warn = F)
  nam=a[1] %>% gsub('>','',.) %>% gsub(' ','_',.)
  gl[[nam]]=a[-1]
}

### Make ranks files
trans.ranks=fc.de.cactusTubule$logFC
names(trans.ranks)=fc.de.cactusTubule$Geneid

prot.ranks=sutopa.prot$logFC
names(prot.ranks)=sutopa.prot$Geneid


#### Prepare GSEA tables
fg.trans=fgsea(gl, trans.ranks,minSize=5,eps=0) %>% arrange(padj) %>% select(pathway,pval,padj,log2err,ES,size)
colnames(fg.trans)=c('Gene Set','P-value','Adjusted P-value','Log2Error','Enrichment Score','Sample Size')
fg.prot=fgsea(gl, prot.ranks,minSize=5,eps=0) %>% arrange(padj) %>% select(pathway,pval,padj,log2err,ES,size)
colnames(fg.prot)=c('Gene Set','P-value','Adjusted P-value','Log2Error','Enrichment Score','Sample Size')

fwrite(fg.trans,'./GSEA/GSEA_tables/GSEA_table_transcript.csv')
fwrite(fg.prot,'./GSEA/GSEA_tables/GSEA_table_protein.csv')

for(i in names(gl)){
  
  #### Make subset table
  prot.table=sutopa.prot[Geneid %in% gl[[i]]] %>% merge(gene.annot)
  trans.table=fc.de.cactusTubule[Geneid %in% gl[[i]]] %>% merge(gene.annot)
  
  fwrite(prot.table,paste0('./GSEA/GSEA_tables/Protein_subsets/',i,'_GSEA_protein_table_subset.csv'))
  fwrite(trans.table,paste0('./GSEA/GSEA_tables/Transcriptome_subsets/',i,'_GSEA_trans_table_subset.csv'))
  
  trans.overlap=intersect(gl[[i]],names(trans.ranks))
  
  if(length(trans.overlap)==0){
    next
  }else{
  
  gp=plotEnrichment(gl[[i]], trans.ranks)
  gp$layers[[5]]$aes_params$colour='blue'
  gp$layers[[5]]$aes_params$size=1.5
  gp=gp+labs(x='Cactus-KD Up   <- Rank ->   Cactus-KD Down',y='Enrichment Score')
  gp=gp+ggtitle(i)
  gp=gp+theme_bw() 
   gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
              axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
              axis.title=element_text(size=12),axis.text.x=element_text(size=10),
              legend.position = 'none',plot.title = element_text(hjust = 0.5))
  #print(gp)
  ggsave(plot=gp,filename=paste0('./GSEA/GSEA_plots/Transcriptome/',i,'_GSEA_plot_transcriptomics.pdf'),device='pdf')
  
  assign(paste(i,'_plot',sep=''),gp)
  
  }
  
  prot.overlap=intersect(gl[[i]],names(prot.ranks))
  
  if(length(prot.overlap)==0){
    next
  }else{
  
    gp=plotEnrichment(pathway=gl[[i]], stats = prot.ranks)
    gp$layers[[5]]$aes_params$colour='blue'
    gp$layers[[5]]$aes_params$size=1.5
    gp=gp+labs(x='Upregulated in Cactus-KD      <---- Rank ---->      Downregulated in Cactus-KD',y='Enrichment Score')
    gp=gp+ggtitle(i)
    gp=gp+theme_bw()
    gp=gp+theme(text=element_text(face="bold"),panel.grid=element_blank(),
                axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
                axis.title=element_text(size=14),axis.text.x=element_text(size=10),
                legend.position = 'none',plot.title = element_text(hjust = 0.5))
    #print(gp)
    ggsave(plot=gp,filename=paste0('./GSEA/GSEA_plots/Proteome/',i,'_GSEA_plot_proteomics.pdf'),device='pdf')
  }
}

## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image

grids=list(Brugia_malayi_upregulated_plot,
     REL1_up_plot,
     REL2_up_plot,
     wMelPop_plot)


grids2=grid.arrange(grobs = grids, ncol = 2)

ggsave(plot=grids2,filename = './GSEA/GSEA_plots/Grid_arrange_previous_papers.pdf',width=16,height=8)


grids=list(CTL_plot,
           CLIPs_plot,
           SRPN_plot,
           vATPase_plot,
           PPOs_plot,
           AMPs_plot)
grids2=grid.arrange(grobs = grids, ncol = 3)

ggsave(plot=grids2,filename = './GSEA/GSEA_plots/Grid_arrange_families.pdf',width=16,height=8)
print(grids2)

## TableGrob (2 x 3) "arrange": 6 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (1-1,3-3) arrange gtable[layout]
## 4 4 (2-2,1-1) arrange gtable[layout]
## 5 5 (2-2,2-2) arrange gtable[layout]
## 6 6 (2-2,3-3) arrange gtable[layout]

Cactus Transcriptomic Analysis

Shane Denecke

2022-08-09