Intro

Dear user. Welcome to the analysis of the Aw1 transcriptomic data. This data was generated by the Vontas group at IMBB in Heraklion, Crete and reperesents transcriptomics on Helicoverpa Zea Aw1 cells derived from the midgut. 4 replicates each were given for free-floating and cells in spheroid like clusters. Panos Ioannidis should be acknowledged for guidance o parts of this script most especially the initial steps with read processing.

Raw read processing

All raw reads were processed on a large server to make the run time much faster and all commands in this section are written in shell (bash).

Read quality control

We start with all of our raw reads in a folder called ‘raw_zipped_fastq’ which has all 16 zipped files (8 pairs of reads; 4 replicates each sample). The first thing is to unzip these and check them for quality with FastQC. This chunk will do both of those things and output all of the read pairs in folders within the “Samples” directory.

mkdir FastQC
mkdir Samples

for i in ./raw_zipped_fastq/*.gz
do
 sample=$(echo $(basename $i) | cut -d'.' -f 1 | cut -d '_' -f 1,2)
 fq_name=$(echo $i | cut -d'.' -f 1,2,3)
 unpigz -k -p 16 $i
 
 ## Run fastqc 
 /data2/shane/Applications/FastQC/fastqc $i 
 
 ## move files
 mkdir ./Samples/$sample
 
 mv $fq_name ./Samples/$sample
 mv ./raw_zipped_fastq/*.html ./FastQC
 mv ./raw_zipped_fastq/*.zip ./FastQC
done

Aligning reads to genome

We can now use the Hisat2 program to the reference genome generated in by Pearce et. al (2017; BMC biology).

##### Use Hisat2 to map reads to genome
hisat2-build -p 16 ./Hz_genome/Hz_genome.fna ./Hz_genome/Hz_genome_index
for i in ./Samples/Aw1*
do
base=$(echo $(basename $i))
cd $i

hisat2 --dta-cufflinks -p 36 -x ../../Hz_genome/Hz_genome_index -1 $base'_1.fastq' -2 $base'_2.fastq' -S out.sam > hisat2.stdout_stderr 2>&1
samtools view -b --threads 36 out.sam > out.bam 2> sam2bam.stderr
cd ../../
done

Quantify reads

Finally, we can use the featureCounts program to see how many genes fall within each of the annotated exons within the predicted genes in H. zea.

mkdir counts_output
featureCounts -F 'GTF' -M -s 2 -T 24 -p -t CDS -g gene_id -a ./Hz_genome/Hz_GTF_R.gtf -o ./counts_output/counts.txt ./Samples/Aw1*/out.bam
Rscript ./scripts/counts_to_tpm.R ./counts_output/counts.txt > ./counts_output/Aw1_TPM.tsv
Rscript ./scripts/Aw1_count_merge.R

R setup

Setup

Import packages

First we’lll get some packages installed and set our working directory etc.

shhh <- suppressPackageStartupMessages
shhh(library(data.table))
shhh(library(tidyr))
shhh(library(dplyr))
shhh(library(ggplot2))
shhh(library(knitr))
shhh(library(edgeR))
shhh(library(ggsci))
shhh(library(factoextra))
shhh(library(scales))
source('./inputs/Aw1_functions.R')
dir.create('plots',showWarnings = F)
dir.create('tables',showWarnings = F)

Import datasets

First we will import and clean up our files a bit. This will include:

  • Annotations of Helicoverpa zea genes
  • Raw counts data
  • 1:1 orthologues between Helicvoera armigera and Drosophila
  • Smooth septate junction genes in Drosophila
  • Midgut cell type marker genes
## import helicoverpa gene annotations
annot=fread('./inputs/Ha_Hz_master_key_by_gene.csv',select=c('Hz_OGS','annot','pearce_annot','ha_geneid'))
colnames(annot)=c('Hz_OGS','Annotation','pearce_annot','ha_geneid')


##Import table of 1:1 orthologues
oto=fread('./inputs/Ha_Dm_1to1_orthologues.csv')

##import smooth septate junction markers
dm.ssj=fread('./inputs/Dm_ssj_genes.csv')

## import gut cell type markers
dm.marker=fread('./inputs/Annotated_marker_candidates.csv')
colnames(dm.marker)=gsub('fbgn','Dm_FBgn',colnames(dm.marker))

Clean feature counts and convert to transcripts per million

## Import "raw" counts data into R and clean
counts=fread('./inputs/Aw1_counts.tsv') %>% 
  select(Geneid,Length,matches('./Samples'))  %>% rename(Hz_OGS=Geneid)
colnames(counts)=gsub('./Samples/','',colnames(counts))
colnames(counts)=gsub('/out.bam','',colnames(counts))
colnames(counts)=gsub('Aw1_','AW1_FF_',colnames(counts))
colnames(counts)=gsub('Aw1Sph_','AW1_SL_',colnames(counts))
fwrite(counts, './tables/counts_clean.csv')

cols=colnames(counts)
iter=grep("AW1",cols)
normlen=counts$Length/1000

tpm.raw=select(counts,Hz_OGS,Length)

for(i in iter){
  relevant.column=counts[[i]]
  rpk=relevant.column/normlen
  sc=sum(rpk)/1000000
  tpm=rpk/sc
  #colnam1=unlist(strsplit(cols[i],'/'))
  colnam2=cols[i]
  tpm.raw[[colnam2]]=tpm
}

tpm.aw1.sum=select(tpm.raw,AW1_FF_A:AW1_FF_D) %>% rowMeans()
tpm.sph.sum=select(tpm.raw,AW1_SL_A:AW1_SL_D) %>% rowMeans()
tpm.total=tpm.raw %>% 
  mutate(AW1_FF_mean=tpm.aw1.sum,AW1_SL_mean=tpm.sph.sum) %>%
  mutate(fold_change_spheroid=AW1_SL_mean/AW1_FF_mean) %>%
  mutate(fold_change_normal=AW1_FF_mean/AW1_SL_mean) %>% 
  data.table()

tpm.sum=select(tpm.total,Hz_OGS,Length,AW1_FF_mean,AW1_SL_mean,fold_change_spheroid,fold_change_normal)

PCA analysis

########################### PCA analysis
tpm.pca=tpm.raw %>% select(AW1_FF_A:AW1_SL_D) %>% as.matrix() %>% t()
groups=c(rep('AW1_FF',4),rep('AW1_SL',4))
cols=c(rep('red',4),rep('blue',4))


PCA1=prcomp(tpm.pca)

#tiff('./plots/Aw1_PCA.tiff',width=20,height=12,units='cm',res=300,compression='none')
fviz_pca_ind(PCA1,
             col.ind = groups, 
             palette = c("red","blue"),
             repel = TRUE,
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "confidence",
             legend.title = "Groups",# Avoid text overlapping
             title='AW1 cell PCA'
             
)

#dev.off()

Differentially expression analysis

Run EdgeR

counts.de=counts %>% select(-Length)
rownames(counts.de)=counts$Hz_OGS
counts.de$Hz_OGS=NULL

group=factor(c(rep("AW1_FF",4),rep("AW1_SL",4)))
delist=DGEList(counts=counts.de,group=group)

### filter
keep=filterByExpr(delist)
delist=delist[keep, , keep.lib.sizes=FALSE]


## Add normalization
delist=calcNormFactors(delist)
delist = estimateCommonDisp(delist)
delist = estimateTagwiseDisp(delist)

##perform test
et = exactTest(delist, pair=c("AW1_FF", "AW1_SL"))
tTags = topTags(et,n=NULL)

output=tTags$table 
row.index=rownames(output) %>% as.numeric()
genes=counts[row.index]$Hz_OGS
output$Hz_OGS=genes
allgenes=select(counts,Hz_OGS)
all.de=data.table(sampleA="AW1_FF", sampleB="AW1_SL", output) %>% merge(allgenes,by='Hz_OGS',all=T) %>% 
  merge(annot,by='Hz_OGS',all.x=T) %>% arrange(FDR) %>%  
  data.table()

only.significant=all.de %>% filter(FDR<1e-5) %>%  select(Hz_OGS:Annotation) %>% arrange(Hz_OGS) %>% data.table()

a=tpm.sum[Hz_OGS %in% only.significant$Hz_OGS] %>% arrange(Hz_OGS) %>% select(-Hz_OGS) %>% data.table()
final.de.table=cbind(only.significant,a) %>% arrange(FDR) %>% data.table()
fwrite(final.de.table,'./tables/Aw1_Sph_DEG.csv')
kable(head(select(final.de.table,FDR,Hz_OGS,logFC,FDR,Annotation,AW1_FF_mean,AW1_SL_mean)))
FDR Hz_OGS logFC Annotation AW1_FF_mean AW1_SL_mean
0 HzOG215002 5.376077 uncharacterized protein LOC110383210 isoform X1 0.0486045 1.1522932
0 HzOG211173 3.996777 17-beta-hydroxysteroid dehydrogenase 14-like 3.5959812 28.3643052
0 HzOG207045 2.662483 carboxylesterase 1E 10.1907903 35.6623522
0 HzOG208829 5.433998 putative inorganic phosphate cotransporter 0.0172533 0.4390903
0 HzOG209290 4.081268 prostaglandin reductase 1-like isoform X1 1.0647131 8.9260595
0 HzOG212628 -3.721933 dual oxidase maturation factor 1 isoform X1 2.6333858 0.1238758

Volcano Plot

ssj.marker=merge(oto,dm.ssj,by='Dm_FBgn') %>%
  merge(annot,by='ha_geneid') %>% 
  unique.data.frame() %>% 
  select(ha_geneid,Dm_FBgn,Annotation,Hz_OGS) %>%
  merge(tpm.sum,annot='Hz_OGS') %>%
  filter(!is.infinite(fold_change_spheroid)) %>%
  mutate(volc=log2(fold_change_spheroid)) %>%
  data.table()


all.de$threshold = 1
all.de$threshold[all.de$FDR<1e-5]=2
all.de$threshold[all.de$ha_geneid %in% ssj.marker$ha_geneid]=3
all.de$threshold=as.factor(all.de$threshold)
all.de2=all.de[logFC!=0]



fwrite(ssj.marker,'./tables/SSJ_marker_expression.csv')



## make volcano plot
#tiff('./plots/volcano_plot.tiff',width=24,height=12,units='cm',res=300,compression='none')
gp=ggplot(data=all.de2, aes(x=logFC, y =-log10(FDR), color=threshold)) 
gp=gp+geom_point(alpha=.5, size=1.75)
gp=gp+geom_point(data=all.de2[threshold==3],alpha=.8, size=1.75)
gp=gp+lims(x=c(-6, 6)) 
gp=gp+scale_colour_manual(values=c('black','red','green'))
gp=gp+xlab("\nUpregulated in Free-Floating   <--------  LogFC   -------->   Upregulated in Spheroid-Like") + ylab("-log10 FDR\n") 
gp=gp+theme_bw() 
gp=gp+theme(text=element_text(face="bold",family="serif"),panel.grid=element_blank(),
                         axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
                         axis.title=element_text(size=14),axis.text.x=element_text(size=10),
                         legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)

#whatever <- dev.off() 

GO analysis of differentially expressed genes

### GO analysis 

go.codes=fread('./inputs/GO_annotations_total.txt') %>% 
  rename(code_name=GO_code)
go.pearce=fread('./inputs/Pearce_2017_GO.csv',header=F,col.names = c('Hz_OGS','GO_term')) 
go.pearce$Hz_OGS=gsub('HaOG','',go.pearce$Hz_OGS)

go.de.fr=final.de.table[logFC<0]
go.de.sp=final.de.table[logFC>0]

go.de.fr$Hz_OGS=gsub('HzOG','',go.de.fr$Hz_OGS)
go.de.sp$Hz_OGS=gsub('HzOG','',go.de.sp$Hz_OGS)


full.go.fr=merge(go.de.fr,go.pearce,by='Hz_OGS')
full.go.sp=merge(go.de.sp,go.pearce,by='Hz_OGS')

fwrite(full.go.fr,'./tables/Full_GO_free.csv')
fwrite(full.go.fr,'./tables/Full_GO_spheroid.csv')

total.table=freq.table(go.pearce)
sub.table.fr=freq.table(full.go.fr)
sub.table.sp=freq.table(full.go.sp)


go.scores.fr=vlepo.enrich(sub.table.fr,total.table) %>% merge(go.codes,by='code_name') %>% filter(fdr<1e-1) %>% filter(test_N>5) %>% arrange(fdr)
go.scores.sp=vlepo.enrich(sub.table.sp,total.table) %>% merge(go.codes,by='code_name') %>% filter(fdr<1e-1) %>% filter(test_N>5) %>% arrange(fdr)

fwrite(go.scores.fr,'./tables/GO_final_free.csv')
fwrite(go.scores.sp,'./tables/GO_final_spheroid.csv')


kable(head(go.scores.sp))
code_name fischP fdr test_N test_total ref_N ref_total total test_freq ref_freq GO_name GO_cat
GO:0006950 0.0000354 0.0051861 6 190 49 17089 190 0.0315789 0.0028673 response to stress biological_process
GO:0016491 0.0000196 0.0051861 17 190 428 17089 190 0.0894737 0.0250454 oxidoreductase activity molecular_function
GO:0006118 0.0002262 0.0204858 9 190 171 17089 190 0.0473684 0.0100064 obsolete electron transport biological_process
GO:0009055 0.0005806 0.0243032 7 190 119 17089 190 0.0368421 0.0069635 electron transfer activity molecular_function

Marker analysis

Now lets see how our transcriptomic data looks with a bit more of a hypothesis driven approach. Firstly we will take a look at how different drosophila markers are expressed, in the normal (non-spheroid) cells. To do this we’ll use the one to one (oto) table which contains all 1:1 orthologues between Helicverpa and Drosophila

Cell type marker analysis

Lets start by looking at cell types. This chunk of code will find the 1:1 orthologues between Drosophila cell type markers and write the table to the Cell_marker_expression_values.csv file.

cell.marker=dm.marker %>% merge(annot,by='Hz_OGS') %>% unique.data.frame() %>% 
  select(ha_geneid,Dm_FBgn,project_id,wide_group,gene,Annotation,Hz_OGS) %>%
  merge(tpm.sum,key='Hz_OGS') %>%
  mutate(log_transform=log2(AW1_FF_mean + 1 - min(AW1_FF_mean))) %>%
  data.table()


dup=dm.marker[dm.marker$gene %>% duplicated()]$gene
dm.marker.uni=dm.marker[!gene %in% dup]
cell.marker=dm.marker.uni %>%
  merge(tpm.sum,key='Hz_OGS') %>% select(Hz_OGS,wide_group,gene,AW1_FF_mean,AW1_SL_mean) %>%
  melt(measure.vars=c('AW1_FF_mean','AW1_SL_mean'),variable.name='sample',value.name='expression.level') %>%
  mutate(expression.log=log2(expression.level)) %>% 
  data.table()

cell.marker$logplot=ifelse(!(cell.marker$expression.log>0),0,cell.marker$expression.log)


fwrite(cell.marker,'./tables/Cell_marker_expression_values.csv')
kable(head(cell.marker))
Hz_OGS wide_group gene sample expression.level expression.log logplot
HzOG201659 EE Dh31 AW1_FF_mean 0.0972036 -3.362847 0.000000
HzOG202243 EE NPFR AW1_FF_mean 0.1352005 -2.886827 0.000000
HzOG203257 EC vnd AW1_FF_mean 0.0517951 -4.271040 0.000000
HzOG204062 EC cad AW1_FF_mean 0.0428097 -4.545919 0.000000
HzOG204948 ISC mys AW1_FF_mean 75.5092847 6.238582 6.238582
HzOG206146 ISC Rel AW1_FF_mean 49.0470339 5.616094 5.616094

Now lets see how the expression looks. The plot will be split by markers identified by mantha and markers identified from the harvard website.

We can also save this plot as a pdf

##Plot cell type markers

#tiff('./plots/Cell_type_markers.tiff',width=20,height=12,units='cm',res=300,compression='none')
gp=ggplot(cell.marker,aes(x=wide_group,y=logplot,fill=wide_group))
gp=gp+facet_grid(.~sample)
gp=gp+geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 1, 
                   colour = "black", position = position_dodge(width = 0.5), 
                   dotsize = .35)
gp=gp+labs(x='\nCell Type',y='Log2 of Expression Value\n')
gp=gp+scale_fill_rickandmorty()
#gp=gp+scale_y_log10(breaks = trans_breaks("log2", function(x) 2^x),labels = trans_format("log2", math_format(2^.x)))
#gp=gp+scale_fill_continuous(name = "n occurrences", trans="pseudo_log")
#gp=gp+scale_y_continuous(pseudo_log_trans())
gp=gp+scale_y_continuous(breaks=seq(0,10,1))
#gp=gp+coord_trans(y="log2")
gp=gp+theme(text=element_text(face="bold",family="serif"),panel.grid=element_blank(),
            axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
            strip.text=element_text(size=20),strip.background=element_rect("white"),
            axis.title=element_text(size=17),axis.text.x=element_text(size=12),
            legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)

#dev.off()