Intro

Dear user. Welcome to the analysis of the Aw1 transcriptomic data. This data was generated by the Vontas group at IMBB in Heraklion, Crete and reperesents transcriptomics on Helicoverpa Zea Aw1 cells derived from the midgut. 4 replicates each were given for free-floating and cells in spheroid like clusters. Panos Ioannidis should be acknowledged for guidance o parts of this script most especially the initial steps with read processing.

Raw read processing

All raw reads were processed on a large server to make the run time much faster and all commands in this section are written in shell (bash).

Read quality control

We start with all of our raw reads in a folder called ‘raw_zipped_fastq’ which has all 16 zipped files (8 pairs of reads; 4 replicates each sample). The first thing is to unzip these and check them for quality with FastQC. This chunk will do both of those things and output all of the read pairs in folders within the “Samples” directory.

mkdir FastQC
mkdir Samples

for i in ./raw_zipped_fastq/*.gz
do
 sample=$(echo $(basename $i) | cut -d'.' -f 1 | cut -d '_' -f 1,2)
 fq_name=$(echo $i | cut -d'.' -f 1,2,3)
 unpigz -k -p 16 $i
 
 ## Run fastqc 
 /data2/shane/Applications/FastQC/fastqc $i 
 
 ## move files
 mkdir ./Samples/$sample
 
 mv $fq_name ./Samples/$sample
 mv ./raw_zipped_fastq/*.html ./FastQC
 mv ./raw_zipped_fastq/*.zip ./FastQC
done

Aligning reads to genome

We can now use the Hisat2 program to the reference genome generated in by Pearce et. al (2017; BMC biology).

##### Use Hisat2 to map reads to genome
hisat2-build -p 16 ./Hz_genome/Hz_genome.fna ./Hz_genome/Hz_genome_index
for i in ./Samples/Aw1*
do
base=$(echo $(basename $i))
cd $i

hisat2 --dta-cufflinks -p 36 -x ../../Hz_genome/Hz_genome_index -1 $base'_1.fastq' -2 $base'_2.fastq' -S out.sam > hisat2.stdout_stderr 2>&1
samtools view -b --threads 36 out.sam > out.bam 2> sam2bam.stderr
cd ../../
done

Quantify reads

Finally, we can use the featureCounts program to see how many genes fall within each of the annotated exons within the predicted genes in H. zea.

mkdir counts_output
featureCounts -F 'GTF' -M -s 2 -T 24 -p -t CDS -g gene_id -a ./Hz_genome/Hz_GTF_R.gtf -o ./counts_output/counts.txt ./Samples/Aw1*/out.bam
Rscript ./scripts/counts_to_tpm.R ./counts_output/counts.txt > ./counts_output/Aw1_TPM.tsv
Rscript ./scripts/Aw1_count_merge.R

R setup

Setup

Import packages

First we’lll get some packages installed and set our working directory etc.

shhh <- suppressPackageStartupMessages
shhh(library(data.table))
shhh(library(tidyr))
shhh(library(dplyr))
shhh(library(ggplot2))
shhh(library(knitr))
shhh(library(edgeR))
shhh(library(ggsci))
shhh(library(factoextra))
shhh(library(scales))
source('./inputs/Aw1_functions.R')
dir.create('plots',showWarnings = F)
dir.create('tables',showWarnings = F)

Import datasets

First we will import and clean up our files a bit. This will include:

Annotations of Helicoverpa zea genes
Raw counts data
1:1 orthologues between Helicvoera armigera and Drosophila
Smooth septate junction genes in Drosophila
Midgut cell type marker genes

## import helicoverpa gene annotations
annot=fread('./inputs/Ha_Hz_master_key_by_gene.csv',select=c('Hz_OGS','annot','pearce_annot','ha_geneid'))
colnames(annot)=c('Hz_OGS','Annotation','pearce_annot','ha_geneid')


##Import table of 1:1 orthologues
oto=fread('./inputs/Ha_Dm_1to1_orthologues.csv')

##import smooth septate junction markers
dm.ssj=fread('./inputs/Dm_ssj_genes.csv')

## import gut cell type markers
dm.marker=fread('./inputs/Annotated_marker_candidates.csv')
colnames(dm.marker)=gsub('fbgn','Dm_FBgn',colnames(dm.marker))

Clean feature counts and convert to transcripts per million

## Import "raw" counts data into R and clean
counts=fread('./inputs/Aw1_counts.tsv') %>% 
  select(Geneid,Length,matches('./Samples'))  %>% rename(Hz_OGS=Geneid)
colnames(counts)=gsub('./Samples/','',colnames(counts))
colnames(counts)=gsub('/out.bam','',colnames(counts))
colnames(counts)=gsub('Aw1_','AW1_FF_',colnames(counts))
colnames(counts)=gsub('Aw1Sph_','AW1_SL_',colnames(counts))
fwrite(counts, './tables/counts_clean.csv')

cols=colnames(counts)
iter=grep("AW1",cols)
normlen=counts$Length/1000

tpm.raw=select(counts,Hz_OGS,Length)

for(i in iter){
  relevant.column=counts[[i]]
  rpk=relevant.column/normlen
  sc=sum(rpk)/1000000
  tpm=rpk/sc
  #colnam1=unlist(strsplit(cols[i],'/'))
  colnam2=cols[i]
  tpm.raw[[colnam2]]=tpm
}

tpm.aw1.sum=select(tpm.raw,AW1_FF_A:AW1_FF_D) %>% rowMeans()
tpm.sph.sum=select(tpm.raw,AW1_SL_A:AW1_SL_D) %>% rowMeans()
tpm.total=tpm.raw %>% 
  mutate(AW1_FF_mean=tpm.aw1.sum,AW1_SL_mean=tpm.sph.sum) %>%
  mutate(fold_change_spheroid=AW1_SL_mean/AW1_FF_mean) %>%
  mutate(fold_change_normal=AW1_FF_mean/AW1_SL_mean) %>% 
  data.table()

tpm.sum=select(tpm.total,Hz_OGS,Length,AW1_FF_mean,AW1_SL_mean,fold_change_spheroid,fold_change_normal)

PCA analysis

########################### PCA analysis
tpm.pca=tpm.raw %>% select(AW1_FF_A:AW1_SL_D) %>% as.matrix() %>% t()
groups=c(rep('AW1_FF',4),rep('AW1_SL',4))
cols=c(rep('red',4),rep('blue',4))


PCA1=prcomp(tpm.pca)

#tiff('./plots/Aw1_PCA.tiff',width=20,height=12,units='cm',res=300,compression='none')
fviz_pca_ind(PCA1,
             col.ind = groups, 
             palette = c("red","blue"),
             repel = TRUE,
             addEllipses = TRUE, # Concentration ellipses
             ellipse.type = "confidence",
             legend.title = "Groups",# Avoid text overlapping
             title='AW1 cell PCA'
             
)

#dev.off()

Differentially expression analysis

Run EdgeR

counts.de=counts %>% select(-Length)
rownames(counts.de)=counts$Hz_OGS
counts.de$Hz_OGS=NULL

group=factor(c(rep("AW1_FF",4),rep("AW1_SL",4)))
delist=DGEList(counts=counts.de,group=group)

### filter
keep=filterByExpr(delist)
delist=delist[keep, , keep.lib.sizes=FALSE]


## Add normalization
delist=calcNormFactors(delist)
delist = estimateCommonDisp(delist)
delist = estimateTagwiseDisp(delist)

##perform test
et = exactTest(delist, pair=c("AW1_FF", "AW1_SL"))
tTags = topTags(et,n=NULL)

output=tTags$table 
row.index=rownames(output) %>% as.numeric()
genes=counts[row.index]$Hz_OGS
output$Hz_OGS=genes
allgenes=select(counts,Hz_OGS)
all.de=data.table(sampleA="AW1_FF", sampleB="AW1_SL", output) %>% merge(allgenes,by='Hz_OGS',all=T) %>% 
  merge(annot,by='Hz_OGS',all.x=T) %>% arrange(FDR) %>%  
  data.table()

only.significant=all.de %>% filter(FDR<1e-5) %>%  select(Hz_OGS:Annotation) %>% arrange(Hz_OGS) %>% data.table()

a=tpm.sum[Hz_OGS %in% only.significant$Hz_OGS] %>% arrange(Hz_OGS) %>% select(-Hz_OGS) %>% data.table()
final.de.table=cbind(only.significant,a) %>% arrange(FDR) %>% data.table()
fwrite(final.de.table,'./tables/Aw1_Sph_DEG.csv')
kable(head(select(final.de.table,FDR,Hz_OGS,logFC,FDR,Annotation,AW1_FF_mean,AW1_SL_mean)))

Hz_OGS	logFC	Annotation	AW1_FF_mean	AW1_SL_mean
HzOG215002	5.376077	uncharacterized protein LOC110383210 isoform X1	0.0486045	1.1522932
HzOG211173	3.996777	17-beta-hydroxysteroid dehydrogenase 14-like	3.5959812	28.3643052
HzOG207045	2.662483	carboxylesterase 1E	10.1907903	35.6623522
HzOG208829	5.433998	putative inorganic phosphate cotransporter	0.0172533	0.4390903
HzOG209290	4.081268	prostaglandin reductase 1-like isoform X1	1.0647131	8.9260595
HzOG212628	-3.721933	dual oxidase maturation factor 1 isoform X1	2.6333858	0.1238758

Volcano Plot

ssj.marker=merge(oto,dm.ssj,by='Dm_FBgn') %>%
  merge(annot,by='ha_geneid') %>% 
  unique.data.frame() %>% 
  select(ha_geneid,Dm_FBgn,Annotation,Hz_OGS) %>%
  merge(tpm.sum,annot='Hz_OGS') %>%
  filter(!is.infinite(fold_change_spheroid)) %>%
  mutate(volc=log2(fold_change_spheroid)) %>%
  data.table()


all.de$threshold = 1
all.de$threshold[all.de$FDR<1e-5]=2
all.de$threshold[all.de$ha_geneid %in% ssj.marker$ha_geneid]=3
all.de$threshold=as.factor(all.de$threshold)
all.de2=all.de[logFC!=0]



fwrite(ssj.marker,'./tables/SSJ_marker_expression.csv')



## make volcano plot
#tiff('./plots/volcano_plot.tiff',width=24,height=12,units='cm',res=300,compression='none')
gp=ggplot(data=all.de2, aes(x=logFC, y =-log10(FDR), color=threshold)) 
gp=gp+geom_point(alpha=.5, size=1.75)
gp=gp+geom_point(data=all.de2[threshold==3],alpha=.8, size=1.75)
gp=gp+lims(x=c(-6, 6)) 
gp=gp+scale_colour_manual(values=c('black','red','green'))
gp=gp+xlab("\nUpregulated in Free-Floating   <--------  LogFC   -------->   Upregulated in Spheroid-Like") + ylab("-log10 FDR\n") 
gp=gp+theme_bw() 
gp=gp+theme(text=element_text(face="bold",family="serif"),panel.grid=element_blank(),
                         axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
                         axis.title=element_text(size=14),axis.text.x=element_text(size=10),
                         legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)

#whatever <- dev.off()

GO analysis of differentially expressed genes

### GO analysis 

go.codes=fread('./inputs/GO_annotations_total.txt') %>% 
  rename(code_name=GO_code)
go.pearce=fread('./inputs/Pearce_2017_GO.csv',header=F,col.names = c('Hz_OGS','GO_term')) 
go.pearce$Hz_OGS=gsub('HaOG','',go.pearce$Hz_OGS)

go.de.fr=final.de.table[logFC<0]
go.de.sp=final.de.table[logFC>0]

go.de.fr$Hz_OGS=gsub('HzOG','',go.de.fr$Hz_OGS)
go.de.sp$Hz_OGS=gsub('HzOG','',go.de.sp$Hz_OGS)


full.go.fr=merge(go.de.fr,go.pearce,by='Hz_OGS')
full.go.sp=merge(go.de.sp,go.pearce,by='Hz_OGS')

fwrite(full.go.fr,'./tables/Full_GO_free.csv')
fwrite(full.go.fr,'./tables/Full_GO_spheroid.csv')

total.table=freq.table(go.pearce)
sub.table.fr=freq.table(full.go.fr)
sub.table.sp=freq.table(full.go.sp)


go.scores.fr=vlepo.enrich(sub.table.fr,total.table) %>% merge(go.codes,by='code_name') %>% filter(fdr<1e-1) %>% filter(test_N>5) %>% arrange(fdr)
go.scores.sp=vlepo.enrich(sub.table.sp,total.table) %>% merge(go.codes,by='code_name') %>% filter(fdr<1e-1) %>% filter(test_N>5) %>% arrange(fdr)

fwrite(go.scores.fr,'./tables/GO_final_free.csv')
fwrite(go.scores.sp,'./tables/GO_final_spheroid.csv')


kable(head(go.scores.sp))

code_name	fischP	fdr	test_N	test_total	ref_N	ref_total	total	test_freq	ref_freq	GO_name	GO_cat
GO:0006950	0.0000354	0.0051861	6	190	49	17089	190	0.0315789	0.0028673	response to stress	biological_process
GO:0016491	0.0000196	0.0051861	17	190	428	17089	190	0.0894737	0.0250454	oxidoreductase activity	molecular_function
GO:0006118	0.0002262	0.0204858	9	190	171	17089	190	0.0473684	0.0100064	obsolete electron transport	biological_process
GO:0009055	0.0005806	0.0243032	7	190	119	17089	190	0.0368421	0.0069635	electron transfer activity	molecular_function

Marker analysis

Now lets see how our transcriptomic data looks with a bit more of a hypothesis driven approach. Firstly we will take a look at how different drosophila markers are expressed, in the normal (non-spheroid) cells. To do this we’ll use the one to one (oto) table which contains all 1:1 orthologues between Helicverpa and Drosophila

Cell type marker analysis

Lets start by looking at cell types. This chunk of code will find the 1:1 orthologues between Drosophila cell type markers and write the table to the Cell_marker_expression_values.csv file.

cell.marker=dm.marker %>% merge(annot,by='Hz_OGS') %>% unique.data.frame() %>% 
  select(ha_geneid,Dm_FBgn,project_id,wide_group,gene,Annotation,Hz_OGS) %>%
  merge(tpm.sum,key='Hz_OGS') %>%
  mutate(log_transform=log2(AW1_FF_mean + 1 - min(AW1_FF_mean))) %>%
  data.table()


dup=dm.marker[dm.marker$gene %>% duplicated()]$gene
dm.marker.uni=dm.marker[!gene %in% dup]
cell.marker=dm.marker.uni %>%
  merge(tpm.sum,key='Hz_OGS') %>% select(Hz_OGS,wide_group,gene,AW1_FF_mean,AW1_SL_mean) %>%
  melt(measure.vars=c('AW1_FF_mean','AW1_SL_mean'),variable.name='sample',value.name='expression.level') %>%
  mutate(expression.log=log2(expression.level)) %>% 
  data.table()

cell.marker$logplot=ifelse(!(cell.marker$expression.log>0),0,cell.marker$expression.log)


fwrite(cell.marker,'./tables/Cell_marker_expression_values.csv')
kable(head(cell.marker))

Hz_OGS	wide_group	gene	sample	expression.level	expression.log	logplot
HzOG201659	EE	Dh31	AW1_FF_mean	0.0972036	-3.362847	0.000000
HzOG202243	EE	NPFR	AW1_FF_mean	0.1352005	-2.886827	0.000000
HzOG203257	EC	vnd	AW1_FF_mean	0.0517951	-4.271040	0.000000
HzOG204062	EC	cad	AW1_FF_mean	0.0428097	-4.545919	0.000000
HzOG204948	ISC	mys	AW1_FF_mean	75.5092847	6.238582	6.238582
HzOG206146	ISC	Rel	AW1_FF_mean	49.0470339	5.616094	5.616094

Now lets see how the expression looks. The plot will be split by markers identified by mantha and markers identified from the harvard website.

We can also save this plot as a pdf

##Plot cell type markers

#tiff('./plots/Cell_type_markers.tiff',width=20,height=12,units='cm',res=300,compression='none')
gp=ggplot(cell.marker,aes(x=wide_group,y=logplot,fill=wide_group))
gp=gp+facet_grid(.~sample)
gp=gp+geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 1, 
                   colour = "black", position = position_dodge(width = 0.5), 
                   dotsize = .35)
gp=gp+labs(x='\nCell Type',y='Log2 of Expression Value\n')
gp=gp+scale_fill_rickandmorty()
#gp=gp+scale_y_log10(breaks = trans_breaks("log2", function(x) 2^x),labels = trans_format("log2", math_format(2^.x)))
#gp=gp+scale_fill_continuous(name = "n occurrences", trans="pseudo_log")
#gp=gp+scale_y_continuous(pseudo_log_trans())
gp=gp+scale_y_continuous(breaks=seq(0,10,1))
#gp=gp+coord_trans(y="log2")
gp=gp+theme(text=element_text(face="bold",family="serif"),panel.grid=element_blank(),
            axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
            strip.text=element_text(size=20),strip.background=element_rect("white"),
            axis.title=element_text(size=17),axis.text.x=element_text(size=12),
            legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)

#dev.off()

Aw1 transcriptome