Dear user. Welcome to the analysis of the Aw1 transcriptomic data. This data was generated by the Vontas group at IMBB in Heraklion, Crete and reperesents transcriptomics on Helicoverpa Zea Aw1 cells derived from the midgut. 4 replicates each were given for free-floating and cells in spheroid like clusters. Panos Ioannidis should be acknowledged for guidance o parts of this script most especially the initial steps with read processing.
All raw reads were processed on a large server to make the run time much faster and all commands in this section are written in shell (bash).
We start with all of our raw reads in a folder called ‘raw_zipped_fastq’ which has all 16 zipped files (8 pairs of reads; 4 replicates each sample). The first thing is to unzip these and check them for quality with FastQC. This chunk will do both of those things and output all of the read pairs in folders within the “Samples” directory.
mkdir FastQC
mkdir Samples
for i in ./raw_zipped_fastq/*.gz
do
sample=$(echo $(basename $i) | cut -d'.' -f 1 | cut -d '_' -f 1,2)
fq_name=$(echo $i | cut -d'.' -f 1,2,3)
unpigz -k -p 16 $i
## Run fastqc
/data2/shane/Applications/FastQC/fastqc $i
## move files
mkdir ./Samples/$sample
mv $fq_name ./Samples/$sample
mv ./raw_zipped_fastq/*.html ./FastQC
mv ./raw_zipped_fastq/*.zip ./FastQC
done
We can now use the Hisat2 program to the reference genome generated in by Pearce et. al (2017; BMC biology).
##### Use Hisat2 to map reads to genome
hisat2-build -p 16 ./Hz_genome/Hz_genome.fna ./Hz_genome/Hz_genome_index
for i in ./Samples/Aw1*
do
base=$(echo $(basename $i))
cd $i
hisat2 --dta-cufflinks -p 36 -x ../../Hz_genome/Hz_genome_index -1 $base'_1.fastq' -2 $base'_2.fastq' -S out.sam > hisat2.stdout_stderr 2>&1
samtools view -b --threads 36 out.sam > out.bam 2> sam2bam.stderr
cd ../../
done
Finally, we can use the featureCounts program to see how many genes fall within each of the annotated exons within the predicted genes in H. zea.
mkdir counts_output
featureCounts -F 'GTF' -M -s 2 -T 24 -p -t CDS -g gene_id -a ./Hz_genome/Hz_GTF_R.gtf -o ./counts_output/counts.txt ./Samples/Aw1*/out.bam
Rscript ./scripts/counts_to_tpm.R ./counts_output/counts.txt > ./counts_output/Aw1_TPM.tsv
Rscript ./scripts/Aw1_count_merge.R
First we’lll get some packages installed and set our working directory etc.
shhh <- suppressPackageStartupMessages
shhh(library(data.table))
shhh(library(tidyr))
shhh(library(dplyr))
shhh(library(ggplot2))
shhh(library(knitr))
shhh(library(edgeR))
shhh(library(ggsci))
shhh(library(factoextra))
shhh(library(scales))
source('./inputs/Aw1_functions.R')
dir.create('plots',showWarnings = F)
dir.create('tables',showWarnings = F)
First we will import and clean up our files a bit. This will include:
## import helicoverpa gene annotations
annot=fread('./inputs/Ha_Hz_master_key_by_gene.csv',select=c('Hz_OGS','annot','pearce_annot','ha_geneid'))
colnames(annot)=c('Hz_OGS','Annotation','pearce_annot','ha_geneid')
##Import table of 1:1 orthologues
oto=fread('./inputs/Ha_Dm_1to1_orthologues.csv')
##import smooth septate junction markers
dm.ssj=fread('./inputs/Dm_ssj_genes.csv')
## import gut cell type markers
dm.marker=fread('./inputs/Annotated_marker_candidates.csv')
colnames(dm.marker)=gsub('fbgn','Dm_FBgn',colnames(dm.marker))
## Import "raw" counts data into R and clean
counts=fread('./inputs/Aw1_counts.tsv') %>%
select(Geneid,Length,matches('./Samples')) %>% rename(Hz_OGS=Geneid)
colnames(counts)=gsub('./Samples/','',colnames(counts))
colnames(counts)=gsub('/out.bam','',colnames(counts))
colnames(counts)=gsub('Aw1_','AW1_FF_',colnames(counts))
colnames(counts)=gsub('Aw1Sph_','AW1_SL_',colnames(counts))
fwrite(counts, './tables/counts_clean.csv')
cols=colnames(counts)
iter=grep("AW1",cols)
normlen=counts$Length/1000
tpm.raw=select(counts,Hz_OGS,Length)
for(i in iter){
relevant.column=counts[[i]]
rpk=relevant.column/normlen
sc=sum(rpk)/1000000
tpm=rpk/sc
#colnam1=unlist(strsplit(cols[i],'/'))
colnam2=cols[i]
tpm.raw[[colnam2]]=tpm
}
tpm.aw1.sum=select(tpm.raw,AW1_FF_A:AW1_FF_D) %>% rowMeans()
tpm.sph.sum=select(tpm.raw,AW1_SL_A:AW1_SL_D) %>% rowMeans()
tpm.total=tpm.raw %>%
mutate(AW1_FF_mean=tpm.aw1.sum,AW1_SL_mean=tpm.sph.sum) %>%
mutate(fold_change_spheroid=AW1_SL_mean/AW1_FF_mean) %>%
mutate(fold_change_normal=AW1_FF_mean/AW1_SL_mean) %>%
data.table()
tpm.sum=select(tpm.total,Hz_OGS,Length,AW1_FF_mean,AW1_SL_mean,fold_change_spheroid,fold_change_normal)
########################### PCA analysis
tpm.pca=tpm.raw %>% select(AW1_FF_A:AW1_SL_D) %>% as.matrix() %>% t()
groups=c(rep('AW1_FF',4),rep('AW1_SL',4))
cols=c(rep('red',4),rep('blue',4))
PCA1=prcomp(tpm.pca)
#tiff('./plots/Aw1_PCA.tiff',width=20,height=12,units='cm',res=300,compression='none')
fviz_pca_ind(PCA1,
col.ind = groups,
palette = c("red","blue"),
repel = TRUE,
addEllipses = TRUE, # Concentration ellipses
ellipse.type = "confidence",
legend.title = "Groups",# Avoid text overlapping
title='AW1 cell PCA'
)
#dev.off()
counts.de=counts %>% select(-Length)
rownames(counts.de)=counts$Hz_OGS
counts.de$Hz_OGS=NULL
group=factor(c(rep("AW1_FF",4),rep("AW1_SL",4)))
delist=DGEList(counts=counts.de,group=group)
### filter
keep=filterByExpr(delist)
delist=delist[keep, , keep.lib.sizes=FALSE]
## Add normalization
delist=calcNormFactors(delist)
delist = estimateCommonDisp(delist)
delist = estimateTagwiseDisp(delist)
##perform test
et = exactTest(delist, pair=c("AW1_FF", "AW1_SL"))
tTags = topTags(et,n=NULL)
output=tTags$table
row.index=rownames(output) %>% as.numeric()
genes=counts[row.index]$Hz_OGS
output$Hz_OGS=genes
allgenes=select(counts,Hz_OGS)
all.de=data.table(sampleA="AW1_FF", sampleB="AW1_SL", output) %>% merge(allgenes,by='Hz_OGS',all=T) %>%
merge(annot,by='Hz_OGS',all.x=T) %>% arrange(FDR) %>%
data.table()
only.significant=all.de %>% filter(FDR<1e-5) %>% select(Hz_OGS:Annotation) %>% arrange(Hz_OGS) %>% data.table()
a=tpm.sum[Hz_OGS %in% only.significant$Hz_OGS] %>% arrange(Hz_OGS) %>% select(-Hz_OGS) %>% data.table()
final.de.table=cbind(only.significant,a) %>% arrange(FDR) %>% data.table()
fwrite(final.de.table,'./tables/Aw1_Sph_DEG.csv')
kable(head(select(final.de.table,FDR,Hz_OGS,logFC,FDR,Annotation,AW1_FF_mean,AW1_SL_mean)))
| FDR | Hz_OGS | logFC | Annotation | AW1_FF_mean | AW1_SL_mean |
|---|---|---|---|---|---|
| 0 | HzOG215002 | 5.376077 | uncharacterized protein LOC110383210 isoform X1 | 0.0486045 | 1.1522932 |
| 0 | HzOG211173 | 3.996777 | 17-beta-hydroxysteroid dehydrogenase 14-like | 3.5959812 | 28.3643052 |
| 0 | HzOG207045 | 2.662483 | carboxylesterase 1E | 10.1907903 | 35.6623522 |
| 0 | HzOG208829 | 5.433998 | putative inorganic phosphate cotransporter | 0.0172533 | 0.4390903 |
| 0 | HzOG209290 | 4.081268 | prostaglandin reductase 1-like isoform X1 | 1.0647131 | 8.9260595 |
| 0 | HzOG212628 | -3.721933 | dual oxidase maturation factor 1 isoform X1 | 2.6333858 | 0.1238758 |
ssj.marker=merge(oto,dm.ssj,by='Dm_FBgn') %>%
merge(annot,by='ha_geneid') %>%
unique.data.frame() %>%
select(ha_geneid,Dm_FBgn,Annotation,Hz_OGS) %>%
merge(tpm.sum,annot='Hz_OGS') %>%
filter(!is.infinite(fold_change_spheroid)) %>%
mutate(volc=log2(fold_change_spheroid)) %>%
data.table()
all.de$threshold = 1
all.de$threshold[all.de$FDR<1e-5]=2
all.de$threshold[all.de$ha_geneid %in% ssj.marker$ha_geneid]=3
all.de$threshold=as.factor(all.de$threshold)
all.de2=all.de[logFC!=0]
fwrite(ssj.marker,'./tables/SSJ_marker_expression.csv')
## make volcano plot
#tiff('./plots/volcano_plot.tiff',width=24,height=12,units='cm',res=300,compression='none')
gp=ggplot(data=all.de2, aes(x=logFC, y =-log10(FDR), color=threshold))
gp=gp+geom_point(alpha=.5, size=1.75)
gp=gp+geom_point(data=all.de2[threshold==3],alpha=.8, size=1.75)
gp=gp+lims(x=c(-6, 6))
gp=gp+scale_colour_manual(values=c('black','red','green'))
gp=gp+xlab("\nUpregulated in Free-Floating <-------- LogFC --------> Upregulated in Spheroid-Like") + ylab("-log10 FDR\n")
gp=gp+theme_bw()
gp=gp+theme(text=element_text(face="bold",family="serif"),panel.grid=element_blank(),
axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
axis.title=element_text(size=14),axis.text.x=element_text(size=10),
legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)
#whatever <- dev.off()
### GO analysis
go.codes=fread('./inputs/GO_annotations_total.txt') %>%
rename(code_name=GO_code)
go.pearce=fread('./inputs/Pearce_2017_GO.csv',header=F,col.names = c('Hz_OGS','GO_term'))
go.pearce$Hz_OGS=gsub('HaOG','',go.pearce$Hz_OGS)
go.de.fr=final.de.table[logFC<0]
go.de.sp=final.de.table[logFC>0]
go.de.fr$Hz_OGS=gsub('HzOG','',go.de.fr$Hz_OGS)
go.de.sp$Hz_OGS=gsub('HzOG','',go.de.sp$Hz_OGS)
full.go.fr=merge(go.de.fr,go.pearce,by='Hz_OGS')
full.go.sp=merge(go.de.sp,go.pearce,by='Hz_OGS')
fwrite(full.go.fr,'./tables/Full_GO_free.csv')
fwrite(full.go.fr,'./tables/Full_GO_spheroid.csv')
total.table=freq.table(go.pearce)
sub.table.fr=freq.table(full.go.fr)
sub.table.sp=freq.table(full.go.sp)
go.scores.fr=vlepo.enrich(sub.table.fr,total.table) %>% merge(go.codes,by='code_name') %>% filter(fdr<1e-1) %>% filter(test_N>5) %>% arrange(fdr)
go.scores.sp=vlepo.enrich(sub.table.sp,total.table) %>% merge(go.codes,by='code_name') %>% filter(fdr<1e-1) %>% filter(test_N>5) %>% arrange(fdr)
fwrite(go.scores.fr,'./tables/GO_final_free.csv')
fwrite(go.scores.sp,'./tables/GO_final_spheroid.csv')
kable(head(go.scores.sp))
| code_name | fischP | fdr | test_N | test_total | ref_N | ref_total | total | test_freq | ref_freq | GO_name | GO_cat |
|---|---|---|---|---|---|---|---|---|---|---|---|
| GO:0006950 | 0.0000354 | 0.0051861 | 6 | 190 | 49 | 17089 | 190 | 0.0315789 | 0.0028673 | response to stress | biological_process |
| GO:0016491 | 0.0000196 | 0.0051861 | 17 | 190 | 428 | 17089 | 190 | 0.0894737 | 0.0250454 | oxidoreductase activity | molecular_function |
| GO:0006118 | 0.0002262 | 0.0204858 | 9 | 190 | 171 | 17089 | 190 | 0.0473684 | 0.0100064 | obsolete electron transport | biological_process |
| GO:0009055 | 0.0005806 | 0.0243032 | 7 | 190 | 119 | 17089 | 190 | 0.0368421 | 0.0069635 | electron transfer activity | molecular_function |
Now lets see how our transcriptomic data looks with a bit more of a hypothesis driven approach. Firstly we will take a look at how different drosophila markers are expressed, in the normal (non-spheroid) cells. To do this we’ll use the one to one (oto) table which contains all 1:1 orthologues between Helicverpa and Drosophila
Lets start by looking at cell types. This chunk of code will find the 1:1 orthologues between Drosophila cell type markers and write the table to the Cell_marker_expression_values.csv file.
cell.marker=dm.marker %>% merge(annot,by='Hz_OGS') %>% unique.data.frame() %>%
select(ha_geneid,Dm_FBgn,project_id,wide_group,gene,Annotation,Hz_OGS) %>%
merge(tpm.sum,key='Hz_OGS') %>%
mutate(log_transform=log2(AW1_FF_mean + 1 - min(AW1_FF_mean))) %>%
data.table()
dup=dm.marker[dm.marker$gene %>% duplicated()]$gene
dm.marker.uni=dm.marker[!gene %in% dup]
cell.marker=dm.marker.uni %>%
merge(tpm.sum,key='Hz_OGS') %>% select(Hz_OGS,wide_group,gene,AW1_FF_mean,AW1_SL_mean) %>%
melt(measure.vars=c('AW1_FF_mean','AW1_SL_mean'),variable.name='sample',value.name='expression.level') %>%
mutate(expression.log=log2(expression.level)) %>%
data.table()
cell.marker$logplot=ifelse(!(cell.marker$expression.log>0),0,cell.marker$expression.log)
fwrite(cell.marker,'./tables/Cell_marker_expression_values.csv')
kable(head(cell.marker))
| Hz_OGS | wide_group | gene | sample | expression.level | expression.log | logplot |
|---|---|---|---|---|---|---|
| HzOG201659 | EE | Dh31 | AW1_FF_mean | 0.0972036 | -3.362847 | 0.000000 |
| HzOG202243 | EE | NPFR | AW1_FF_mean | 0.1352005 | -2.886827 | 0.000000 |
| HzOG203257 | EC | vnd | AW1_FF_mean | 0.0517951 | -4.271040 | 0.000000 |
| HzOG204062 | EC | cad | AW1_FF_mean | 0.0428097 | -4.545919 | 0.000000 |
| HzOG204948 | ISC | mys | AW1_FF_mean | 75.5092847 | 6.238582 | 6.238582 |
| HzOG206146 | ISC | Rel | AW1_FF_mean | 49.0470339 | 5.616094 | 5.616094 |
Now lets see how the expression looks. The plot will be split by markers identified by mantha and markers identified from the harvard website.
We can also save this plot as a pdf
##Plot cell type markers
#tiff('./plots/Cell_type_markers.tiff',width=20,height=12,units='cm',res=300,compression='none')
gp=ggplot(cell.marker,aes(x=wide_group,y=logplot,fill=wide_group))
gp=gp+facet_grid(.~sample)
gp=gp+geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 1,
colour = "black", position = position_dodge(width = 0.5),
dotsize = .35)
gp=gp+labs(x='\nCell Type',y='Log2 of Expression Value\n')
gp=gp+scale_fill_rickandmorty()
#gp=gp+scale_y_log10(breaks = trans_breaks("log2", function(x) 2^x),labels = trans_format("log2", math_format(2^.x)))
#gp=gp+scale_fill_continuous(name = "n occurrences", trans="pseudo_log")
#gp=gp+scale_y_continuous(pseudo_log_trans())
gp=gp+scale_y_continuous(breaks=seq(0,10,1))
#gp=gp+coord_trans(y="log2")
gp=gp+theme(text=element_text(face="bold",family="serif"),panel.grid=element_blank(),
axis.ticks.x=element_line(),panel.border=element_rect(colour="black",fill=NA),
strip.text=element_text(size=20),strip.background=element_rect("white"),
axis.title=element_text(size=17),axis.text.x=element_text(size=12),
legend.position = 'none',plot.title = element_text(hjust = 0.5))
print(gp)
#dev.off()