Download oyster reference transcriptome from Gannet with
curl. Make sure you change your directory to the location
where you want to save the file.
cd ~/github/zach-coursework/assignments/data
curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna
Use the index function in Kallisto to create an index for use in transcript quantification.
/home/shared/kallisto/kallisto \
index -i \
~/github/zach-coursework/assignments/data/cgigas_roslin_rna.index \
~/github/zach-coursework/assignments/data/rna.fna
Download RNA-seq data for each of your samples, dessication stress vs
control. Make sure you change your directory to where you want the files
to be located. This code completes a recursive wget
download to retrieve all sample files from Gannet.
cd ~/github/zach-coursework/assignments/data
wget --recursive --no-parent --no-directories \
--no-check-certificate \
--accept '*.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
Create a directory to house the output of the Kallisto quantification.
mkdir ~github/zach-coursework/assignments/output/kallisto_01
Run kallisto to quantify transcript abundances for each sample. This code uses xargs to run Kallisto transcript abundance quantification for each sample file.
find ~/github/zach-coursework/assignments/data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ~/github/zach-coursework/assignments/data/cgigas_roslin_rna.index \
-o ~/github/zach-coursework/assignments/output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ~/github/zach-coursework/assignments/data/{}_L001_R1_001.fastq.gz
This code runs the abundance_estimates_to_matrix.pl
script from the Trinity RNA-Seq assembly software package. The script
takes in abundance estimates generated by the Kallisto RNA-Seq
quantification tool, and outputs a matrix of gene expression values.
--est_method kallisto: specifies the estimation method
used to generate the abundance estimates.--gene_trans_map none: specifies that there is no
gene-to-transcript mapping file provided.--out_prefix ~/github/zach-coursework/assignments/output/kallisto_01:
specifies the path and filename prefix for the output files.--name_sample_by_basedir: specifies that the sample
names should be derived from the directory names where the input files
are located.perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
--gene_trans_map none \
--out_prefix ~/github/zach-coursework/assignments/output/kallisto_01 \
--name_sample_by_basedir \
~/github/zach-coursework/assignments/output/kallisto_01/D54_S145/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/D56_S136/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/D58_S144/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/M45_S140/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/M48_S137/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/M89_S138/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/D55_S146/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/D57_S143/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/D59_S142/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/M46_S141/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/M49_S139/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/M90_S147/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N48_S194/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N50_S187/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N52_S184/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N54_S193/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N56_S192/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N58_S195/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N49_S185/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N51_S186/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N53_S188/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N55_S190/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N57_S191/abundance.tsv \
~/github/zach-coursework/assignments/output/kallisto_01/N59_S189/abundance.tsv
You will need the following packages to run DESeq2 and create figures.
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("DESeq2")
install.packages(pheatmap)
library(DESeq2)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)
countmatrix <- read.delim("~/github/zach-coursework/assignments/output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)
## D54_S145 D56_S136 D58_S144 M45_S140 M48_S137 M89_S138 D55_S146
## XM_011442158.3 0.0000 0.0000 0.0000 0.000 0.0000 0.0000 0.000
## XM_034455365.1 12.0000 5.0000 1.0000 2.000 5.0000 4.0000 10.000
## XM_034482524.1 10.0000 13.0000 8.0000 24.000 10.0000 11.0000 9.000
## NM_001305301.1 86.3018 98.6476 92.5927 190.752 99.5275 82.5806 104.933
## XM_011443474.3 0.0000 0.0000 0.0000 0.000 0.0000 0.0000 0.000
## XM_011418006.3 0.0000 0.0000 0.0000 0.000 11.5983 0.0000 0.000
## D57_S143 D59_S142 M46_S141 M49_S139 M90_S147 N48_S194 N50_S187
## XM_011442158.3 0.000 0.0000 0.0000 0.000 0.000 0.000 0.0000
## XM_034455365.1 13.000 6.0000 16.0000 12.000 5.000 10.000 9.0000
## XM_034482524.1 16.000 18.0000 18.0000 14.000 18.000 8.000 17.0000
## NM_001305301.1 117.826 60.1965 31.5452 111.119 119.549 117.143 92.1648
## XM_011443474.3 0.000 0.0000 0.0000 0.000 0.000 0.000 0.0000
## XM_011418006.3 0.000 0.0000 0.0000 0.000 0.000 0.000 0.0000
## N52_S184 N54_S193 N56_S192 N58_S195 N49_S185 N51_S186
## XM_011442158.3 0.000 1.07834e-08 0.00000 0.000 0.000 0.000
## XM_034455365.1 5.000 8.00000e+00 4.00000 4.000 4.000 12.000
## XM_034482524.1 13.000 1.10000e+01 0.00000 16.000 7.000 14.000
## NM_001305301.1 164.783 9.60914e+01 8.59238 144.308 126.113 127.083
## XM_011443474.3 0.000 0.00000e+00 0.00000 0.000 0.000 0.000
## XM_011418006.3 0.000 0.00000e+00 0.00000 0.000 0.000 0.000
## N53_S188 N55_S190 N57_S191 N59_S189
## XM_011442158.3 0.000 0.000 0.0000 0.00000e+00
## XM_034455365.1 7.000 2.000 2.0000 9.00000e+00
## XM_034482524.1 4.000 9.000 12.0000 2.30000e+01
## NM_001305301.1 104.362 123.268 32.3209 3.86455e+01
## XM_011443474.3 0.000 0.000 0.0000 2.45518e-07
## XM_011418006.3 0.000 0.000 0.0000 0.00000e+00
Round the values in the countmatrix dataframe to the nearest integer
by using the round function. Check with
str.
countmatrix <- round(countmatrix, 0)
str(countmatrix)
## 'data.frame': 73307 obs. of 24 variables:
## $ D54_S145: num 0 12 10 86 0 0 0 0 0 0 ...
## $ D56_S136: num 0 5 13 99 0 0 0 2 0 0 ...
## $ D58_S144: num 0 1 8 93 0 0 0 0 107 21 ...
## $ M45_S140: num 0 2 24 191 0 0 0 0 0 0 ...
## $ M48_S137: num 0 5 10 100 0 12 0 0 0 8 ...
## $ M89_S138: num 0 4 11 83 0 0 0 0 0 23 ...
## $ D55_S146: num 0 10 9 105 0 0 0 2 0 27 ...
## $ D57_S143: num 0 13 16 118 0 0 0 5 0 0 ...
## $ D59_S142: num 0 6 18 60 0 0 0 2 0 21 ...
## $ M46_S141: num 0 16 18 32 0 0 0 3 0 7 ...
## $ M49_S139: num 0 12 14 111 0 0 0 1 0 8 ...
## $ M90_S147: num 0 5 18 120 0 0 0 1 0 1 ...
## $ N48_S194: num 0 10 8 117 0 0 0 0 0 0 ...
## $ N50_S187: num 0 9 17 92 0 0 0 0 0 4 ...
## $ N52_S184: num 0 5 13 165 0 0 0 0 0 6 ...
## $ N54_S193: num 0 8 11 96 0 0 0 0 0 4 ...
## $ N56_S192: num 0 4 0 9 0 0 0 0 0 0 ...
## $ N58_S195: num 0 4 16 144 0 0 0 0 0 0 ...
## $ N49_S185: num 0 4 7 126 0 0 0 0 0 6 ...
## $ N51_S186: num 0 12 14 127 0 0 0 1 0 0 ...
## $ N53_S188: num 0 7 4 104 0 0 0 0 0 7 ...
## $ N55_S190: num 0 2 9 123 0 0 0 1 0 6 ...
## $ N57_S191: num 0 2 12 32 0 0 1 1 0 12 ...
## $ N59_S189: num 0 9 23 39 0 0 0 2 0 5 ...
deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))),
type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
colData = deseq2.colData,
design = ~ condition)
This code performs differential expression analysis on the provided gene count matrix using the DESeq2 R package, as follows:
deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]
head(deseq2.res)
## log2 fold change (MLE): condition desicated vs control
## Wald test p-value: condition desicated vs control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## NM_001305288.1 0.181270 1.0453698 3.002647 0.348149 7.27728e-01
## NM_001305289.1 0.881457 -2.8119577 1.068276 -2.632239 8.48240e-03
## NM_001305290.1 145.913728 0.4580323 0.116185 3.942251 8.07203e-05
## NM_001305291.1 0.261701 0.5618449 1.587076 0.354013 7.23329e-01
## NM_001305292.1 2.902430 -1.2181330 0.763421 -1.595624 1.10573e-01
## NM_001305293.1 234.342117 0.0663449 0.131969 0.502731 6.15154e-01
## padj
## <numeric>
## NM_001305288.1 NA
## NM_001305289.1 NA
## NM_001305290.1 0.00956401
## NM_001305291.1 NA
## NM_001305292.1 0.59541971
## NM_001305293.1 0.95562321
Calculate the number of differentially expressed genes (DEGs) with an adjusted p-value less than or equal to 0.05. It does this by subsetting the deseq2.res object for rows where the adjusted p-value is not NA and is less than or equal to 0.05, and then returning the dimensions of this subset. Since the deseq2.res object contains a row for each gene, the number of rows in this subset corresponds to the number of DEGs.
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
## [1] 607 6
This code generates a scatterplot of log2 fold changes against the mean normalized counts for all genes in the dataset, and highlights genes that are differentially expressed with adjusted p-value less than or equal to 0.05.
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="DEG Dessication (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 16484 x values <= 0 omitted
## from logarithmic plot
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
Write table to a tab separated file, found here
write.table(tmp.sig, "~/github/zach-coursework/assignments/output/DEGlist.tab", row.names = T)