This is product offers a workflow to download fastq files an perform differential gene expession analyis using kallisto for alignment and DESeq to statistal analysis.
A few weeks ago I perfected software installation, so I will not demonstrate that here. Please see this notebook for more.
| Sample | SampleID |
| D-control | D54 |
| D-control | D55 |
| D-control | D56 |
| D-control | D57 |
| D-control | D58 |
| D-control | D59 |
| D-control | M45 |
| D-control | M46 |
| D-control | M48 |
| D-control | M49 |
| D-control | M89 |
| D-control | M90 |
| D-desiccation | N48 |
| D-desiccation | N49 |
| D-desiccation | N50 |
| D-desiccation | N51 |
| D-desiccation | N52 |
| D-desiccation | N53 |
| D-desiccation | N54 |
| D-desiccation | N55 |
| D-desiccation | N56 |
| D-desiccation | N57 |
| D-desiccation | N58 |
| D-desiccation | N59 |
the files are located at https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/
cd ../data
wget --recursive --no-parent --no-directories \
--accept '*001.fastq.gz' \
https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/cd ../data
/home/shared/FastQC/fastqc *fastq.gz -o ../outputeval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
conda activate
which multiqc
cd ../output
multiqc .MultiQC report available at https://gannet.fish.washington.edu/seashell/bu-github/steven-coursework/assignments/output/multiqc_report.html
cd ../data
curl -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna/home/shared/kallisto/kallisto \
index -i \
../data/cgigas_roslin_rna.index \
../data/rna.fnamkdir ../output/kallisto_01
find ../data/*_L001_R1_001.fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 40 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gzperl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
--gene_trans_map none \
--out_prefix ../output/kallisto_01 \
--name_sample_by_basedir \
../output/kallisto_01/D54_S145/abundance.tsv \
../output/kallisto_01/D56_S136/abundance.tsv \
../output/kallisto_01/D58_S144/abundance.tsv \
../output/kallisto_01/M45_S140/abundance.tsv \
../output/kallisto_01/M48_S137/abundance.tsv \
../output/kallisto_01/M89_S138/abundance.tsv \
../output/kallisto_01/D55_S146/abundance.tsv \
../output/kallisto_01/D57_S143/abundance.tsv \
../output/kallisto_01/D59_S142/abundance.tsv \
../output/kallisto_01/M46_S141/abundance.tsv \
../output/kallisto_01/M49_S139/abundance.tsv \
../output/kallisto_01/M90_S147/abundance.tsv \
../output/kallisto_01/N48_S194/abundance.tsv \
../output/kallisto_01/N50_S187/abundance.tsv \
../output/kallisto_01/N52_S184/abundance.tsv \
../output/kallisto_01/N54_S193/abundance.tsv \
../output/kallisto_01/N56_S192/abundance.tsv \
../output/kallisto_01/N58_S195/abundance.tsv \
../output/kallisto_01/N49_S185/abundance.tsv \
../output/kallisto_01/N51_S186/abundance.tsv \
../output/kallisto_01/N53_S188/abundance.tsv \
../output/kallisto_01/N55_S190/abundance.tsv \
../output/kallisto_01/N57_S191/abundance.tsv \
../output/kallisto_01/N59_S189/abundance.tsvcountmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)## D54_S145 D56_S136 D58_S144 M45_S140 M48_S137 M89_S138 D55_S146
## XM_011416156.3 0 0 0 0 0 0.0000 0
## XM_034446348.1 0 0 2 0 0 0.0000 0
## XM_034459753.1 0 0 0 0 0 0.0000 0
## XR_004602598.1 0 0 0 0 0 3.3511 0
## XM_034463888.1 0 0 0 0 0 0.0000 0
## XR_004601019.1 0 1 0 0 0 0.0000 4
## D57_S143 D59_S142 M46_S141 M49_S139 M90_S147 N48_S194 N50_S187
## XM_011416156.3 0 0 0 0 0 0 0
## XM_034446348.1 0 0 1 0 0 1 0
## XM_034459753.1 0 0 0 0 0 0 0
## XR_004602598.1 0 0 0 0 0 0 0
## XM_034463888.1 0 0 0 0 0 0 0
## XR_004601019.1 0 0 0 0 0 0 0
## N52_S184 N54_S193 N56_S192 N58_S195 N49_S185 N51_S186 N53_S188
## XM_011416156.3 0.00000 0.00000 0 0 0 0 0.00000
## XM_034446348.1 0.00000 1.00000 0 0 0 0 0.00000
## XM_034459753.1 0.00000 0.00000 0 0 0 0 0.00000
## XR_004602598.1 5.98477 3.27488 0 0 0 0 1.15991
## XM_034463888.1 0.00000 0.00000 0 0 0 0 0.00000
## XR_004601019.1 2.00000 0.00000 0 0 0 0 0.00000
## N55_S190 N57_S191 N59_S189
## XM_011416156.3 0 0 0.0000
## XM_034446348.1 0 0 0.0000
## XM_034459753.1 0 0 12.9937
## XR_004602598.1 0 0 0.0000
## XM_034463888.1 0 0 0.0000
## XR_004601019.1 0 0 0.0000
countmatrix <- round(countmatrix, 0)
str(countmatrix)## 'data.frame': 73307 obs. of 24 variables:
## $ D54_S145: num 0 0 0 0 0 0 0 2 0 0 ...
## $ D56_S136: num 0 0 0 0 0 1 0 2 0 1 ...
## $ D58_S144: num 0 2 0 0 0 0 0 5 0 0 ...
## $ M45_S140: num 0 0 0 0 0 0 0 4 1 1 ...
## $ M48_S137: num 0 0 0 0 0 0 9 5 1 0 ...
## $ M89_S138: num 0 0 0 3 0 0 0 16 1 1 ...
## $ D55_S146: num 0 0 0 0 0 4 0 7 1 1 ...
## $ D57_S143: num 0 0 0 0 0 0 5 5 0 0 ...
## $ D59_S142: num 0 0 0 0 0 0 0 6 1 0 ...
## $ M46_S141: num 0 1 0 0 0 0 0 9 0 1 ...
## $ M49_S139: num 0 0 0 0 0 0 0 7 1 0 ...
## $ M90_S147: num 0 0 0 0 0 0 0 5 0 1 ...
## $ N48_S194: num 0 1 0 0 0 0 0 4 1 1 ...
## $ N50_S187: num 0 0 0 0 0 0 0 6 0 1 ...
## $ N52_S184: num 0 0 0 6 0 2 2 4 0 0 ...
## $ N54_S193: num 0 1 0 3 0 0 0 12 0 1 ...
## $ N56_S192: num 0 0 0 0 0 0 0 2 0 0 ...
## $ N58_S195: num 0 0 0 0 0 0 0 8 0 0 ...
## $ N49_S185: num 0 0 0 0 0 0 2 13 0 1 ...
## $ N51_S186: num 0 0 0 0 0 0 0 7 0 0 ...
## $ N53_S188: num 0 0 0 1 0 0 0 2 0 2 ...
## $ N55_S190: num 0 0 0 0 0 0 0 7 0 1 ...
## $ N57_S191: num 0 0 0 0 0 0 0 2 0 0 ...
## $ N59_S189: num 0 0 13 0 0 0 0 5 0 0 ...
dim(countmatrix)
dim(deseq2.colData)length(colnames(data))deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))),
type=factor(rep("single-read", 24)))
rownames(deseq2.colData) <- colnames(data)
deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix,
colData = deseq2.colData,
design = ~ condition)deseq2.dds <- DESeq(deseq2.dds)
deseq2.res <- results(deseq2.dds)
deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ]vsd <- vst(deseq2.dds, blind = FALSE)
plotPCA(vsd, intgroup = "condition")# Select top 50 differentially expressed genes
res <- results(deseq2.dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:50]
# Extract counts and normalize
counts <- counts(deseq2.dds, normalized = TRUE)
counts_top <- counts[top_genes, ]
# Log-transform counts
log_counts_top <- log2(counts_top + 1)
# Generate heatmap
pheatmap(log_counts_top, scale = "row")head(deseq2.res)## log2 fold change (MLE): condition desicated vs control
## Wald test p-value: condition desicated vs control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## NM_001305288.1 0.180896 1.0376053 3.002647 0.345564 7.29671e-01
## NM_001305289.1 0.883924 -2.8199991 1.067976 -2.640509 8.27816e-03
## NM_001305290.1 145.563666 0.4576780 0.116258 3.936753 8.25915e-05
## NM_001305291.1 0.263385 0.5540863 1.585370 0.349500 7.26714e-01
## NM_001305292.1 2.903611 -1.2204901 0.763903 -1.597703 1.10109e-01
## NM_001305293.1 233.891565 0.0644989 0.131819 0.489298 6.24631e-01
## padj
## <numeric>
## NM_001305288.1 NA
## NM_001305289.1 NA
## NM_001305290.1 0.00956885
## NM_001305291.1 NA
## NM_001305292.1 0.58676490
## NM_001305293.1 0.95304181
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])## [1] 609 6
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="DEG Dessication (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")# Prepare the data for plotting
res_df <- as.data.frame(deseq2.res)
res_df$gene <- row.names(res_df)
# Create volcano plot
volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("grey", "red")) +
labs(title = "Volcano Plot",
x = "Log2 Fold Change",
y = "-Log10 Adjusted P-value",
color = "Significantly\nDifferentially Expressed") +
theme_minimal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "top")
print(volcano_plot)write.table(tmp.sig, "../output/DEGlist.tab", sep = '\t', row.names = T)deglist <- read.csv("../output/DEGlist.tab", sep = '\t', header = TRUE)
deglist$RowName <- rownames(deglist)
deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columnsdatatable(deglist)cg_sp <- read.csv("https://raw.githubusercontent.com/sr320/nb-2022/main/C_gigas/analyses/CgR-blastp-sp.tab", header = FALSE, sep="\t") %>%
distinct(V1, .keep_all = TRUE)loc <- read.csv("https://raw.githubusercontent.com/sr320/nb-2022/main/C_gigas/analyses/LOC_Acc.tab", sep = " ", header = FALSE)comb <- left_join(loc, cg_sp, by = c("V2" = "V1")) %>%
left_join(deglist, by = c("V1" = "RowName"))gene_deg_status <- res_df %>%
mutate(degstaus = ifelse(padj < 0.05, 1, 0)) # Read the FASTA file
fasta_data <- readDNAStringSet("../data/rna.fna")
# Calculate gene lengths
gene_lengths <- width(fasta_data)
# Extract gene names/IDs from sequence IDs
gene_names <- sapply(names(fasta_data), function(x) strsplit(x, " ")[[1]][1])
# Create a data frame with gene IDs and lengths
gene_lengths_df <- data.frame(geneID = gene_names, length = gene_lengths)pwf <- nullp(gene_data, bias.data = gene_lengths)
GO_analysis <- goseq(pwf, gene2cat = "your_organism_GO_mapping")