Following Chris Stubben’s methods
Load necessary packages
suppressPackageStartupMessages({
library(DESeq2)
library(readr)
library(hciR)
library(hciRdata)
library(DT)
library(ggplot2)
})
Load sample metadata
samples<-read_tsv("~/WUSTL chemo v naive Metdata RNAseq.txt")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Histology = col_character(),
## Sample = col_character(),
## Treatment = col_character()
## )
datatable(samples)
Load raw counts data
counts <- read_tsv("~/RPM_CvN_bulkRNAWUSTL.txt")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## gene_id = col_character(),
## PB513 = col_double(),
## PB511 = col_double(),
## PB512 = col_double(),
## PB518 = col_double(),
## PB510 = col_double(),
## PB517 = col_double(),
## PB520 = col_double(),
## AI197 = col_double(),
## AI192 = col_double(),
## PB384 = col_double(),
## PB383 = col_double(),
## PB431 = col_double(),
## PB442 = col_double(),
## AI187 = col_double()
## )
datatable(counts)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
Remove features with zero counts and features with 5 or fewer reads in every sample.
counts <- filter_counts(counts, n = 5)
## Removed 19022 features with 0 reads
## Removed 18316 features with <=5 maximum reads
Add biotype, mouse 94, and remove non-coding genes
counts <- semi_join(counts, filter(mouse94, biotype=="protein_coding"), by=c(gene_id="id"))
counts <- semi_join(counts, filter(mouse94,
!substr(gene_name, 1,3) %in% c("His", "Rpl", "Rps", "Pcd")), by=c(gene_id="id"))
datatable(counts)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
Run DESeq using ~Treatment in the design formula and get the regularized log transforms (rlog) for sample visualizations
dds <- deseq_from_tibble(counts, samples, minReplicates = 7, design = ~ Treatment)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 71 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
dds <- estimateSizeFactors(dds)
invisible(counts(dds, normalized=TRUE))
rld <- r_log(dds)
Plot the first two principal components using the rlog values from the top 500 variable genes.
plot_pca(rld, "Treatment", tooltip=c("Histology", "Sample", "Treatment"), width=600, point_size=20)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
PB383 and PB384 are very distinct chemo treated samples from remaining.
Compare chemo vs naive RPM tumors using 5% FDR
res1<-results_all(dds,mouse100)
## Using adjusted p-value < 0.05
## Adding shrunken fold changes to log2FoldChange
## 1. Chemo vs. Untreated: 43 up and 19 down regulated
## Note: 15 rows in results are missing from biomart table
datatable(res1)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
options(ggrepel.max.overlaps = Inf)
Plot fold changes and p-vals in volcano plot with FC cutoff of at least double for labeling.
plot_volcano(res1,labelsize=2.5, pvalue=c(1,5), foldchange_cutoff=1.2)+ggtitle("RPM Chemo v Naive DEGs")
Cluster up to the top 1000 significant genes and scale by rows so that values are z-scores and represent the number of std deviations from mean count value
x<-top_counts(res1, rld, top=1000)
datatable(x)
dim(x)
## [1] 62 15
Only 62 genes are significantly differentially expressed, 43 are up in chemo, 19 are down in chemo.
Plot map as pheatmap.
plot_genes(x,"Treatment",show_rownames=TRUE, scale="row", annotation_names_col=FALSE, fontsize_row=8, cellheight=8, cellwidth = 20)
END