06 2021 WUSTL Bulk RNA Seq Analysis

DESeq2 workflow

Following Chris Stubben’s methods

Load necessary packages

suppressPackageStartupMessages({ 
  library(DESeq2)
  library(readr)
  library(hciR)
  library(hciRdata)
  library(DT)
  library(ggplot2)
})

Load sample metadata

samples<-read_tsv("~/WUSTL chemo v naive Metdata RNAseq.txt")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Histology = col_character(),
##   Sample = col_character(),
##   Treatment = col_character()
## )

datatable(samples)

Load raw counts data

counts <- read_tsv("~/RPM_CvN_bulkRNAWUSTL.txt")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   gene_id = col_character(),
##   PB513 = col_double(),
##   PB511 = col_double(),
##   PB512 = col_double(),
##   PB518 = col_double(),
##   PB510 = col_double(),
##   PB517 = col_double(),
##   PB520 = col_double(),
##   AI197 = col_double(),
##   AI192 = col_double(),
##   PB384 = col_double(),
##   PB383 = col_double(),
##   PB431 = col_double(),
##   PB442 = col_double(),
##   AI187 = col_double()
## )

datatable(counts)

## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Remove features with zero counts and features with 5 or fewer reads in every sample.

counts <- filter_counts(counts, n = 5)

## Removed 19022 features with 0 reads

## Removed 18316 features with <=5 maximum reads

Add biotype, mouse 94, and remove non-coding genes

counts <- semi_join(counts, filter(mouse94, biotype=="protein_coding"), by=c(gene_id="id"))
counts <- semi_join(counts, filter(mouse94,
                                   !substr(gene_name, 1,3) %in% c("His", "Rpl", "Rps", "Pcd")), by=c(gene_id="id"))

datatable(counts)

## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Run DESeq using ~Treatment in the design formula and get the regularized log transforms (rlog) for sample visualizations

dds <- deseq_from_tibble(counts, samples, minReplicates = 7, design = ~ Treatment)

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 71 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

dds <- estimateSizeFactors(dds)
invisible(counts(dds, normalized=TRUE))

rld <- r_log(dds)

Plot the first two principal components using the rlog values from the top 500 variable genes.

plot_pca(rld, "Treatment", tooltip=c("Histology", "Sample", "Treatment"), width=600, point_size=20)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

PB383 and PB384 are very distinct chemo treated samples from remaining.

Compare chemo vs naive RPM tumors using 5% FDR

res1<-results_all(dds,mouse100)

## Using adjusted p-value < 0.05

## Adding shrunken fold changes to log2FoldChange

## 1. Chemo vs. Untreated: 43 up and 19 down regulated

## Note: 15 rows in results are missing from biomart table

datatable(res1)

## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

options(ggrepel.max.overlaps = Inf)

Plot fold changes and p-vals in volcano plot with FC cutoff of at least double for labeling.

plot_volcano(res1,labelsize=2.5, pvalue=c(1,5), foldchange_cutoff=1.2)+ggtitle("RPM Chemo v Naive DEGs")

Cluster up to the top 1000 significant genes and scale by rows so that values are z-scores and represent the number of std deviations from mean count value

x<-top_counts(res1, rld, top=1000)
datatable(x)

dim(x)

## [1] 62 15

Only 62 genes are significantly differentially expressed, 43 are up in chemo, 19 are down in chemo.

Plot map as pheatmap.

plot_genes(x,"Treatment",show_rownames=TRUE, scale="row", annotation_names_col=FALSE, fontsize_row=8, cellheight=8, cellwidth = 20)

END

06 2021 WUSTL Bulk RNA Seq Analysis

Abbie Ireland

6/17/2021

DESeq2 workflow