class: center, middle, title-slide # Fibrosis, RNA-Seq Differential Expression Analysis ### Michael Espero ### 4/9/2019 --- # The data The data used in this workflow contain experimental human gene expression data. Observations: 47,729 Variables: 8 We will work with this data to find differentially expressed genes. Specifically, we're going to get the differentially expressed genes in descending order of how suprising they are, given we're making **many** comparisons. --- # Variables **Target variable** Gene **Count variables** A matrix of counts representing the number of RNA in each sample. --- # Data Import ```r library(tidyverse) library(DESeq2) library(RCurl) library(pheatmap) library(DiffBind) library(skimr) url <- getURL("https://assets.datacamp.com/production/repositories/1766/datasets/bf1d0eff910f1b2cad36e5acdc2a182e95c63965/fibrosis_smoc2_rawcounts_unordered.csv") df <- read_csv(url) ``` ``` ## Warning: Missing column names filled in: 'X1' [1] ``` ```r skim(df) ``` ``` ## Skim summary statistics ## n obs: 47729 ## n variables: 8 ## ## ── Variable type:character ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## variable missing complete n min max empty n_unique ## X1 0 47729 47729 18 18 0 47729 ## ## ── Variable type:numeric ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── ## variable missing complete n mean sd p0 p25 p50 p75 ## smoc2_fibrosis1 0 47729 47729 579.75 3635.32 0 0 1 184 ## smoc2_fibrosis2 0 47729 47729 466.28 3377.21 0 0 1 143 ## smoc2_fibrosis3 0 47729 47729 512.29 3448.86 0 0 1 165 ## smoc2_fibrosis4 0 47729 47729 452.22 3202.64 0 0 1 144 ## smoc2_normal1 0 47729 47729 408.22 4699.03 0 0 0 63 ## smoc2_normal3 0 47729 47729 462.21 6049.11 0 0 0 75 ## smoc2_normal4 0 47729 47729 502.73 5651.47 0 0 0 72 ## p100 hist ## 420026 ▇▁▁▁▁▁▁▁ ## 467692 ▇▁▁▁▁▁▁▁ ## 347826 ▇▁▁▁▁▁▁▁ ## 392520 ▇▁▁▁▁▁▁▁ ## 5e+05 ▇▁▁▁▁▁▁▁ ## 643989 ▇▁▁▁▁▁▁▁ ## 655931 ▇▁▁▁▁▁▁▁ ``` ```r glimpse(df) ``` ``` ## Observations: 47,729 ## Variables: 8 ## $ X1 <chr> "ENSMUSG00000102693", "ENSMUSG00000064842", "ENS… ## $ smoc2_fibrosis1 <dbl> 0, 0, 72, 0, 0, 0, 0, 0, 0, 1, 0, 9, 0, 0, 0, 0,… ## $ smoc2_fibrosis4 <dbl> 0, 0, 30, 0, 0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0,… ## $ smoc2_normal1 <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, … ## $ smoc2_normal3 <dbl> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, … ## $ smoc2_fibrosis3 <dbl> 0, 0, 36, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 0, 0, 0,… ## $ smoc2_normal4 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, … ## $ smoc2_fibrosis2 <dbl> 0, 0, 51, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0,… ``` --- # Labeling the Data ```r # Create genotype vector genotype <- c( "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe" ) # Create condition vector condition <- c( "fibrosis", "fibrosis", "fibrosis", "fibrosis", "normal", "normal", "normal" ) # Create data frame smoc2_metadata <- data.frame(genotype, condition) # Assign the row names of the data frame rownames(smoc2_metadata) <- c( "smoc2_fibrosis1", "smoc2_fibrosis2", "smoc2_fibrosis3", "smoc2_fibrosis4", "smoc2_normal1", "smoc2_normal3", "smoc2_normal4" ) ``` --- # Data Processing ```r # Use the match() function to reorder the columns of the raw counts match(rownames(smoc2_metadata), colnames(df)) ``` ``` ## [1] 2 8 6 3 4 5 7 ``` ```r # Reorder the columns of the count data reordered_smoc2_rawcounts <- df[, match(rownames(smoc2_metadata), colnames(df))] # Make the DESeq2 object dds_smoc2 <- DESeqDataSetFromMatrix( countData = reordered_smoc2_rawcounts, colData = smoc2_metadata, design = ~condition ) # Get the size factors to use for normalization dds_smoc2 <- estimateSizeFactors(dds_smoc2) # Get the normalized counts smoc2_normalized_counts <- counts(dds_smoc2, normalized = T) ``` --- # Heatmap ```r # !Problem here!.... Transform the *normalized* counts vsd_smoc2 <- vst(dds_smoc2, blind = T) # Create heatmap of correlations in the sample vsd_smoc2 %>% assay() %>% cor() %>% pheatmap(annotation = select(smoc2_metadata, c("genotype", "condition"))) ``` <img src="fibrosis_de_workflow_files/figure-html/unnamed-chunk-5-1.png" width="700px" style="display: block; margin: auto;" /> --- # Dimensional Reduction ```r # Transform the normalized counts vsd_smoc2 <- vst(dds_smoc2, blind = TRUE) # Plot the PCA with 2 variance components plotPCA(vsd_smoc2, intgroup = "condition") ``` <img src="fibrosis_de_workflow_files/figure-html/unnamed-chunk-6-1.png" width="700px" style="display: block; margin: auto;" /> --- # Run DESeq We're applying a negative binomial model to account for overdispersion, meaning the variance within each level is higher that its mean. ```r # Ready a DESeq2 object dds_smoc2 <- DESeqDataSetFromMatrix( countData = reordered_smoc2_rawcounts, colData = smoc2_metadata, design = ~condition ) ``` ``` ## converting counts to integer mode ``` ```r # Run DESeq2 analysis dds_smo2_DESeq2 <- DESeq(dds_smoc2) ``` ``` ## estimating size factors ``` ``` ## estimating dispersions ``` ``` ## gene-wise dispersion estimates ``` ``` ## mean-dispersion relationship ``` ``` ## final dispersion estimates ``` ``` ## fitting model and testing ``` --- # Plot Dispersion Estimates ```r # Plot dispersions plotDispEsts(dds_smo2_DESeq2) ``` <img src="fibrosis_de_workflow_files/figure-html/unnamed-chunk-8-1.png" width="700px" style="display: block; margin: auto;" /> --- # Prepare Results ```r # Get the results of the DE analysis smoc2_res <- results(dds_smo2_DESeq2, contrast = c("condition", "fibrosis", "normal"), alpha = .05 ) # Shrink the log2 fold change estimates to be more accurate smoc2_res <- lfcShrink(dds_smo2_DESeq2, contrast = c("condition", "fibrosis", "normal"), res = smoc2_res ) ``` ``` ## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014). ## additional priors are available via the 'type' argument, see ?lfcShrink for details ``` ```r # Get results smoc2_res <- results(dds_smo2_DESeq2, contrast = c("condition", "fibrosis", "normal"), alpha = .05, lfcThreshold = .32 ) ``` --- # Results of DE ```r # Get an overview of the results summary(smoc2_res) ``` ``` ## ## out of 29556 with nonzero total read count ## adjusted p-value < 0.05 ## LFC > 0.32 (up) : 3716, 13% ## LFC < -0.32 (down) : 3322, 11% ## outliers [1] : 15, 0.051% ## low counts [2] : 7207, 24% ## (mean count < 1) ## [1] see 'cooksCutoff' argument of ?results ## [2] see 'independentFiltering' argument of ?results ``` --- # Subset DE data ```r # Generate a logical column smoc2_res_all <- data.frame(smoc2_res) %>% mutate(threshold = padj < 0.05) # Save results as a data frame smoc2_res_all <- data.frame(smoc2_res) # Subset the results to only return the significant genes with p-adjusted values less than 0.05 smoc2_res_sig <- subset(smoc2_res_all, padj < .05) ``` --- # MA Plot ```r # Create MA plot plotMA(smoc2_res) ``` <img src="fibrosis_de_workflow_files/figure-html/unnamed-chunk-12-1.png" width="700px" style="display: block; margin: auto;" /> --- # Volcano Plot ```r # Create the volcano plot threshold <- smoc2_res_sig$padj < 0.05 vplot <- ggplot(smoc2_res_sig) + geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) + xlab("log2 fold change") + ylab("-log10 adjusted p-value") + theme( legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5), axis.title = element_text(size = rel(1.25)) ) ``` --- # Show Volcano Plot ```r vplot ``` <img src="fibrosis_de_workflow_files/figure-html/unnamed-chunk-14-1.png" width="700px" style="display: block; margin: auto;" /> --- # DE Results With this frequentist approach it's necessary for us to adjust p-values. Here, we've already applied a BH procedure and arrange in descending order by how suprising each gene's expression was in the data at hand. ```r # Select significant genes with padj < 0.05 smoc2_sig <- subset(smoc2_res, padj < .05) %>% data.frame() %>% rownames_to_column(var = "geneID") # Extract the top 6 genes by padj values smoc2_sig %>% arrange(padj) %>% select(geneID, padj) %>% head(n = 6) ``` ``` ## geneID padj ## 1 2609 0.000000e+00 ## 2 1457 5.074722e-257 ## 3 1576 3.450235e-164 ## 4 1606 2.416206e-146 ## 5 4611 2.699924e-144 ## 6 1925 3.835968e-141 ``` Next, it will be helpful to tie these geneID numbers to more descriptive information about the genes of interest.