Deseq2

R Markdown

This R Markdown is for the analysis of RNAseq expression data for inflamation experiment. The input gene expression table has already been adjusted to only include integers and remove 0 (across all samples) expression values. The data contains 12 samples: 3-9week controls, 3-9 week induced, 3-1yr controls and 3 1-yrInduced.

##Load gene expression data and upper quarter normalized, samples were already rounded and zeros removed
CountData <- read.csv("AllExpFinal.csv", header = TRUE)
rownames(CountData) <- CountData[,1]
CountData <- CountData[,-1]
CountData <- data.matrix(CountData)
upQuart <- apply(CountData, 2, function(x){quantile(x, 0.75)})
m <- mean(upQuart)
upQuart <- m / upQuart
Count_upquart <- t(t(CountData) * upQuart)
Count_upquart <- round(Count_upquart)

##load in annotation file, this file has three columns: sample name, Time, Condition
Categories <- read.csv("2sample_annotation.csv", header = TRUE)
rownames(Categories) <- Categories[,1]
Categories <- Categories[,-1]

##Display part of expression table to ensure it is proessed correctly
##head(Count_upquart,2)

##check names match between expression data and annotaiton file adn are in correct order. Should both yield TRUE
all(rownames(Categories) %in% colnames(Count_upquart))

## [1] TRUE

all(rownames(Categories) == colnames(Count_upquart))

## [1] TRUE

##Run deseq2 code based on control vs induced categories
dds <- DESeqDataSetFromMatrix(countData = Count_upquart,
                              colData = Categories,
                              design = ~ Condition)

## converting counts to integer mode

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

### Transform counts for data visualization
rld <- rlog(dds, blind=TRUE)
### Plot PCA 
plotPCA(rld, intgroup="Condition")

##Results indicate that data points cluster closely based on condition not time

rld_mat <- assay(rld) 
### Compute pairwise correlation values
rld_cor <- cor(rld_mat)    ## cor() is a base R function

head(rld_cor)   ## check the output of cor(), make note of the rownames and colnames

##                    B162_12VP_9Control B162_11VP_9Control MG12_9Control
## B162_12VP_9Control          1.0000000          0.9965010     0.9926079
## B162_11VP_9Control          0.9965010          1.0000000     0.9902979
## MG12_9Control               0.9926079          0.9902979     1.0000000
## MG10_9Induced               0.9822320          0.9800075     0.9842966
## SB8_9Induced                0.9895245          0.9898655     0.9822946
## SB9_9Induced                0.9871168          0.9845494     0.9814795
##                    MG10_9Induced SB8_9Induced SB9_9Induced B59VP_1yrControl
## B162_12VP_9Control     0.9822320    0.9895245    0.9871168        0.9927273
## B162_11VP_9Control     0.9800075    0.9898655    0.9845494        0.9953385
## MG12_9Control          0.9842966    0.9822946    0.9814795        0.9874157
## MG10_9Induced          1.0000000    0.9834926    0.9857645        0.9759933
## SB8_9Induced           0.9834926    1.0000000    0.9915581        0.9876028
## SB9_9Induced           0.9857645    0.9915581    1.0000000        0.9819820
##                    B108VP_1yrControl B10VP_1yrControl B91VP_1yrInduced
## B162_12VP_9Control         0.9955993        0.9918534        0.9623152
## B162_11VP_9Control         0.9962800        0.9887638        0.9605717
## MG12_9Control              0.9900327        0.9855407        0.9558307
## MG10_9Induced              0.9796515        0.9726004        0.9735501
## SB8_9Induced               0.9896144        0.9849699        0.9784172
## SB9_9Induced               0.9848818        0.9853817        0.9828635
##                    P95VP_1yrInduced B49VP_1yrInduced
## B162_12VP_9Control        0.9612387        0.9760550
## B162_11VP_9Control        0.9578696        0.9729759
## MG12_9Control             0.9559911        0.9722182
## MG10_9Induced             0.9721756        0.9827613
## SB8_9Induced              0.9724919        0.9819904
## SB9_9Induced              0.9819000        0.9878769

### Plot heatmap
pheatmap(rld_cor)

##likelihood ratio test
dds_lrt <- DESeq(dds, test="LRT", reduced = ~ 1)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

res_LRT <- results(dds_lrt)
# Subset the LRT results to return genes with pvalue < 0.05
sig_res_LRT <- res_LRT %>%
               data.frame() %>%
               rownames_to_column(var="gene") %>% 
               as_tibble() %>% 
               filter(padj < 0.05)
# List of significant genes with p < 0.05
sigLRT_genes <- sig_res_LRT %>% 
                pull(gene)
                
length(sigLRT_genes)

## [1] 3364

# Subset results for faster cluster finding 
clustering_sig_genes <- sig_res_LRT %>%
                  arrange(padj)



# Obtain rlog values for those significant genes
cluster_rlog <- rld_mat[clustering_sig_genes$gene, ]
# Use the `degPatterns` function from the 'DEGreport' package to show gene clusters across sample groups
clusters <- degPatterns(cluster_rlog, metadata = Categories, time = "Condition", col=NULL)

## A large number of genes was given-- please, make sure this is not an error. Normally, only DE genes will be useful for this function.

## Working with 3364 genes.

## Warning: `distinct_()` is deprecated as of dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

## Working with 3364 genes after filtering: minc > 15

## Joining, by = "merge"
## Joining, by = "merge"

## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

# What type of data structure is the `clusters` output?
class(clusters)

## [1] "list"

clusters <- degPatterns(cluster_rlog, metadata = Categories, time="Time", col="Condition")

## A large number of genes was given-- please, make sure this is not an error. Normally, only DE genes will be useful for this function.

## Working with 3364 genes.

## Working with 3353 genes after filtering: minc > 15

## Joining, by = "merge"
## Joining, by = "merge"

class(clusters)

## [1] "list"

# Let's see what is stored in the `df` component
head(clusters$df)

##           genes cluster
## Igj         Igj       1
## Ccl28     Ccl28       1
## Tnfsf13 Tnfsf13       2
## Bcl3       Bcl3       1
## Irf4       Irf4       1
## Il2rg     Il2rg       1

deseq2Data <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

res_LRT

## log2 fold change (MLE): Condition Induced vs Control 
## LRT p-value: '~ Condition' vs '~ 1' 
## DataFrame with 16711 rows and 6 columns
##                baseMean log2FoldChange     lfcSE       stat      pvalue
##               <numeric>      <numeric> <numeric>  <numeric>   <numeric>
## Gm6639          1.41118      -0.571005   2.97148 0.03680892    0.847855
## 3110099E03Rik   1.25630      -3.213686   2.17297 2.25269254    0.133382
## Mc3r            1.28054      -2.664047   1.83252 2.05258043    0.151948
## Oog3            1.17969      -2.525635   2.25009 1.21903682    0.269550
## 9530059O14Rik   1.57546      -0.207885   2.30366 0.00818299    0.927922
## ...                 ...            ...       ...        ...         ...
## Spink3           348413      -5.167108  0.975713  19.678273 9.16374e-06
## Malat1           434492      -0.302690  0.337744   0.801728 3.70577e-01
## Sbpl             519919      -3.497944  0.767744  17.061520 3.61882e-05
## Sbp             1109007      -3.140676  0.771952  14.047488 1.78252e-04
## Rn45s           1502328      -0.436242  0.332190   1.718047 1.89945e-01
##                      padj
##                 <numeric>
## Gm6639           0.930091
## 3110099E03Rik    0.325920
## Mc3r             0.355002
## Oog3             0.505357
## 9530059O14Rik    0.966859
## ...                   ...
## Spink3        0.000161501
## Malat1        0.608174759
## Sbpl          0.000511089
## Sbp           0.001961359
## Rn45s         0.407854574

deseq2Data

## class: DESeqDataSet 
## dim: 16711 12 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(16711): Gm6639 3110099E03Rik ... Sbp Rn45s
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(12): B162_12VP_9Control B162_11VP_9Control ...
##   P95VP_1yrInduced B49VP_1yrInduced
## colData names(3): Time Condition sizeFactor

Deseq2

Alin

7/6/2020

R Markdown