This R Markdown is for the analysis of RNAseq expression data for inflamation experiment. The input gene expression table has already been adjusted to only include integers and remove 0 (across all samples) expression values. The data contains 12 samples: 3-9week controls, 3-9 week induced, 3-1yr controls and 3 1-yrInduced.
##Load gene expression data and upper quarter normalized, samples were already rounded and zeros removed
CountData <- read.csv("AllExpFinal.csv", header = TRUE)
rownames(CountData) <- CountData[,1]
CountData <- CountData[,-1]
CountData <- data.matrix(CountData)
upQuart <- apply(CountData, 2, function(x){quantile(x, 0.75)})
m <- mean(upQuart)
upQuart <- m / upQuart
Count_upquart <- t(t(CountData) * upQuart)
Count_upquart <- round(Count_upquart)
##load in annotation file, this file has three columns: sample name, Time, Condition
Categories <- read.csv("2sample_annotation.csv", header = TRUE)
rownames(Categories) <- Categories[,1]
Categories <- Categories[,-1]
##Display part of expression table to ensure it is proessed correctly
##head(Count_upquart,2)
##check names match between expression data and annotaiton file adn are in correct order. Should both yield TRUE
all(rownames(Categories) %in% colnames(Count_upquart))
## [1] TRUE
all(rownames(Categories) == colnames(Count_upquart))
## [1] TRUE
##Run deseq2 code based on control vs induced categories
dds <- DESeqDataSetFromMatrix(countData = Count_upquart,
colData = Categories,
design = ~ Condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
### Transform counts for data visualization
rld <- rlog(dds, blind=TRUE)
### Plot PCA
plotPCA(rld, intgroup="Condition")
##Results indicate that data points cluster closely based on condition not time
rld_mat <- assay(rld)
### Compute pairwise correlation values
rld_cor <- cor(rld_mat) ## cor() is a base R function
head(rld_cor) ## check the output of cor(), make note of the rownames and colnames
## B162_12VP_9Control B162_11VP_9Control MG12_9Control
## B162_12VP_9Control 1.0000000 0.9965010 0.9926079
## B162_11VP_9Control 0.9965010 1.0000000 0.9902979
## MG12_9Control 0.9926079 0.9902979 1.0000000
## MG10_9Induced 0.9822320 0.9800075 0.9842966
## SB8_9Induced 0.9895245 0.9898655 0.9822946
## SB9_9Induced 0.9871168 0.9845494 0.9814795
## MG10_9Induced SB8_9Induced SB9_9Induced B59VP_1yrControl
## B162_12VP_9Control 0.9822320 0.9895245 0.9871168 0.9927273
## B162_11VP_9Control 0.9800075 0.9898655 0.9845494 0.9953385
## MG12_9Control 0.9842966 0.9822946 0.9814795 0.9874157
## MG10_9Induced 1.0000000 0.9834926 0.9857645 0.9759933
## SB8_9Induced 0.9834926 1.0000000 0.9915581 0.9876028
## SB9_9Induced 0.9857645 0.9915581 1.0000000 0.9819820
## B108VP_1yrControl B10VP_1yrControl B91VP_1yrInduced
## B162_12VP_9Control 0.9955993 0.9918534 0.9623152
## B162_11VP_9Control 0.9962800 0.9887638 0.9605717
## MG12_9Control 0.9900327 0.9855407 0.9558307
## MG10_9Induced 0.9796515 0.9726004 0.9735501
## SB8_9Induced 0.9896144 0.9849699 0.9784172
## SB9_9Induced 0.9848818 0.9853817 0.9828635
## P95VP_1yrInduced B49VP_1yrInduced
## B162_12VP_9Control 0.9612387 0.9760550
## B162_11VP_9Control 0.9578696 0.9729759
## MG12_9Control 0.9559911 0.9722182
## MG10_9Induced 0.9721756 0.9827613
## SB8_9Induced 0.9724919 0.9819904
## SB9_9Induced 0.9819000 0.9878769
### Plot heatmap
pheatmap(rld_cor)
##likelihood ratio test
dds_lrt <- DESeq(dds, test="LRT", reduced = ~ 1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_LRT <- results(dds_lrt)
# Subset the LRT results to return genes with pvalue < 0.05
sig_res_LRT <- res_LRT %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble() %>%
filter(padj < 0.05)
# List of significant genes with p < 0.05
sigLRT_genes <- sig_res_LRT %>%
pull(gene)
length(sigLRT_genes)
## [1] 3364
# Subset results for faster cluster finding
clustering_sig_genes <- sig_res_LRT %>%
arrange(padj)
# Obtain rlog values for those significant genes
cluster_rlog <- rld_mat[clustering_sig_genes$gene, ]
# Use the `degPatterns` function from the 'DEGreport' package to show gene clusters across sample groups
clusters <- degPatterns(cluster_rlog, metadata = Categories, time = "Condition", col=NULL)
## A large number of genes was given-- please, make sure this is not an error. Normally, only DE genes will be useful for this function.
## Working with 3364 genes.
## Warning: `distinct_()` is deprecated as of dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Working with 3364 genes after filtering: minc > 15
## Joining, by = "merge"
## Joining, by = "merge"
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
# What type of data structure is the `clusters` output?
class(clusters)
## [1] "list"
clusters <- degPatterns(cluster_rlog, metadata = Categories, time="Time", col="Condition")
## A large number of genes was given-- please, make sure this is not an error. Normally, only DE genes will be useful for this function.
## Working with 3364 genes.
## Working with 3353 genes after filtering: minc > 15
## Joining, by = "merge"
## Joining, by = "merge"
class(clusters)
## [1] "list"
# Let's see what is stored in the `df` component
head(clusters$df)
## genes cluster
## Igj Igj 1
## Ccl28 Ccl28 1
## Tnfsf13 Tnfsf13 2
## Bcl3 Bcl3 1
## Irf4 Irf4 1
## Il2rg Il2rg 1
deseq2Data <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res_LRT
## log2 fold change (MLE): Condition Induced vs Control
## LRT p-value: '~ Condition' vs '~ 1'
## DataFrame with 16711 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## Gm6639 1.41118 -0.571005 2.97148 0.03680892 0.847855
## 3110099E03Rik 1.25630 -3.213686 2.17297 2.25269254 0.133382
## Mc3r 1.28054 -2.664047 1.83252 2.05258043 0.151948
## Oog3 1.17969 -2.525635 2.25009 1.21903682 0.269550
## 9530059O14Rik 1.57546 -0.207885 2.30366 0.00818299 0.927922
## ... ... ... ... ... ...
## Spink3 348413 -5.167108 0.975713 19.678273 9.16374e-06
## Malat1 434492 -0.302690 0.337744 0.801728 3.70577e-01
## Sbpl 519919 -3.497944 0.767744 17.061520 3.61882e-05
## Sbp 1109007 -3.140676 0.771952 14.047488 1.78252e-04
## Rn45s 1502328 -0.436242 0.332190 1.718047 1.89945e-01
## padj
## <numeric>
## Gm6639 0.930091
## 3110099E03Rik 0.325920
## Mc3r 0.355002
## Oog3 0.505357
## 9530059O14Rik 0.966859
## ... ...
## Spink3 0.000161501
## Malat1 0.608174759
## Sbpl 0.000511089
## Sbp 0.001961359
## Rn45s 0.407854574
deseq2Data
## class: DESeqDataSet
## dim: 16711 12
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(16711): Gm6639 3110099E03Rik ... Sbp Rn45s
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(12): B162_12VP_9Control B162_11VP_9Control ...
## P95VP_1yrInduced B49VP_1yrInduced
## colData names(3): Time Condition sizeFactor