Introduction

Atopic dermatitis (eczema) is a chronic, inflammatory skin disease characterized by dysregulation of the type 2 inflammatory response. Using a publicly available RNAseq dataset from the Gene Expression Omnibus (GSE224783), we investigate gene expression differences in atopic dermatitis skin lesions vs. non-lesional skin. We first load the required packages and the dataset into R.

library(DESeq2)
library(tidyverse)
library(gplots)
countData <- read_csv('https://raw.githubusercontent.com/mss-genomics/first-meeting/ce48486c5e0122edfba55220a477b80c48cbd397/GSE224783_readcount_trimmed.csv')
countData
## # A tibble: 31,207 × 23
##    GeneID   nonlesional_1 nonlesional_2 lesion_1 nonlesional_3 lesion_2 lesion_3
##    <chr>            <dbl>         <dbl>    <dbl>         <dbl>    <dbl>    <dbl>
##  1 5S_rRNA              0             3        1             2        3        3
##  2 7SK                  6             3        3             8        2        5
##  3 A1BG                 0             0        1             1        2        0
##  4 A1BG-AS1             6             1        4             0        7        7
##  5 A1CF                 2             2        3             0        1        0
##  6 A2M               1112          1312     1463          1391     1714     1998
##  7 A2M-AS1             24             6        8             3        9        9
##  8 A2ML1            15507          4596     4388          7679     7217    23858
##  9 A2ML1-A…             4             0        1             0        0        1
## 10 A2ML1-A…             0             0        0             0        0        2
## # ℹ 31,197 more rows
## # ℹ 16 more variables: nonlesional_4 <dbl>, lesion_4 <dbl>,
## #   nonlesional_5 <dbl>, lesion_5 <dbl>, nonlesional_6 <dbl>, lesion_6 <dbl>,
## #   lesion_7 <dbl>, nonlesional_7 <dbl>, lesion_8 <dbl>, lesion_9 <dbl>,
## #   lesion_10 <dbl>, nonlesional_8 <dbl>, lesion_11 <dbl>, nonlesional_9 <dbl>,
## #   nonlesional_10 <dbl>, nonlesional_11 <dbl>

We visualize the log-transformed RNAseq count data for non-lesional sample 1.

logCountData <- log2(1+column_to_rownames(countData, "GeneID"))
hist(logCountData$nonlesional_1, br=100)

We assign each sample to its group, i.e, lesional or non-lesional, and build a DESeq2 object to normalize the raw count data.

group <- gsub("[^a-zA-Z]", "", colnames(logCountData))
colData <- as.data.frame(cbind(colnames(logCountData), group))
dds <- DESeqDataSetFromMatrix(countData = column_to_rownames(countData, "GeneID"), colData = colData, design = ~group)
dds <- DESeq(dds)
dds <- dds[rowSums(counts(dds)) > 5,] # remove genes with near zero expression accross samples
rld <- rlog(dds)
rld
## class: DESeqTransform 
## dim: 31176 22 
## metadata(1): version
## assays(1): ''
## rownames(31176): 5S_rRNA 7SK ... snoU2_19 snoZ5
## rowData names(24): baseMean baseVar ... dispFit rlogIntercept
## colnames(22): nonlesional_1 nonlesional_2 ... nonlesional_10
##   nonlesional_11
## colData names(4): V1 group sizeFactor replaceable

With principle component analysis, we can reduce the dimension of the dataset in order to visualize whether the gene expression profile from samples of the same group cluster together.

plotPCA(rld, intgroup = "group")

Biclustering is a technique that allows us to simultaneously group genes with similar expression accross samples together and group samples with similar expression profiles together.

# select top 50 genes with largest standard deviation among samples
n=25
x=assay(rld)
x = x[order(apply(x,1,sd),decreasing=TRUE),]
x = x[1:n,]

groups.colors = rainbow(length(unique(group)))

lmat = rbind(c(5,4),c(0,1),c(3,2))
lwid = c(1.5,4)
lhei = c(1,.2,4)

# distance function = 1-PCC (Pearson's correlation coefficient)
dist2 <- function(x, ...) as.dist(1-cor(t(x), method="pearson"))

# average linkage in hierarchical clustering
hclust2 <- function(x, method="average", ...) hclust(x, method=method, ...)

heatmap.2(x, distfun = dist2,hclustfun=hclust2,
          col=greenred(75), density.info="none", trace="none", scale="none", keysize=.5,
          key=T, symkey=F,
          ColSideColors=groups.colors[as.factor(group)],
          cexRow=1,
          srtCol=45,
          cexCol=1
)

We calculate the fold change difference of each gene accross the two groups with its associated p-value. Statistically-significant fold change differences are colored in blue. We note that highly expressed genes tend to have smaller fold change differences.

res <- results(dds)
DESeq2::plotMA(res, ylim=c(-5,5))

We can also visualize the relation between fold change differences and their associated p-values with a volcano plot.

res1 = as.data.frame(res)
res1 = mutate(res1, sig=ifelse(res$padj<0.1, "FDR<0.1", "Not significant"))
res1[which(abs(res1$log2FoldChange)<1.0), "sig"] = "Not significant"
ggplot(res1, aes(log2FoldChange, -log10(padj))) + geom_point(aes(col=sig)) + scale_color_manual(values=c("red", "black"))

Finally, we identify the genes that show the largest fold change difference across the two groups.

res = res[order(abs(res$log2FoldChange), decreasing = TRUE),]
res[1:10,]
## log2 fold change (MLE): group nonlesional vs lesion 
## Wald test p-value: group nonlesional vs lesion 
## DataFrame with 10 rows and 6 columns
##            baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##           <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## DEFB4B     28.14587       -6.57316  1.096573  -5.99427 2.04399e-09 3.63104e-07
## MMP10       6.14548       -5.82018  1.356518  -4.29053 1.78247e-05 4.39822e-04
## G3793       6.00595        5.48603  0.874341   6.27448 3.50804e-10 9.74477e-08
## CYP1A2      6.88075        5.38101  1.230807   4.37194 1.23148e-05 3.29844e-04
## HSPD1P3     5.53260       -5.14063  0.857908  -5.99205 2.07213e-09 3.64926e-07
## SLC18A3     3.17115       -4.88175  1.596952  -3.05691 2.23629e-03 1.71672e-02
## CLEC3A      4.61486       -4.86383  1.435744  -3.38767 7.04882e-04 7.29380e-03
## LCE3B       3.51115       -4.74495  1.696070  -2.79762 5.14814e-03 3.17284e-02
## KRT6C   16195.62818       -4.61539  0.802962  -5.74796 9.03290e-09 1.18901e-06
## G19270      2.22669        4.60126  1.081988   4.25260 2.11301e-05 5.00308e-04