The following code analyzes intraparenchymal vs circulating blood
monocyte gene expression at day 4.
# Differential expression analysis with DESeq2
library(DESeq2)
library(data.table)
library(curl)
library(R.utils)
library(ggrepel)
# load counts table from GEO
urld <- "https://www.ncbi.nlm.nih.gov/geo/download/?format=file&type=rnaseq_counts"
path <- paste(urld, "acc=GSE163256", "file=GSE163256_raw_counts_GRCh38.p13_NCBI.tsv.gz", sep="&");
tbl <- as.matrix(data.table::fread(path, header=T, colClasses="integer"), rownames="GeneID")
# load gene annotations
apath <- paste(urld, "type=rnaseq_counts", "file=Human.GRCh38.p13.annot.tsv.gz", sep="&")
annot <- data.table::fread(apath, header=T, quote="", stringsAsFactors=F, data.table=F)
rownames(annot) <- annot$GeneID
# sample selection
gsms <- paste0("XXXXXXXXX100XXXXX00XXXXXXXXXX1XXXXXXXXX00XXXXX100X",
"XXXXXX1100XXXXXX1XXXXX00XXXXXX1XXXXXXXX100XXXXXXXX",
"XX1XXX1XXXXXXXXX1XXXXXXXX1XXXXXXXXX00XXXXXXX1XXXXX",
"XXXXX100XXXX100XXXXXXXXXX100XXXXXXXXXXXXX1XXXXXXXX",
"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
"XXXXXXXXXXXXXXXXX")
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples (marked as "X")
sel <- which(sml != "X")
sml <- sml[sel]
tbl <- tbl[ ,sel]
# group membership for samples
gs <- factor(sml)
groups <- make.names(c("hematoma_mono_4","blood_mono_4"))
levels(gs) <- groups
sample_info <- data.frame(Group = gs, row.names = colnames(tbl))
# pre-filter low count genes
# keep genes with at least N counts > 10, where N = size of smallest group
keep <- rowSums( tbl >= 10 ) >= min(table(gs))
tbl <- tbl[keep, ]
ds <- DESeqDataSetFromMatrix(countData=tbl, colData=sample_info, design= ~Group)
ds <- DESeq(ds, test="Wald", sfType="poscount")
# extract results for top genes table
r <- results (ds, contrast=c("Group", groups[1], groups[2]), alpha=0.05, pAdjustMethod ="fdr")
r2 <- merge(as.data.frame(r),annot,by=0,sort=F)
tT <- r[order(r$padj)[1:250],]
tT <- merge(as.data.frame(tT), annot, by=0, sort=F)
tT <- subset(tT, select=c("GeneID","padj","pvalue","lfcSE","stat","log2FoldChange","baseMean","Symbol","Description"))
plotDispEsts(ds, main="GSE163256 Dispersion Estimates")

# create histogram plot of p-values
hist(r$padj, breaks=seq(0, 1, length = 21), col = "grey", border = "white",
xlab = "", ylab = "", main = "GSE163256 Frequencies of padj-values")

# volcano plot
r2$diffexp <- "No"
r2$diffexp[r2$log2FoldChange>5 & r2$padj<1e-25] <- "UP"
r2$diffexp[r2$log2FoldChange< -5 & r2$padj<1e-25] <- "DOWN"
volcano <- ggplot(data=r2, aes(x=log2FoldChange, y= -log10(padj),col=diffexp))+
geom_point()+
theme_minimal()+
ylim(0,250)+
xlim(-5,15)+
geom_hline(yintercept= -log10(1e-25), col="red",lty=2,lwd=1.5)+
geom_vline(xintercept=c(-5,5),col="red",lty=2,lwd=1.5)+
labs(title="Neutrophils day 1: Hematoma vs Blood",x="log2(Fold Change)",y="-log10(P-adj)")+
scale_color_manual(values=c("black", "red"))+
theme(legend.position = "none", text=element_text(size=18,face="bold"),axis.text.x=element_text(size=16,face="bold"),axis.text.y=element_text(size=16,face="bold"))+
geom_text_repel(data=subset(r2,Symbol %in% c("APOE","APOC1","TREM2")), aes(label=(Symbol)),box.padding = 0.5, nudge_y = 0.1, cex=6, col="red")
volcano

# MD plot
par(mar=c(4,4,2,1), cex.main=1.5)
plot(log10(r$baseMean), r$log2FoldChange, main=paste(groups[1], "vs", groups[2]),
xlab="log10(mean of normalized counts)", ylab="log2FoldChange", pch=20, cex=0.5)
with(subset(r, padj<0.05 & abs(log2FoldChange) >= 1),
points(log10(baseMean), log2FoldChange, pch=20, col=(sign(log2FoldChange) + 3)/2, cex=1))
legend("bottomleft", title=paste("Padj<", 0.05, sep=""), legend=c("down", "up"), pch=20,col=1:2)
abline(h=0)

################################################################
# General expression data visualization
dat <- log10(counts(ds, normalized = T) + 1) # extract normalized counts
# box-and-whisker plot
lbl <- "log10(raw counts + 1)"
ord <- order(gs) # order samples by group
palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
par(mar=c(7,4,2,1))
boxplot(dat[,ord], boxwex=0.6, notch=T, main="GSE163256", ylab="lg(norm.counts)", outline=F, las=2, col=gs[ord])
legend("topleft", groups, fill=palette(), bty="n")

### Heatmaps
library(pheatmap)
ntd <- normTransform(ds)
select <- order(rowMeans(counts(ds,normalized=TRUE)),decreasing=TRUE)[1:20]
df <- as.data.frame(colData(ds))[,c("Group","sizeFactor")]
df2 <- data.frame(Group=df[,1])
rownames(df2) <- rownames(df)
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=TRUE,cluster_cols=TRUE, annotation_col=df2)

select2 <- c("348","341","54209","6622","4035","3949","3240")
pheatmap(assay(ntd)[select2,], cluster_rows=FALSE, show_rownames=TRUE,cluster_cols=TRUE, annotation_col=df2)
