RNA seq analysis for An Truong of Brenton group, CRUK CI.
Packages.
library(dplyr)
library(tidyverse)
library(ggplot2)
library(stringr)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(limma)
library(edgeR)
library(DESeq2)
Load data.
rawCounts <- read.csv("~/Documents/PhD/Rstudio projects/brentonCopyNumberSignaturesOTTA_20230606/RNAseqForAn/SLX-22950.fc_counts.csv")
meta <- read.csv("~/Documents/PhD/Rstudio projects/brentonCopyNumberSignaturesOTTA_20230606/RNAseqForAn/SLX22950_metadata.csv")
Wrangle data.
# reformat sequencing IDs
colnames(rawCounts) <- str_sub(colnames(rawCounts), start = 1, end = 31)
# make ID column rownames
rawCounts <- rawCounts %>%
column_to_rownames(var = "X")
# add HR status column to metadata
meta <- meta %>%
dplyr::mutate(hr_status = case_when(cell_line == "PEO1s" ~ "HRD",
TRUE ~ "HRP"))
Annotate genes.
entrez <- rownames(rawCounts)
colGene <- c("ENTREZID", "SYMBOL", "GENETYPE", "ENSEMBL")
geneIdMapExAn <- AnnotationDbi::select(org.Hs.eg.db, keys = entrez, columns = colGene, keytype = "ENTREZID")
## 'select()' returned 1:many mapping between keys and columns
geneIdMapExAn <- geneIdMapExAn[!duplicated(geneIdMapExAn$ENTREZID), ]
geneIdMapExAn <- geneIdMapExAn[geneIdMapExAn$ENTREZID %in% entrez, ]
genesAn <- geneIdMapExAn$ENTREZID
rawCountsFinal <- rawCounts[genesAn, ]
Create a DGE object for DE analysis.
dgeUnTreAn <- DGEList(counts = rawCountsFinal, genes = geneIdMapExAn, samples = meta, group = meta$treatment)
Filter lowly expressed genes. Using manual filtering and the filterByExp() function
# filter lowly expressed genes
cutoff <- 4
drop <- which(apply(cpm(dgeUnTreAn), 1, max) < cutoff)
dgeUnTreAnFil <- dgeUnTreAn[-drop, , keep.lib.sizes = FALSE]
dim(dgeUnTreAnFil) # number of genes left after manual filtering
## [1] 13330 36
keep <- filterByExpr(dgeUnTreAnFil, group = dgeUnTreAnFil$samples$treatment)
dgeUnTreAnFilExp <- dgeUnTreAnFil[keep, ,keep.lib.sizes = FALSE ]
dim(dgeUnTreAnFilExp) # number of genes left after manual filtering and filterByExpr
## [1] 13164 36
boxplot(log(dgeUnTreAnFilExp$counts + 0.5), xaxt = "n", col = "turquoise4", xlab = "Samples") # look at raw log counts
boxplot(cpm(dgeUnTreAnFilExp$counts, log = TRUE), xaxt = "n", col = "turquoise4", xlab = "Samples") # look at log cpm normalised counts
plotSparsity(dgeUnTreAnFilExp$counts, normalized = TRUE)
Normalisation.
# calculate normalisation factors (does not normalise)
dgeUnTreAnFilExpNorm <- calcNormFactors(dgeUnTreAnFilExp)
barplot(dgeUnTreAnFilExpNorm$samples$norm.factors, names = colnames(dgeUnTreAnFilExpNorm), las = 2)
title("Barplot of normalisation factors")
Contrast: Auranofin treated vs untreated. Define group for contrast and covariate interactions
group <- interaction(dgeUnTreAnFilExpNorm$samples$treatment)
groupCol <- group # you could just use the original interaction vector, but I like to make a new one that I will overlay colours on.
levels(groupCol) <- c("#F8766D", "#00BFC4") # change levels to colour names/HEX codes.
# untreated == turquoise (#00BFC4)
# auranofin == tomato ("#F8766D")
groupCol <- as.character(groupCol) # make sure R knows the colour replacements are characters.
plotMDS(dgeUnTreAnFilExpNorm, col = groupCol, labels = NULL, pch = 20)
legend("topright", legend = c("auranofin", "untreated"), fill = c("#F8766D", "#00BFC4"))
# defining covariate interactions
cellLine <- interaction(dgeUnTreAnFilExpNorm$samples$cell_line)
cellLineCol <- cellLine
# Levels: PEO1ms PEO1s PEO4
levels(cellLineCol) <- c("darkorange", "cornflowerblue", "darkolivegreen3")
cellLineCol <- as.character(cellLineCol)
oxygen <- interaction(dgeUnTreAnFilExpNorm$samples$o2)
oxygenCol <- oxygen
# Levels: 5 21
levels(oxygenCol) <- c("aquamarine", "aquamarine4")
oxygenCol <- as.character(oxygenCol)
hrStatus <- interaction(dgeUnTreAnFilExpNorm$samples$hr_status)
hrStatusCol <- hrStatus
# Levels: HRD HRP
levels(hrStatusCol) <- c("brown1", "cadetblue1")
hrStatusCol <- as.character(hrStatusCol)
plotMDS(dgeUnTreAnFilExpNorm, col = cellLineCol, labels = NULL, pch = 20)
legend("topright", legend = c("PEO1ms", "PEO1s", "PEO4"), fill = c("darkorange", "cornflowerblue", "darkolivegreen3"))
plotMDS(dgeUnTreAnFilExpNorm, col = oxygenCol, labels = NULL, pch = 20)
legend("topright", legend = c("5", "21"), fill = c("aquamarine", "aquamarine4"))
plotMDS(dgeUnTreAnFilExpNorm, col = hrStatusCol, labels = NULL, pch = 20)
legend("topright", legend = c("HRD", "HRP"), fill = c("brown1", "cadetblue1"))
These MDS plots are extremely important. They show that most of
your variation comes from the cell line used and the HR status (which is
confounded with cell line). These covariates need to be included in our
design matrix, since we are not trying to investigate differences
between cell lines or HR status, we are primarily interested in drug
treatment and oxygen concentration. As HR status is so heavily
confounded by cell line (you only have one cell line to represent HRD),
I will not include it as a covariate in the design matrix (but if you
had more HRD cell lines, I would).
Define the design matrix.
# specify the model to be fitted
modelMatrixAn <- model.matrix(~ 0 + group + cellLine + oxygen)
# adjust for covariates including cellLine + oxygen
Voom transformation.
# voom transformation
voomUnTrAn <- voom(dgeUnTreAnFilExpNorm, modelMatrixAn, plot = TRUE)
# look at diagnostic plots
plotMDS(dgeUnTreAnFilExpNorm, col = groupCol, labels = NULL, pch = 20)
legend("topright", legend = c("auranofin", "untreated"), fill = c("#F8766D", "#00BFC4"))
Fit linear model.
# fitting a linear model using weighted least squares for each gene
fitUnTrAn <- lmFit(voomUnTrAn, modelMatrixAn)
head(coef(fitUnTrAn))
## groupauranofin groupuntreated cellLinePEO1s cellLinePEO4 oxygen21
## 653635 3.599856 3.535125 0.6199890492 0.05707597 -0.1871711
## 729737 4.402366 4.420981 0.5291908174 0.10503620 -0.1865550
## 102723897 4.044290 3.974194 0.5958142295 0.10313675 -0.1634215
## 100132287 2.761954 2.779897 -0.0002541628 0.27427793 -0.2241916
## 102465432 7.016830 6.739164 0.1144030979 0.19672967 -0.1829008
## 100133331 3.225128 3.237334 -0.3339100492 0.23180393 -0.1915055
Define contrast. *Compare treatment to untreated, meaning any DE/logFC you observe are in the treated group in reference to the untreated.
# comparison between groups (log-fold changes) are obtained as contrasts of these fitted linear models
contrUnTrAn <- makeContrasts(groupauranofin - groupuntreated, levels = modelMatrixAn)
contrUnTrAn
## Contrasts
## Levels groupauranofin - groupuntreated
## groupauranofin 1
## groupuntreated -1
## cellLinePEO1s 0
## cellLinePEO4 0
## oxygen21 0
tmpUnTrAn <- contrasts.fit(fitUnTrAn, contrasts = contrUnTrAn)
Statistical testing.
tmpUnTrAn <- eBayes(tmpUnTrAn, robust = TRUE) # use robust emperical bayes for statitsical testing to account for hypervariable genes
topTableUnTrAn <- topTable(tmpUnTrAn, sort.by = "P", n = Inf)
head(topTableUnTrAn, 20) # look at top 20 DE genes (sorted by p value)
## ENTREZID SYMBOL GENETYPE ENSEMBL logFC AveExpr
## 2583 2583 B4GALNT1 protein-coding ENSG00000135454 1.6714030 3.8823352
## 23231 23231 SEL1L3 protein-coding ENSG00000091490 1.0397791 5.6280912
## 3162 3162 HMOX1 protein-coding ENSG00000100292 5.4932390 6.1652117
## 1728 1728 NQO1 protein-coding ENSG00000181019 1.4298010 7.8916806
## 4199 4199 ME1 protein-coding ENSG00000065833 1.0355034 5.6611553
## 6696 6696 SPP1 protein-coding ENSG00000118785 5.3851186 1.6741337
## 7345 7345 UCHL1 protein-coding ENSG00000154277 1.7072072 1.9476067
## 2512 2512 FTL protein-coding ENSG00000087086 1.7494939 9.1575462
## 6507 6507 SLC1A3 protein-coding ENSG00000079215 -1.3188562 0.9264351
## 2539 2539 G6PD protein-coding ENSG00000160211 1.8029780 7.5566826
## 65983 65983 GRAMD2B protein-coding ENSG00000155324 -0.6615407 6.6010063
## 7086 7086 TKT protein-coding ENSG00000163931 0.7753176 8.1431254
## 3310 3310 HSPA6 protein-coding ENSG00000173110 3.6638308 1.5855824
## 6888 6888 TALDO1 protein-coding ENSG00000177156 0.7517363 6.6871927
## 4176 4176 MCM7 protein-coding ENSG00000166508 -0.4608884 7.6649269
## 4097 4097 MAFG protein-coding ENSG00000197063 0.7820889 6.8548194
## 5052 5052 PRDX1 protein-coding ENSG00000117450 0.6961976 8.6864745
## 8544 8544 PIR protein-coding ENSG00000087842 1.3952141 1.7043860
## 56666 56666 PANX2 protein-coding ENSG00000073150 2.0614431 3.2746960
## 2717 2717 GLA protein-coding ENSG00000102393 0.7407773 5.0151312
## t P.Value adj.P.Val B
## 2583 34.41842 8.166385e-31 1.075023e-26 59.25297
## 23231 27.22768 5.346174e-27 2.973543e-23 51.43116
## 3162 35.00579 6.776534e-27 2.973543e-23 50.65569
## 1728 29.40095 1.134634e-26 3.486357e-23 50.81690
## 4199 26.49246 1.474958e-26 3.486357e-23 50.47352
## 6696 28.58796 1.589041e-26 3.486357e-23 43.36094
## 7345 25.11976 1.051510e-25 1.962794e-22 46.92005
## 2512 29.19574 1.192826e-25 1.962794e-22 48.51598
## 6507 -23.20662 1.915232e-24 2.666013e-21 45.13028
## 2539 27.31374 2.025230e-24 2.666013e-21 45.73274
## 65983 -21.91009 1.544701e-23 1.848586e-20 43.59835
## 7086 21.81984 1.793294e-23 1.967243e-20 43.44656
## 3310 22.43794 4.549932e-23 4.607331e-20 39.08564
## 6888 20.81542 9.788593e-23 9.204074e-20 41.75291
## 4176 -20.25683 2.592164e-22 2.274883e-19 40.77098
## 4097 19.13574 4.079258e-21 3.356210e-18 38.04220
## 5052 18.39958 1.033631e-20 8.003952e-18 37.08080
## 8544 18.13256 1.303530e-20 9.533153e-18 35.35021
## 56666 19.55618 1.417423e-20 9.820506e-18 36.47236
## 2717 17.92807 1.937537e-20 1.275287e-17 36.49994
# Total number of DE genes based on adjusted p value < 0.05
length(which(topTableUnTrAn$adj.P.Val < 0.05))
## [1] 6334
# Total number of DE genes based on adjusted p value < 0.05 and |logFC| >= 1.0
length(which(topTableUnTrAn$adj.P.Val < 0.05 & abs(topTableUnTrAn$logFC) >= 1.0 ))
## [1] 107
# Total number of DE genes based on adjusted p value < 0.01 and |logFC| >= 1.0
length(which(topTableUnTrAn$adj.P.Val < 0.01 & abs(topTableUnTrAn$logFC) >= 1.0 ))
## [1] 104
dtUnTrAn <- decideTests(tmpUnTrAn, p.value = 0.05, lfc = 1.0)
summary(dtUnTrAn)
## groupauranofin - groupuntreated
## Down 45
## NotSig 13057
## Up 62
Plots for DE contrast: auranofin vs untreated.
volcanoplot(tmpUnTrAn, coef = "groupauranofin - groupuntreated", highlight = 20, names = tmpUnTrAn$genes$SYMBOL, main = "Auranofin - untreated")
limma::plotMA(tmpUnTrAn, coef = 1, status = dtUnTrAn[ ,"groupauranofin - groupuntreated"], hl.pch = 20, bg.col = "grey")
o <- order(tmpUnTrAn$p.value[,"groupauranofin - groupuntreated"])
x <- tmpUnTrAn$Amean
y <- tmpUnTrAn$coefficients[,"groupauranofin - groupuntreated"]
G <- tmpUnTrAn$genes$SYMBOL
text(x[o[1:20]], y[o[1:20]],labels = G[o[1:20]]) # to show the top 20 DE gene symbols
Gene ontology analyiss with Goana.
# subset de genes to include those of great significance and large-ish logFC
genesDeUnTrAn <- topTableUnTrAn %>%
mutate(bigLogFC = case_when(logFC < -1.0 | logFC > 1.0 ~ "keep",
TRUE ~ "discard")) %>%
dplyr::filter(bigLogFC == "keep") %>%
dplyr::filter(adj.P.Val < 0.05) %>%
dplyr::select(ENTREZID)
genesDeUnTrAn <- as.vector(genesDeUnTrAn)
## gene ontology
goUnTrAn <- goana(genesDeUnTrAn)
topGO(goUnTrAn)
## Term
## GO:0009605 response to external stimulus
## GO:1901700 response to oxygen-containing compound
## GO:0010033 response to organic substance
## GO:0070887 cellular response to chemical stimulus
## GO:0016491 oxidoreductase activity
## GO:0006950 response to stress
## GO:0018636 phenanthrene 9,10-monooxygenase activity
## GO:0016616 oxidoreductase activity, acting on the CH-OH group of donors, NAD or NADP as acceptor
## GO:0050896 response to stimulus
## GO:0071357 cellular response to type I interferon
## GO:0016614 oxidoreductase activity, acting on CH-OH group of donors
## GO:0071395 cellular response to jasmonic acid stimulus
## GO:0009753 response to jasmonic acid
## GO:0042221 response to chemical
## GO:0008219 cell death
## GO:0034340 response to type I interferon
## GO:0008207 C21-steroid hormone metabolic process
## GO:0006915 apoptotic process
## GO:0047086 ketosteroid monooxygenase activity
## GO:0032722 positive regulation of chemokine production
## Ont N ENTREZID P.ENTREZID
## GO:0009605 BP 2798 37 5.635978e-09
## GO:1901700 BP 1651 27 1.784031e-08
## GO:0010033 BP 3042 37 5.379891e-08
## GO:0070887 BP 3051 37 5.817260e-08
## GO:0016491 MF 737 17 1.113257e-07
## GO:0006950 BP 3868 42 1.126302e-07
## GO:0018636 MF 3 3 1.160131e-07
## GO:0016616 MF 128 8 2.303783e-07
## GO:0050896 BP 9030 70 2.782072e-07
## GO:0071357 BP 58 6 4.048219e-07
## GO:0016614 MF 140 8 4.583478e-07
## GO:0071395 BP 4 3 4.623884e-07
## GO:0009753 BP 4 3 4.623884e-07
## GO:0042221 BP 4410 44 5.630499e-07
## GO:0008219 BP 2105 28 6.965193e-07
## GO:0034340 BP 64 6 7.322055e-07
## GO:0008207 BP 37 5 1.009895e-06
## GO:0006915 BP 1892 26 1.063288e-06
## GO:0047086 MF 5 3 1.151827e-06
## GO:0032722 BP 70 6 1.250333e-06
goUnTrAn$adj.P.Val <- p.adjust(goUnTrAn$P.ENTREZID, method="BH")
goUnTrAn <- goUnTrAn %>%
rownames_to_column(var = "goTerm")