Background

RNA seq analysis for An Truong of Brenton group, CRUK CI.

Three cell lines analysed:

  • PE04
  • PE01s
  • PE01ms

Contrasts:

  • Treated with auranofin vs untreated (all oxygen conditions, all cell lines)
  • 5% vs 21% oxygen (hypoxic) (regardless of drug treatment)
  • Treated with auranofin vs untreated (at 21% oxygen - normal baseline)
  • Treated with auranofin vs untreated (at 5% oxygen - hypoxic baseline)

Data provided:

  • Raw counts generated using featureCounts by An
  • Raw counts and metadata sent by An via Brenton group Teams chat on 20230612 and 20230622.
  • Files stored locally at /Users/weir.a/Documents/PhD/Rstudio projects/brentonCopyNumberSignaturesOTTA_20230606/RNAseqForAn

Packages.

library(dplyr)
library(tidyverse)
library(ggplot2)
library(stringr)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(limma)
library(edgeR)
library(DESeq2)

Load data.

rawCounts <- read.csv("~/Documents/PhD/Rstudio projects/brentonCopyNumberSignaturesOTTA_20230606/RNAseqForAn/SLX-22950.fc_counts.csv")

meta <- read.csv("~/Documents/PhD/Rstudio projects/brentonCopyNumberSignaturesOTTA_20230606/RNAseqForAn/SLX22950_metadata.csv")

Wrangle data.

# reformat sequencing IDs
colnames(rawCounts) <- str_sub(colnames(rawCounts), start = 1, end = 31)

# make ID column rownames
rawCounts <- rawCounts %>% 
  column_to_rownames(var = "X") 

# add HR status column to metadata
meta <- meta %>% 
  dplyr::mutate(hr_status = case_when(cell_line == "PEO1s" ~ "HRD", 
                                      TRUE ~ "HRP"))

Annotate genes.

entrez <- rownames(rawCounts)
colGene <- c("ENTREZID", "SYMBOL", "GENETYPE", "ENSEMBL")
geneIdMapExAn <- AnnotationDbi::select(org.Hs.eg.db, keys = entrez, columns = colGene, keytype = "ENTREZID")
## 'select()' returned 1:many mapping between keys and columns
geneIdMapExAn <- geneIdMapExAn[!duplicated(geneIdMapExAn$ENTREZID), ]

geneIdMapExAn <- geneIdMapExAn[geneIdMapExAn$ENTREZID %in% entrez, ]

genesAn <-  geneIdMapExAn$ENTREZID

rawCountsFinal <- rawCounts[genesAn, ]

Create a DGE object for DE analysis.

dgeUnTreAn <- DGEList(counts = rawCountsFinal, genes = geneIdMapExAn, samples = meta, group = meta$treatment) 

Filter lowly expressed genes. Using manual filtering and the filterByExp() function

# filter lowly expressed genes
cutoff <- 4
drop <- which(apply(cpm(dgeUnTreAn), 1, max) < cutoff)
dgeUnTreAnFil <- dgeUnTreAn[-drop, , keep.lib.sizes = FALSE] 
dim(dgeUnTreAnFil) # number of genes left after manual filtering
## [1] 13330    36
keep <- filterByExpr(dgeUnTreAnFil, group = dgeUnTreAnFil$samples$treatment)

dgeUnTreAnFilExp <- dgeUnTreAnFil[keep, ,keep.lib.sizes = FALSE ]
dim(dgeUnTreAnFilExp) # number of genes left after manual filtering and filterByExpr
## [1] 13164    36
boxplot(log(dgeUnTreAnFilExp$counts + 0.5), xaxt = "n", col = "turquoise4", xlab = "Samples") # look at raw log counts

boxplot(cpm(dgeUnTreAnFilExp$counts, log = TRUE), xaxt = "n", col = "turquoise4", xlab = "Samples") # look at log cpm normalised counts

plotSparsity(dgeUnTreAnFilExp$counts, normalized = TRUE)

Normalisation.

# calculate normalisation factors (does not normalise)
dgeUnTreAnFilExpNorm <- calcNormFactors(dgeUnTreAnFilExp)

barplot(dgeUnTreAnFilExpNorm$samples$norm.factors, names = colnames(dgeUnTreAnFilExpNorm), las = 2)
title("Barplot of normalisation factors")

Contrast: Auranofin treated vs untreated. Define group for contrast and covariate interactions

group <-  interaction(dgeUnTreAnFilExpNorm$samples$treatment)
groupCol <- group # you could just use the original interaction vector, but I like to make a new one that I will overlay colours on.
levels(groupCol) <- c("#F8766D", "#00BFC4") # change levels to colour names/HEX codes.
# untreated == turquoise (#00BFC4)
# auranofin == tomato ("#F8766D")
groupCol <- as.character(groupCol) # make sure R knows the colour replacements are characters.

plotMDS(dgeUnTreAnFilExpNorm, col = groupCol, labels = NULL, pch = 20)
legend("topright", legend = c("auranofin", "untreated"), fill =  c("#F8766D", "#00BFC4"))

# defining covariate interactions

cellLine <- interaction(dgeUnTreAnFilExpNorm$samples$cell_line)
cellLineCol <- cellLine 
# Levels: PEO1ms PEO1s PEO4 
levels(cellLineCol) <- c("darkorange", "cornflowerblue", "darkolivegreen3") 
cellLineCol <- as.character(cellLineCol) 

oxygen <- interaction(dgeUnTreAnFilExpNorm$samples$o2)
oxygenCol <- oxygen
# Levels: 5 21
levels(oxygenCol) <- c("aquamarine", "aquamarine4")
oxygenCol <- as.character(oxygenCol)

hrStatus <- interaction(dgeUnTreAnFilExpNorm$samples$hr_status)
hrStatusCol <- hrStatus
# Levels: HRD HRP
levels(hrStatusCol) <- c("brown1", "cadetblue1")
hrStatusCol <- as.character(hrStatusCol)

plotMDS(dgeUnTreAnFilExpNorm, col = cellLineCol, labels = NULL, pch = 20)
legend("topright", legend = c("PEO1ms", "PEO1s", "PEO4"), fill =  c("darkorange", "cornflowerblue", "darkolivegreen3"))

plotMDS(dgeUnTreAnFilExpNorm, col = oxygenCol, labels = NULL, pch = 20)
legend("topright", legend = c("5", "21"), fill =  c("aquamarine", "aquamarine4"))

plotMDS(dgeUnTreAnFilExpNorm, col = hrStatusCol, labels = NULL, pch = 20)
legend("topright", legend = c("HRD", "HRP"), fill =  c("brown1", "cadetblue1"))

These MDS plots are extremely important. They show that most of your variation comes from the cell line used and the HR status (which is confounded with cell line). These covariates need to be included in our design matrix, since we are not trying to investigate differences between cell lines or HR status, we are primarily interested in drug treatment and oxygen concentration. As HR status is so heavily confounded by cell line (you only have one cell line to represent HRD), I will not include it as a covariate in the design matrix (but if you had more HRD cell lines, I would).

Define the design matrix.

# specify the model to be fitted
modelMatrixAn <- model.matrix(~ 0 + group + cellLine + oxygen)
# adjust for covariates including cellLine + oxygen

Voom transformation.

# voom transformation
voomUnTrAn <- voom(dgeUnTreAnFilExpNorm, modelMatrixAn, plot = TRUE) 

# look at diagnostic plots
plotMDS(dgeUnTreAnFilExpNorm, col = groupCol, labels = NULL, pch = 20)
legend("topright", legend = c("auranofin", "untreated"), fill =  c("#F8766D", "#00BFC4"))

Fit linear model.

# fitting a linear model using weighted least squares for each gene
fitUnTrAn <- lmFit(voomUnTrAn, modelMatrixAn)
head(coef(fitUnTrAn))
##           groupauranofin groupuntreated cellLinePEO1s cellLinePEO4   oxygen21
## 653635          3.599856       3.535125  0.6199890492   0.05707597 -0.1871711
## 729737          4.402366       4.420981  0.5291908174   0.10503620 -0.1865550
## 102723897       4.044290       3.974194  0.5958142295   0.10313675 -0.1634215
## 100132287       2.761954       2.779897 -0.0002541628   0.27427793 -0.2241916
## 102465432       7.016830       6.739164  0.1144030979   0.19672967 -0.1829008
## 100133331       3.225128       3.237334 -0.3339100492   0.23180393 -0.1915055

Define contrast. *Compare treatment to untreated, meaning any DE/logFC you observe are in the treated group in reference to the untreated.

# comparison between groups (log-fold changes) are obtained as contrasts of these fitted linear models
contrUnTrAn <- makeContrasts(groupauranofin - groupuntreated, levels = modelMatrixAn)
contrUnTrAn
##                 Contrasts
## Levels           groupauranofin - groupuntreated
##   groupauranofin                               1
##   groupuntreated                              -1
##   cellLinePEO1s                                0
##   cellLinePEO4                                 0
##   oxygen21                                     0
tmpUnTrAn <- contrasts.fit(fitUnTrAn, contrasts = contrUnTrAn)

Statistical testing.

tmpUnTrAn <- eBayes(tmpUnTrAn, robust = TRUE) # use robust emperical bayes for statitsical testing to account for hypervariable genes

topTableUnTrAn <- topTable(tmpUnTrAn, sort.by = "P", n = Inf)
head(topTableUnTrAn, 20) # look at top 20 DE genes (sorted by p value)
##       ENTREZID   SYMBOL       GENETYPE         ENSEMBL      logFC   AveExpr
## 2583      2583 B4GALNT1 protein-coding ENSG00000135454  1.6714030 3.8823352
## 23231    23231   SEL1L3 protein-coding ENSG00000091490  1.0397791 5.6280912
## 3162      3162    HMOX1 protein-coding ENSG00000100292  5.4932390 6.1652117
## 1728      1728     NQO1 protein-coding ENSG00000181019  1.4298010 7.8916806
## 4199      4199      ME1 protein-coding ENSG00000065833  1.0355034 5.6611553
## 6696      6696     SPP1 protein-coding ENSG00000118785  5.3851186 1.6741337
## 7345      7345    UCHL1 protein-coding ENSG00000154277  1.7072072 1.9476067
## 2512      2512      FTL protein-coding ENSG00000087086  1.7494939 9.1575462
## 6507      6507   SLC1A3 protein-coding ENSG00000079215 -1.3188562 0.9264351
## 2539      2539     G6PD protein-coding ENSG00000160211  1.8029780 7.5566826
## 65983    65983  GRAMD2B protein-coding ENSG00000155324 -0.6615407 6.6010063
## 7086      7086      TKT protein-coding ENSG00000163931  0.7753176 8.1431254
## 3310      3310    HSPA6 protein-coding ENSG00000173110  3.6638308 1.5855824
## 6888      6888   TALDO1 protein-coding ENSG00000177156  0.7517363 6.6871927
## 4176      4176     MCM7 protein-coding ENSG00000166508 -0.4608884 7.6649269
## 4097      4097     MAFG protein-coding ENSG00000197063  0.7820889 6.8548194
## 5052      5052    PRDX1 protein-coding ENSG00000117450  0.6961976 8.6864745
## 8544      8544      PIR protein-coding ENSG00000087842  1.3952141 1.7043860
## 56666    56666    PANX2 protein-coding ENSG00000073150  2.0614431 3.2746960
## 2717      2717      GLA protein-coding ENSG00000102393  0.7407773 5.0151312
##               t      P.Value    adj.P.Val        B
## 2583   34.41842 8.166385e-31 1.075023e-26 59.25297
## 23231  27.22768 5.346174e-27 2.973543e-23 51.43116
## 3162   35.00579 6.776534e-27 2.973543e-23 50.65569
## 1728   29.40095 1.134634e-26 3.486357e-23 50.81690
## 4199   26.49246 1.474958e-26 3.486357e-23 50.47352
## 6696   28.58796 1.589041e-26 3.486357e-23 43.36094
## 7345   25.11976 1.051510e-25 1.962794e-22 46.92005
## 2512   29.19574 1.192826e-25 1.962794e-22 48.51598
## 6507  -23.20662 1.915232e-24 2.666013e-21 45.13028
## 2539   27.31374 2.025230e-24 2.666013e-21 45.73274
## 65983 -21.91009 1.544701e-23 1.848586e-20 43.59835
## 7086   21.81984 1.793294e-23 1.967243e-20 43.44656
## 3310   22.43794 4.549932e-23 4.607331e-20 39.08564
## 6888   20.81542 9.788593e-23 9.204074e-20 41.75291
## 4176  -20.25683 2.592164e-22 2.274883e-19 40.77098
## 4097   19.13574 4.079258e-21 3.356210e-18 38.04220
## 5052   18.39958 1.033631e-20 8.003952e-18 37.08080
## 8544   18.13256 1.303530e-20 9.533153e-18 35.35021
## 56666  19.55618 1.417423e-20 9.820506e-18 36.47236
## 2717   17.92807 1.937537e-20 1.275287e-17 36.49994
# Total number of DE genes based on adjusted p value < 0.05
length(which(topTableUnTrAn$adj.P.Val < 0.05))
## [1] 6334
# Total number of DE genes based on adjusted p value < 0.05 and |logFC| >= 1.0
length(which(topTableUnTrAn$adj.P.Val < 0.05 & abs(topTableUnTrAn$logFC) >= 1.0 ))
## [1] 107
# Total number of DE genes based on adjusted p value < 0.01 and |logFC| >= 1.0
length(which(topTableUnTrAn$adj.P.Val < 0.01 & abs(topTableUnTrAn$logFC) >= 1.0 ))
## [1] 104
dtUnTrAn <- decideTests(tmpUnTrAn, p.value = 0.05, lfc = 1.0)
summary(dtUnTrAn)
##        groupauranofin - groupuntreated
## Down                                45
## NotSig                           13057
## Up                                  62

Plots for DE contrast: auranofin vs untreated.

volcanoplot(tmpUnTrAn, coef = "groupauranofin - groupuntreated", highlight = 20, names = tmpUnTrAn$genes$SYMBOL, main = "Auranofin - untreated")

limma::plotMA(tmpUnTrAn, coef = 1, status = dtUnTrAn[ ,"groupauranofin - groupuntreated"], hl.pch = 20, bg.col = "grey")
o <- order(tmpUnTrAn$p.value[,"groupauranofin - groupuntreated"])
x <- tmpUnTrAn$Amean
y <- tmpUnTrAn$coefficients[,"groupauranofin - groupuntreated"]
G <- tmpUnTrAn$genes$SYMBOL
text(x[o[1:20]], y[o[1:20]],labels = G[o[1:20]]) # to show the top 20 DE gene symbols

Gene ontology analyiss with Goana.

# subset de genes to include those of great significance and large-ish logFC
genesDeUnTrAn <- topTableUnTrAn %>%
  mutate(bigLogFC = case_when(logFC < -1.0 | logFC > 1.0 ~ "keep",
                              TRUE ~ "discard")) %>% 
  dplyr::filter(bigLogFC == "keep") %>% 
  dplyr::filter(adj.P.Val < 0.05) %>% 
  dplyr::select(ENTREZID) 
genesDeUnTrAn <- as.vector(genesDeUnTrAn)  

## gene ontology
goUnTrAn <- goana(genesDeUnTrAn)
topGO(goUnTrAn)
##                                                                                             Term
## GO:0009605                                                         response to external stimulus
## GO:1901700                                                response to oxygen-containing compound
## GO:0010033                                                         response to organic substance
## GO:0070887                                                cellular response to chemical stimulus
## GO:0016491                                                               oxidoreductase activity
## GO:0006950                                                                    response to stress
## GO:0018636                                              phenanthrene 9,10-monooxygenase activity
## GO:0016616 oxidoreductase activity, acting on the CH-OH group of donors, NAD or NADP as acceptor
## GO:0050896                                                                  response to stimulus
## GO:0071357                                                cellular response to type I interferon
## GO:0016614                              oxidoreductase activity, acting on CH-OH group of donors
## GO:0071395                                           cellular response to jasmonic acid stimulus
## GO:0009753                                                             response to jasmonic acid
## GO:0042221                                                                  response to chemical
## GO:0008219                                                                            cell death
## GO:0034340                                                         response to type I interferon
## GO:0008207                                                 C21-steroid hormone metabolic process
## GO:0006915                                                                     apoptotic process
## GO:0047086                                                    ketosteroid monooxygenase activity
## GO:0032722                                           positive regulation of chemokine production
##            Ont    N ENTREZID   P.ENTREZID
## GO:0009605  BP 2798       37 5.635978e-09
## GO:1901700  BP 1651       27 1.784031e-08
## GO:0010033  BP 3042       37 5.379891e-08
## GO:0070887  BP 3051       37 5.817260e-08
## GO:0016491  MF  737       17 1.113257e-07
## GO:0006950  BP 3868       42 1.126302e-07
## GO:0018636  MF    3        3 1.160131e-07
## GO:0016616  MF  128        8 2.303783e-07
## GO:0050896  BP 9030       70 2.782072e-07
## GO:0071357  BP   58        6 4.048219e-07
## GO:0016614  MF  140        8 4.583478e-07
## GO:0071395  BP    4        3 4.623884e-07
## GO:0009753  BP    4        3 4.623884e-07
## GO:0042221  BP 4410       44 5.630499e-07
## GO:0008219  BP 2105       28 6.965193e-07
## GO:0034340  BP   64        6 7.322055e-07
## GO:0008207  BP   37        5 1.009895e-06
## GO:0006915  BP 1892       26 1.063288e-06
## GO:0047086  MF    5        3 1.151827e-06
## GO:0032722  BP   70        6 1.250333e-06
goUnTrAn$adj.P.Val <- p.adjust(goUnTrAn$P.ENTREZID, method="BH")

goUnTrAn <- goUnTrAn %>% 
  rownames_to_column(var = "goTerm")