library(Biobase)
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
library(ggpubr)
## Loading required package: ggplot2
library(data.table)
library(DT)

GSE37614

You’re right, I don’t see any CEL files. God only knows what they used back in those days. I found two zipped directories containing both ‘raw’ and ‘processed’ files at https://www.ebi.ac.uk/arrayexpress/files/E-GEOD-37614/. I downloaded all available files:

wget https://www.ebi.ac.uk/arrayexpress/files/E-GEOD-37614/E-GEOD-37614.processed.1.zip && mkdir processed && unzip E-GEOD-37614.processed.1.zip && mv GSM* processed/
wget https://www.ebi.ac.uk/arrayexpress/files/E-GEOD-37614/E-GEOD-37614.raw.1.zip && mkdir raw && unzip E-GEOD-37614.raw.1.zip && mv GSM* raw/
wget https://www.ebi.ac.uk/arrayexpress/files/E-GEOD-37614/E-GEOD-37614.sdrf.txt
wget https://www.ebi.ac.uk/arrayexpress/files/A-GEOD-10558/A-GEOD-10558.adf.txt

Your directory tree should look like this:

├── [3.9M]  A-GEOD-10558.adf.txt
├── [3.1M]  E-GEOD-37614.processed.1.zip
├── [7.4M]  E-GEOD-37614.raw.1.zip
├── [ 15K]  E-GEOD-37614.sdrf.txt
├── [4.0K]  processed
│   ├── [405K]  GSM923165_sample_table.txt
│   ├── [405K]  GSM923186_sample_table.txt
│   ├── [405K]  GSM923193_sample_table.txt
│   ├── [405K]  GSM923194_sample_table.txt
│   ├── [405K]  GSM923195_sample_table.txt
│   ├── [405K]  GSM923205_sample_table.txt
│   ├── [405K]  GSM923206_sample_table.txt
│   ├── [405K]  GSM923207_sample_table.txt
│   ├── [405K]  GSM923208_sample_table.txt
│   ├── [405K]  GSM923209_sample_table.txt
│   ├── [405K]  GSM923210_sample_table.txt
│   ├── [405K]  GSM923211_sample_table.txt
│   ├── [405K]  GSM923236_sample_table.txt
│   ├── [405K]  GSM923237_sample_table.txt
│   ├── [405K]  GSM923238_sample_table.txt
│   ├── [405K]  GSM923239_sample_table.txt
│   ├── [405K]  GSM923240_sample_table.txt
│   ├── [405K]  GSM923241_sample_table.txt
│   └── [405K]  GSM923242_sample_table.txt
└── [4.0K]  raw
    ├── [1009K]  GSM923165_raw_TNBC.TB123.b1.txt
    ├── [1009K]  GSM923186_raw_TNBC.TB125.b1.txt
    ├── [1.3M]  GSM923193_raw_TNBC.TB134.txt
    ├── [1009K]  GSM923194_raw_ER_.TB71.b1.txt
    ├── [1009K]  GSM923195_raw_ER_.TB75.b1.txt
    ├── [1009K]  GSM923205_raw_ER_.TB130.b1.txt
    ├── [1.3M]  GSM923206_raw_ER_.TB98.txt
    ├── [1009K]  GSM923207_raw_ER_.TB120.b1.txt
    ├── [1009K]  GSM923208_raw_Her2_.TB76.b1.txt
    ├── [1.3M]  GSM923209_raw_Her2_.TB117.txt
    ├── [1009K]  GSM923210_raw_Her2_.TB136.b1.txt
    ├── [1.3M]  GSM923211_raw_Her2_.TB122.txt
    ├── [1009K]  GSM923236_raw_TNBC.TB160.b2.txt
    ├── [1009K]  GSM923237_raw_TNBC.TB162.b2.txt
    ├── [1009K]  GSM923238_raw_TNBC.TB164.b2.txt
    ├── [1009K]  GSM923239_raw_ER_.TB163.b2.txt
    ├── [1009K]  GSM923240_raw_ER_.TB165.b2.txt
    ├── [1009K]  GSM923241_raw_Her2_.TB129.b2.txt
    └── [1009K]  GSM923242_raw_Her2_ER_.TB148.b2.txt

2 directories, 42 files

I don’t care much for processed data so let’s start with the raw data :)

Raw Data Staging

Load Metadata

Stage the SDRF file. We will use the column denoting file names to read in the raw files.

SDRF <- read.delim("/data/projects/cmurray/E-GEOD-37614.sdrf.txt")
rownames(SDRF) <- SDRF$Array.Data.File
DT::datatable(SDRF, options = list(scrollX = TRUE, pageLength = 19, scroller = TRUE))

Load Signal files

These are not CEL files so we can’t use oligo::read.celfiles(), just write a function to stick each file together in a dataframe :)

gen_raw <- function(path, pattern){
  files = list.files(path, pattern, full.names=T, recursive=T, include.dirs=T)
  mat = as.data.frame(do.call(cbind, lapply(files, function(x) data.table::fread(x, stringsAsFactors=FALSE))))
  ID_REF = mat[,1]
  mat = as.data.frame(mat[,!grepl("ID_REF", colnames(mat))])
  rownames(mat) = ID_REF
  return(mat)
}

raw.dat <- gen_raw("/data/projects/cmurray/raw/", "\\.txt$")
dim(raw.dat)
## [1] 47231    23

The column names are nonsensical. Next step is to use to metadata file to append the correct sample names to each column.

file_names <- list.files("/data/projects/cmurray/raw/", "\\.txt$", full.names=T, recursive=T, include.dirs=T)
file_names <- sub(".*/", "", file_names)
head(file_names)
## [1] "GSM923165_raw_TNBC.TB123.b1.txt" "GSM923186_raw_TNBC.TB125.b1.txt"
## [3] "GSM923193_raw_TNBC.TB134.b1.txt" "GSM923193_raw_TNBC.TB134.b2.txt"
## [5] "GSM923194_raw_ER_.TB71.b1.txt"   "GSM923195_raw_ER_.TB75.b1.txt"
sample_names <- sub("\\_.*", "", file_names)
head(sample_names)
## [1] "GSM923165" "GSM923186" "GSM923193" "GSM923193" "GSM923194" "GSM923195"
length(sample_names) == ncol(raw.dat) #FALSE
## [1] TRUE

There are more columns in raw.dat (23) than there are sample names in the annotation file (19).

Pain

They did something shitty here (not sure how your GEO dataset handled this) - for all of the samples with a b1,b2 batch annotation, there are two columns for ‘signal’ in these files. I’m not 100% sure how to handle this yet. Take a look below to see what I mean:

pwd
/data/projects/cmurray/raw/
head -n 4 *.txt
==> GSM923165_raw_TNBC.TB123.b1.txt <==
ID_REF  VALUE
ILMN_1653768    388.184
ILMN_1668377    353.682
ILMN_1824784    369.376

==> GSM923186_raw_TNBC.TB125.b1.txt <==
ID_REF  VALUE
ILMN_1653768    336.9
ILMN_1668377    374.261
ILMN_1824784    371.367

==> GSM923193_raw_TNBC.TB134.txt <==
ID_REF  VALUE   VALUE2
ILMN_1653768    345.349 206.512
ILMN_1668377    344.038 248.29
ILMN_1824784    402.025 251.595

==> GSM923194_raw_ER_.TB71.b1.txt <==
ID_REF  VALUE
ILMN_1653768    383.756
ILMN_1668377    341.217
ILMN_1824784    359.835

==> GSM923195_raw_ER_.TB75.b1.txt <==
ID_REF  VALUE
ILMN_1653768    308.575
ILMN_1668377    378.362
ILMN_1824784    372.161

==> GSM923205_raw_ER_.TB130.b1.txt <==
ID_REF  VALUE
ILMN_1653768    297.507
ILMN_1668377    340.106
ILMN_1824784    423.984

==> GSM923206_raw_ER_.TB98.txt <==
ID_REF  VALUE   VALUE2
ILMN_1653768    354.972 203.633
ILMN_1668377    371.799 191.744
ILMN_1824784    387.36  297.442

You can see it in the file names too, any file that does not end with .b1.txt/.b2.txt are ones with two batches present in the file.


Thinking out loud we could split the files in bash and update the sdrf file accordingly.

1. Isolate the offending files

awk 'BEGIN{FS="\t"}; {if($4=="b1,b2") print $0}' E-GEOD-37614.sdrf.txt > b1.b2_batches.sdrf
wc -l b1.b2_batches.sdrf
4
awk 'BEGIN{FS="\t"}; {print $29}' b1.b2_batches.sdrf > little_fuckers.txt
cat little_fuckers.txt
GSM923211_raw_Her2_.TB122.txt
GSM923209_raw_Her2_.TB117.txt
GSM923206_raw_ER_.TB98.txt
GSM923193_raw_TNBC.TB134.txt

2. Split 2 batch file into 2 files

Code below takes the files which have 3 fields (NF = 3) and returns (NF=1, NF=2) for Batch 1, and (NF=1, NF=3) for batch 2.

cd raw/
while read -r line; do fn=$(basename $line .txt); awk '{print $(NF-2)" "$NF}' $line > ${fn}.b2.txt; awk '{print $(NF-2)" "$(NF-1)}' $line > ${fn}.b1.txt; done < ../little_fuckers.txt

Take a look at the directory:

GSM923165_raw_TNBC.TB123.b1.txt  GSM923206_raw_ER_.TB98.b2.txt     GSM923211_raw_Her2_.TB122.b2.txt
GSM923186_raw_TNBC.TB125.b1.txt  GSM923206_raw_ER_.TB98.txt        GSM923211_raw_Her2_.TB122.txt
GSM923193_raw_TNBC.TB134.b1.txt  GSM923207_raw_ER_.TB120.b1.txt    GSM923236_raw_TNBC.TB160.b2.txt
GSM923193_raw_TNBC.TB134.b2.txt  GSM923208_raw_Her2_.TB76.b1.txt   GSM923237_raw_TNBC.TB162.b2.txt
GSM923193_raw_TNBC.TB134.txt     GSM923209_raw_Her2_.TB117.b1.txt  GSM923238_raw_TNBC.TB164.b2.txt
GSM923194_raw_ER_.TB71.b1.txt    GSM923209_raw_Her2_.TB117.b2.txt  GSM923239_raw_ER_.TB163.b2.txt
GSM923195_raw_ER_.TB75.b1.txt    GSM923209_raw_Her2_.TB117.txt     GSM923240_raw_ER_.TB165.b2.txt
GSM923205_raw_ER_.TB130.b1.txt   GSM923210_raw_Her2_.TB136.b1.txt  GSM923241_raw_Her2_.TB129.b2.txt
GSM923206_raw_ER_.TB98.b1.txt    GSM923211_raw_Her2_.TB122.b1.txt  GSM923242_raw_Her2_ER_.TB148.b2.txt

For example, GSM923193_raw_TNBC.TB134.txt now has a batch 1 and batch 2 file. I assumed that VALUE is b1, and VALUE2 is batch 2. This is actually a huge assumption on my behalf but sure look we can see how it looks downstream - don’t over think it for now (me totally over thinking it).

3. Remove the original 2 signal files

while read -r line; do rm $line; done < ../little_fuckers.txt

Load Updated Metadata

We need to update the sdrf file to reflect the new files we made, and the ones we deleted.

Not a chance I’m able to code this dynamically so I just did it in excel. I’ll send you the updated file E-GEOD-37614.sdrf_fixed.txt :)

SDRF <- read.delim("/data/projects/cmurray/E-GEOD-37614.sdrf_fixed.txt")
rownames(SDRF) <- SDRF$Array.Data.File
DT::datatable(SDRF, options = list(scrollX = TRUE, pageLength = 23, scroller = TRUE))

Create Count Matrix, part II

## same as first attempt tbh
gen_raw <- function(path, pattern){
  files = list.files(path, pattern, full.names=T, recursive=T, include.dirs=T)
  mat = as.data.frame(do.call(cbind, lapply(files, function(x) data.table::fread(x, stringsAsFactors=FALSE))))
  ID_REF = mat[,1]
  mat = as.data.frame(mat[,!grepl("ID_REF", colnames(mat))])
  rownames(mat) = ID_REF
  return(mat)
}

raw.dat <- gen_raw("/data/projects/cmurray/raw/", "\\.txt$")

## attach sample names to cols
file_names <- list.files("/data/projects/cmurray/raw/", "\\.txt$", full.names=T, recursive=T, include.dirs=T)
file_names <- sub(".*/", "", file_names)
sample_names <- sub("\\_.*", "", file_names)
length(sample_names) == ncol(raw.dat)
## [1] TRUE

TRUE thank fuck.

Append ID’s to matrix

Super super important that you match these correctly. match() function for the win here.

# we need to append batch ID's to the samplenames - sorry but has to happen for unique colnames. 
SDRF$samp_names <- paste(SDRF$Assay.Name, SDRF$Characteristics..batch., sep=":")
SDRF$samp_names
##  [1] "GSM923242:b2" "GSM923241:b2" "GSM923240:b2" "GSM923239:b2" "GSM923238:b2"
##  [6] "GSM923237:b2" "GSM923236:b2" "GSM923211:b1" "GSM923211:b2" "GSM923210:b1"
## [11] "GSM923209:b1" "GSM923209:b2" "GSM923208:b1" "GSM923207:b1" "GSM923206:b1"
## [16] "GSM923206:b2" "GSM923205:b1" "GSM923195:b1" "GSM923194:b1" "GSM923193:b1"
## [21] "GSM923193:b2" "GSM923186:b1" "GSM923165:b1"
# below is most important part to make sure columns match file contents
SDRF <- SDRF[match(file_names, SDRF$Array.Data.File),]
colnames(raw.dat) <- SDRF$samp_names

## the below is correct, check it against the head -n 4 *.txt file command above
head(raw.dat[1:5, 1:5])
##              GSM923165:b1 GSM923186:b1 GSM923193:b1 GSM923193:b2 GSM923194:b1
## ILMN_1653768      388.184      336.900      345.349      206.512      383.756
## ILMN_1668377      353.682      374.261      344.038      248.290      341.217
## ILMN_1824784      369.376      371.367      402.025      251.595      359.835
## ILMN_2066534      417.759      317.597      427.092      273.403      423.359
## ILMN_1731640      375.803      418.458      404.662      341.895      315.882

finally done pissing around with formatting - fuck those guys.

Create Eset from scratch

Fairly sure that ExpressionSet objects are S4 objects (ew) but the essentially hold ‘slots’ for data. no reason we cant make our own one.

SDRF <- Biobase::AnnotatedDataFrame(SDRF)
rownames(SDRF) <- colnames(raw.dat)
my_set <- Biobase::ExpressionSet(assayData = as.matrix(raw.dat), phenoData = SDRF)
my_set
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 47231 features, 23 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: GSM923165:b1 GSM923186:b1 ... GSM923242:b2 (23 total)
##   varLabels: Source.Name Comment..Sample_source_name. ... samp_names
##     (41 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:

its missing a few slots but has everything we need to do stuff.

Data Processing

Check sample distributions, fix via normalization, filter if we can.

Raw Data Boxplots

Ok we have raw data so I fully expect the boxplot to look shit

sml <- as.character(as.numeric(as.factor(SDRF@data$Characteristics..class.)))
gs <- factor(sml)
groups <- SDRF@data$Characteristics..class. # HER2+ER+ sample? 
stupid_orientation <- unique(groups)
stupid_orientation <- c(stupid_orientation[2], stupid_orientation[3], stupid_orientation[4], stupid_orientation[1])
levels(gs) <- stupid_orientation
my_set@phenoData$group <- gs
ord <- order(gs)

palette <- palette(c("#1B9E77", "#7570B3", "#E7298A", "#A6761D"))

par(mar=c(7,4,2,1))
title <- "GSE37614 Log2 Raw Signals"
boxplot(log2(exprs(my_set[,ord])), boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
legend("topleft", stupid_orientation, fill=palette(), bty="n")

shit indeed!

Quantile normalization

Usually do this with a dedicated package for microarrays (unfortunately not compatible with our object) but again no reason we cant do it from scratch.

quantile_normalisation <- function(df){
  df_rank <- apply(df,2,rank,ties.method="min")
  df_sorted <- data.frame(apply(df, 2, sort))
  df_mean <- apply(df_sorted, 1, mean)

  index_to_mean <- function(my_index, my_mean){
    return(my_mean[my_index])
  }
df_final <- apply(df_rank, 2, index_to_mean, my_mean=df_mean)
  rownames(df_final) <- rownames(df)
  return(df_final)
}

qnorm <- quantile_normalisation(raw.dat)

Qnorm Boxplots

par(mar=c(7,4,2,1))
title <- "GSE37614 Log2 Quantile Norm McDonald"
boxplot(log2(qnorm[,ord]), boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
legend("topleft", stupid_orientation, fill=palette(), bty="n")

nice B-)

Update ExpressionSet Obj

we can bin the raw data and replace it with the quantile normalized data.

exprs(my_set) <- qnorm

## check that actually saved
par(mar=c(7,4,2,1))
title <- "GSE37614 Log2 Quantile Norm McDonald"
boxplot(log2(exprs(my_set[,ord])), boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
legend("topleft", stupid_orientation, fill=palette(), bty="n")

plot the same as the last one when we switch in log2(exprs(my_set[,ord])) so yep it did save to the eset obj properly.

Expression Distribution

Sanity check here to see if our normlized expression distribution looks like the one you produced with the preprocessed GEO data. I’m happy enough that it follows the same dist. The only difference is your plots y axis max goes to 0.0006 - mine goes to 0.008.

ex <- exprs(my_set)
par(mar=c(4,4,2,1))
title <- paste ("Normalized Expression Density", sep ="")
limma::plotDensities(ex, group=gs, main=title, legend ="topright")

Probe Filtering

Little peak on the left we want to take care of.

# Probe filtering
medians <- rowMedians(ex)

hist(log2(medians + 1), 150, col = "cornsilk1", freq = FALSE, 
             main = "Histogram of the median intensities", 
             border = "antiquewhite4", xlab = "Median intensities")

Couldn’t get the min threshold to work for some reason.

Remove multimapping probes

probes <- rownames(my_set)

library(illuminaHumanv4.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
## 
##     first, second
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
## 
##     shift
## Loading required package: org.Hs.eg.db
## 
## 
annotation <- AnnotationDbi::select(illuminaHumanv4.db,
                                    keys = probes,
                                    columns = c("SYMBOL", "GENENAME"),
                                    keytype = "PROBEID")
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
annotation <- subset(annotation, !is.na(SYMBOL))

## resolve multi maps. 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
ann_grouped <- group_by(annotation, PROBEID)
ann_sum <- dplyr::summarize(ann_grouped, no_of_matches = n_distinct(SYMBOL))

ann_flt <- filter(ann_sum, no_of_matches > 1)

remove_id <- (probes %in% ann_flt$PROBEID)

table(remove_id)
## remove_id
## FALSE  TRUE 
## 45526  1705

slightly concerning here, bad that we have 45526 genes in our dataset. needs to come down to ~15000-19000

I’ll try to do this manually because I’m really hampered by not having a legitimate expressionset object.

annotations <- read.table("/data/projects/cmurray/A-GEOD-10558.adf.txt", header=T, sep="\t")

mart <- biomaRt::useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
attr <- biomaRt::listAttributes(mart)
genbank_map <- biomaRt::getBM(attributes = c("refseq_mrna", "hgnc_symbol", "gene_biotype"), mart = mart, useCache = FALSE)
genbank_map <- genbank_map[!(is.na(genbank_map$refseq_mrna) | genbank_map$refseq_mrna==""),]
genbank_map <- subset(genbank_map, genbank_map$gene_biotype == "protein_coding")

# pop off the version ID
annotations$genbank <- gsub("\\.\\d+$", "", annotations$genbank)
# merge
master <- merge(genbank_map, annotations, by.x="refseq_mrna", by.y="genbank")

now we have a master df with probe IDs and the gene symbols for protein coding genes. I feel a bit better now about performing the subset.

Tbh fuck the expression set, I dont think we will need it going forward.

master_sub <- master[,c(4,2)]
master_sub <- master_sub[!(is.na(master_sub$hgnc_symbol) | master_sub$hgnc_symbol==""),]

# remove multi mapping probes
master_sub <- master_sub %>% distinct(hgnc_symbol, .keep_all = T)
dim(master_sub)
## [1] 17953     2

thank christ, that took me ages ngl. i feel much better with this amount of probes that have 1-1 mapping with a gene

qnorm <- subset(qnorm, rownames(qnorm) %in% master_sub$PROBE)
dim(qnorm)
## [1] 17943    23

fix the discrepancy in dimensions so we can use gene names going forward.

master_sub <- master_sub[match(rownames(qnorm), master_sub$PROBE),]
rownames(qnorm) <- master_sub$hgnc_symbol
head(qnorm[1:3,1:3])
##         GSM923165:b1 GSM923186:b1 GSM923193:b1
## TMEM74      363.8960     281.6070     362.3042
## TMSB15B     318.9158     388.6855     337.8624
## FSTL5       197.0318     203.0733     169.8640
medians <- rowMedians(qnorm)

hist(log2(medians + 1), 150, col = "cornsilk1", freq = FALSE, 
             main = "Histogram of the median intensities", 
             border = "antiquewhite4", xlab = "Median intensities")

still need to get rid of that bastarding peak - cant have a bimodal distribution.

medians <- log2(rowMedians(qnorm) + 1)
qnorm <- as.data.frame(qnorm)
qnorm$medians <- medians

qnorm <- subset(qnorm, qnorm$medians > 7.9)
qnorm <- as.matrix(qnorm[,1:(ncol(qnorm)-1)])

hist(log2(qnorm + 1), 150, col = "cornsilk1", freq = FALSE, 
             main = "Histogram of the median intensities", 
             border = "antiquewhite4", xlab = "Median intensities")

finally

Exploratory Data Analysis

ok ok ok ok time to check out these batch effects - the last plots i showed you did not have filtering I just went straight in

PCA Cancer Subtypes

You can see two groups forming here so yeah, batch at large

library(ggpubr)
library(ggplot2)

PCA <- prcomp(t(log2(qnorm)), scale = TRUE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
sd_ratio <- sqrt(percentVar[2] / percentVar[1])

dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
                    Subtype = my_set$group)


ggscatter(dataGG, x="PC1", y="PC2",
                  color = "Subtype", palette = c("dodgerblue4", "darkorange2", "chartreuse", "purple"),
                  title = "PCA plot log-transformed quantile normalized expression data",
                  subtitle = "Cancer Subtypes",
                  xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
                  ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
                  ellipse = FALSE, star.plot = FALSE, # HER2+ER+ sample n=1 - star, ellipse breaks
                  ggtheme = theme_bw()) + 
                  theme(legend.position = "right") + 
                  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

PCA Batch Effects

dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
                    Batch = my_set$Characteristics..batch.)


ggscatter(dataGG, x="PC1", y="PC2",
                  color = "Batch", palette = c("dodgerblue4", "darkorange2", "chartreuse", "purple"),
                  title = "PCA plot log-transformed quantile normalized expression data",
                  subtitle = "Batch Effects (Batch 1, Batch 2)",
                  xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
                  ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
                  ellipse = TRUE, star.plot = TRUE,
                  ggtheme = theme_bw()) + 
                  theme(legend.position = "right") + 
                  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

boom - money shot for the ol thesis writeup

EigenCor

Just to prove the same point again, lets use pearsons correlation between the Principal components and the Batch variable and Cancer subtype variable

meta <- my_set@phenoData@data
meta <- meta[,c(4, 5)]
colnames(meta) <- c("Batch", "Cancer_Subtype")


pca <- PCAtools::pca(log2(qnorm + 1), metadata = meta)

PCAtools::eigencorplot(pca, components = PCAtools::getComponents(pca, 1:10),
    metavars = c('Batch','Cancer_Subtype'),
    col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
    cexCorval = 1.2, fontCorval = 2, posLab = 'all', rotLabX = 45,
    scale = TRUE,
    main = bquote(PC ~ Pearson ~ r^2 ~ clinical ~ correlates),
    plotRsquared = TRUE,
    corFUN = 'pearson',
    corUSE = 'pairwise.complete.obs',
    corMultipleTestCorrection = 'BH',
    signifSymbols = c('****', '***', '**', '*', ''),
    signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))
## Warning in PCAtools::eigencorplot(pca, components =
## PCAtools::getComponents(pca, : Batch is not numeric - please check the source
## data as non-numeric variables will be coerced to numeric
## Warning in PCAtools::eigencorplot(pca, components =
## PCAtools::getComponents(pca, : Cancer_Subtype is not numeric - please check the
## source data as non-numeric variables will be coerced to numeric

## eigencorplot shows this is piss poor, batch totally dominates the dataset for now. 

Batch Correction

code is straight forward thankfully

design <- model.matrix(~0 + groups)
rm_batch <- limma::removeBatchEffect(log2(qnorm + 1), batch = my_set@phenoData@data$Characteristics..batch., design = design)

PCA Cancer Subtype (Corrected)

PCA <- prcomp(t(log2(rm_batch)), scale = TRUE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
sd_ratio <- sqrt(percentVar[2] / percentVar[1])

dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
                    Subtype = my_set$group)


ggscatter(dataGG, x="PC1", y="PC2",
                  color = "Subtype", palette = c("dodgerblue4", "darkorange2", "chartreuse", "purple"),
                  title = "PCA plot log-transformed batch corrected data",
                  subtitle = "Cancer Subtypes",
                  xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
                  ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
                  ellipse = F, star.plot = F, 
                  ggtheme = theme_bw()) + 
                  theme(legend.position = "right") + 
                  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

PCA Batch Effect (Corrected)

dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
                    Batch = my_set$Characteristics..batch.)


ggscatter(dataGG, x="PC1", y="PC2",
                  color = "Batch", palette = c("dodgerblue4", "darkorange2", "chartreuse", "purple"),
                  title = "PCA plot log-transformed batch corrected data",
                  subtitle = "Batch Effects: Batch1, Batch2",
                  xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
                  ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
                  ellipse = T, star.plot = T, 
                  ggtheme = theme_bw()) + 
                  theme(legend.position = "right") + 
                  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

# fuck yes

DEA

Quickly check a diff exp test for sanity

colnames(design)[1:4] <- levels(gs)
fit <- limma::lmFit(qnorm, design)
fit <- limma::eBayes(fit, trend=T, robust=T)
res <- limma::decideTests(fit)
summary(res)
##          ER+ Her2+ Her2+ER+  TNBC
## Down       0     0        0     0
## NotSig    48   106      858    78
## Up     14846 14788    14036 14816

? i dont know enough about limma tbh - seems very high

cts <- paste(unique(groups), c(tail(groups, -1), head(groups, 1)), sep="-")
#cont.matrix <- limma::makeContrasts(contrasts=cts, levels=design)

Error in limma::makeContrasts(contrasts = cts, levels = design) : The levels must by syntactically valid names in R, see help(make.names). Non-valid names: ER+,Her2+,Her2+ER+

complains about the +

colnames(design) <- gsub("\\+", "", colnames(design))
cts <- gsub("\\+", "", cts)

cont.matrix <- limma::makeContrasts(contrasts=cts, levels=design)
fit2 <- limma::contrasts.fit(fit, cont.matrix)
## Warning in limma::contrasts.fit(fit, cont.matrix): row names of contrasts don't
## match col names of coefficients
fit2 <- limma::eBayes(fit2, 0.01)
## Warning in .ebayes(fit = fit, proportion = proportion, stdev.coef.lim =
## stdev.coef.lim, : Estimation of var.prior failed - set to default value
#Create the top table for the comparison (coef) we want
top <- limma::topTable(fit2, coef=2, adjust="fdr", sort.by="B", number = Inf)
nrow(top)
## [1] 14894
#Top Table of significant genes only
up_reg <- subset(top, logFC > 0.25 & P.Value < 0.05)
nrow(up_reg)
## [1] 335
up_reg_names <- rownames(up_reg)
e <- qnorm
sig_exprs <- e[rownames(e) %in% up_reg_names,]

sig_exprs <- t(sig_exprs)
sig_exprs <- scale(sig_exprs, center=T, scale = T)
sig_exprs <- t(sig_exprs)

annot_col <- data.frame(row.names = colnames(qnorm), Group = factor(my_set$group))

cols <- c("red", "blue", "green", "black")
names(cols) <- unique(groups)
annot_colors <- list(Group = cols)

library(viridis)
## Loading required package: viridisLite
pheatmap::pheatmap(sig_exprs, cluster_cols = T, cluster_rows = T, show_rownames = F,
                   show_colnames = T, annotation_col = annot_col,
                   annotation_colors = annot_colors,
                   col = hcl.colors(100, "Purple-Green",rev=F),
                   cutree_cols = 4)

I was generating rubbish because I was not specifying the coefficient for topTable correctly. make sure you know exactly the contrasts you are extracting with the coef= parameter (Im useless at limma soz cant help)