library(Biobase)## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(ggpubr)## Loading required package: ggplot2
library(data.table)
library(DT)You’re right, I don’t see any CEL files. God only knows what they used back in those days. I found two zipped directories containing both ‘raw’ and ‘processed’ files at https://www.ebi.ac.uk/arrayexpress/files/E-GEOD-37614/. I downloaded all available files:
wget https://www.ebi.ac.uk/arrayexpress/files/E-GEOD-37614/E-GEOD-37614.processed.1.zip && mkdir processed && unzip E-GEOD-37614.processed.1.zip && mv GSM* processed/
wget https://www.ebi.ac.uk/arrayexpress/files/E-GEOD-37614/E-GEOD-37614.raw.1.zip && mkdir raw && unzip E-GEOD-37614.raw.1.zip && mv GSM* raw/
wget https://www.ebi.ac.uk/arrayexpress/files/E-GEOD-37614/E-GEOD-37614.sdrf.txt
wget https://www.ebi.ac.uk/arrayexpress/files/A-GEOD-10558/A-GEOD-10558.adf.txtYour directory tree should look like this:
├── [3.9M] A-GEOD-10558.adf.txt
├── [3.1M] E-GEOD-37614.processed.1.zip
├── [7.4M] E-GEOD-37614.raw.1.zip
├── [ 15K] E-GEOD-37614.sdrf.txt
├── [4.0K] processed
│  ├── [405K] GSM923165_sample_table.txt
│  ├── [405K] GSM923186_sample_table.txt
│  ├── [405K] GSM923193_sample_table.txt
│  ├── [405K] GSM923194_sample_table.txt
│  ├── [405K] GSM923195_sample_table.txt
│  ├── [405K] GSM923205_sample_table.txt
│  ├── [405K] GSM923206_sample_table.txt
│  ├── [405K] GSM923207_sample_table.txt
│  ├── [405K] GSM923208_sample_table.txt
│  ├── [405K] GSM923209_sample_table.txt
│  ├── [405K] GSM923210_sample_table.txt
│  ├── [405K] GSM923211_sample_table.txt
│  ├── [405K] GSM923236_sample_table.txt
│  ├── [405K] GSM923237_sample_table.txt
│  ├── [405K] GSM923238_sample_table.txt
│  ├── [405K] GSM923239_sample_table.txt
│  ├── [405K] GSM923240_sample_table.txt
│  ├── [405K] GSM923241_sample_table.txt
│  └── [405K] GSM923242_sample_table.txt
└── [4.0K] raw
├── [1009K] GSM923165_raw_TNBC.TB123.b1.txt
├── [1009K] GSM923186_raw_TNBC.TB125.b1.txt
├── [1.3M] GSM923193_raw_TNBC.TB134.txt
├── [1009K] GSM923194_raw_ER_.TB71.b1.txt
├── [1009K] GSM923195_raw_ER_.TB75.b1.txt
├── [1009K] GSM923205_raw_ER_.TB130.b1.txt
├── [1.3M] GSM923206_raw_ER_.TB98.txt
├── [1009K] GSM923207_raw_ER_.TB120.b1.txt
├── [1009K] GSM923208_raw_Her2_.TB76.b1.txt
├── [1.3M] GSM923209_raw_Her2_.TB117.txt
├── [1009K] GSM923210_raw_Her2_.TB136.b1.txt
├── [1.3M] GSM923211_raw_Her2_.TB122.txt
├── [1009K] GSM923236_raw_TNBC.TB160.b2.txt
├── [1009K] GSM923237_raw_TNBC.TB162.b2.txt
├── [1009K] GSM923238_raw_TNBC.TB164.b2.txt
├── [1009K] GSM923239_raw_ER_.TB163.b2.txt
├── [1009K] GSM923240_raw_ER_.TB165.b2.txt
├── [1009K] GSM923241_raw_Her2_.TB129.b2.txt
└── [1009K] GSM923242_raw_Her2_ER_.TB148.b2.txt
2 directories, 42 files
I don’t care much for processed data so let’s start with the raw data :)
Stage the SDRF file. We will use the column denoting
file names to read in the raw files.
SDRF <- read.delim("/data/projects/cmurray/E-GEOD-37614.sdrf.txt")
rownames(SDRF) <- SDRF$Array.Data.File
DT::datatable(SDRF, options = list(scrollX = TRUE, pageLength = 19, scroller = TRUE))These are not CEL files so we can’t use
oligo::read.celfiles(), just write a function to stick each
file together in a dataframe :)
gen_raw <- function(path, pattern){
files = list.files(path, pattern, full.names=T, recursive=T, include.dirs=T)
mat = as.data.frame(do.call(cbind, lapply(files, function(x) data.table::fread(x, stringsAsFactors=FALSE))))
ID_REF = mat[,1]
mat = as.data.frame(mat[,!grepl("ID_REF", colnames(mat))])
rownames(mat) = ID_REF
return(mat)
}
raw.dat <- gen_raw("/data/projects/cmurray/raw/", "\\.txt$")
dim(raw.dat)## [1] 47231 23
The column names are nonsensical. Next step is to use to metadata file to append the correct sample names to each column.
file_names <- list.files("/data/projects/cmurray/raw/", "\\.txt$", full.names=T, recursive=T, include.dirs=T)
file_names <- sub(".*/", "", file_names)
head(file_names)## [1] "GSM923165_raw_TNBC.TB123.b1.txt" "GSM923186_raw_TNBC.TB125.b1.txt"
## [3] "GSM923193_raw_TNBC.TB134.b1.txt" "GSM923193_raw_TNBC.TB134.b2.txt"
## [5] "GSM923194_raw_ER_.TB71.b1.txt" "GSM923195_raw_ER_.TB75.b1.txt"
sample_names <- sub("\\_.*", "", file_names)
head(sample_names)## [1] "GSM923165" "GSM923186" "GSM923193" "GSM923193" "GSM923194" "GSM923195"
length(sample_names) == ncol(raw.dat) #FALSE## [1] TRUE
There are more columns in raw.dat (23) than there are
sample names in the annotation file (19).
They did something shitty here (not sure how your GEO dataset handled this) - for all of the samples with a b1,b2 batch annotation, there are two columns for ‘signal’ in these files. I’m not 100% sure how to handle this yet. Take a look below to see what I mean:
pwd
/data/projects/cmurray/raw/
head -n 4 *.txt
==> GSM923165_raw_TNBC.TB123.b1.txt <==
ID_REF VALUE
ILMN_1653768 388.184
ILMN_1668377 353.682
ILMN_1824784 369.376
==> GSM923186_raw_TNBC.TB125.b1.txt <==
ID_REF VALUE
ILMN_1653768 336.9
ILMN_1668377 374.261
ILMN_1824784 371.367
==> GSM923193_raw_TNBC.TB134.txt <==
ID_REF VALUE VALUE2
ILMN_1653768 345.349 206.512
ILMN_1668377 344.038 248.29
ILMN_1824784 402.025 251.595
==> GSM923194_raw_ER_.TB71.b1.txt <==
ID_REF VALUE
ILMN_1653768 383.756
ILMN_1668377 341.217
ILMN_1824784 359.835
==> GSM923195_raw_ER_.TB75.b1.txt <==
ID_REF VALUE
ILMN_1653768 308.575
ILMN_1668377 378.362
ILMN_1824784 372.161
==> GSM923205_raw_ER_.TB130.b1.txt <==
ID_REF VALUE
ILMN_1653768 297.507
ILMN_1668377 340.106
ILMN_1824784 423.984
==> GSM923206_raw_ER_.TB98.txt <==
ID_REF VALUE VALUE2
ILMN_1653768 354.972 203.633
ILMN_1668377 371.799 191.744
ILMN_1824784 387.36 297.442You can see it in the file names too, any file that does not end with
.b1.txt/.b2.txt are ones with two batches present in the
file.
Thinking out loud we could split the files in bash and update the
sdrf file accordingly.
awk 'BEGIN{FS="\t"}; {if($4=="b1,b2") print $0}' E-GEOD-37614.sdrf.txt > b1.b2_batches.sdrf
wc -l b1.b2_batches.sdrf
4awk 'BEGIN{FS="\t"}; {print $29}' b1.b2_batches.sdrf > little_fuckers.txt
cat little_fuckers.txt
GSM923211_raw_Her2_.TB122.txt
GSM923209_raw_Her2_.TB117.txt
GSM923206_raw_ER_.TB98.txt
GSM923193_raw_TNBC.TB134.txtCode below takes the files which have 3 fields (NF = 3) and returns (NF=1, NF=2) for Batch 1, and (NF=1, NF=3) for batch 2.
cd raw/
while read -r line; do fn=$(basename $line .txt); awk '{print $(NF-2)" "$NF}' $line > ${fn}.b2.txt; awk '{print $(NF-2)" "$(NF-1)}' $line > ${fn}.b1.txt; done < ../little_fuckers.txtTake a look at the directory:
GSM923165_raw_TNBC.TB123.b1.txt GSM923206_raw_ER_.TB98.b2.txt GSM923211_raw_Her2_.TB122.b2.txt
GSM923186_raw_TNBC.TB125.b1.txt GSM923206_raw_ER_.TB98.txt GSM923211_raw_Her2_.TB122.txt
GSM923193_raw_TNBC.TB134.b1.txt GSM923207_raw_ER_.TB120.b1.txt GSM923236_raw_TNBC.TB160.b2.txt
GSM923193_raw_TNBC.TB134.b2.txt GSM923208_raw_Her2_.TB76.b1.txt GSM923237_raw_TNBC.TB162.b2.txt
GSM923193_raw_TNBC.TB134.txt GSM923209_raw_Her2_.TB117.b1.txt GSM923238_raw_TNBC.TB164.b2.txt
GSM923194_raw_ER_.TB71.b1.txt GSM923209_raw_Her2_.TB117.b2.txt GSM923239_raw_ER_.TB163.b2.txt
GSM923195_raw_ER_.TB75.b1.txt GSM923209_raw_Her2_.TB117.txt GSM923240_raw_ER_.TB165.b2.txt
GSM923205_raw_ER_.TB130.b1.txt GSM923210_raw_Her2_.TB136.b1.txt GSM923241_raw_Her2_.TB129.b2.txt
GSM923206_raw_ER_.TB98.b1.txt GSM923211_raw_Her2_.TB122.b1.txt GSM923242_raw_Her2_ER_.TB148.b2.txt
For example, GSM923193_raw_TNBC.TB134.txt now has a
batch 1 and batch 2 file. I assumed that VALUE is b1, and VALUE2 is
batch 2. This is actually a huge assumption on my behalf but sure look
we can see how it looks downstream - don’t over think it for now (me
totally over thinking it).
while read -r line; do rm $line; done < ../little_fuckers.txtWe need to update the sdrf file to reflect the new files
we made, and the ones we deleted.
Not a chance I’m able to code this dynamically so I just did it in
excel. I’ll send you the updated file
E-GEOD-37614.sdrf_fixed.txt :)
SDRF <- read.delim("/data/projects/cmurray/E-GEOD-37614.sdrf_fixed.txt")
rownames(SDRF) <- SDRF$Array.Data.File
DT::datatable(SDRF, options = list(scrollX = TRUE, pageLength = 23, scroller = TRUE))## same as first attempt tbh
gen_raw <- function(path, pattern){
files = list.files(path, pattern, full.names=T, recursive=T, include.dirs=T)
mat = as.data.frame(do.call(cbind, lapply(files, function(x) data.table::fread(x, stringsAsFactors=FALSE))))
ID_REF = mat[,1]
mat = as.data.frame(mat[,!grepl("ID_REF", colnames(mat))])
rownames(mat) = ID_REF
return(mat)
}
raw.dat <- gen_raw("/data/projects/cmurray/raw/", "\\.txt$")
## attach sample names to cols
file_names <- list.files("/data/projects/cmurray/raw/", "\\.txt$", full.names=T, recursive=T, include.dirs=T)
file_names <- sub(".*/", "", file_names)
sample_names <- sub("\\_.*", "", file_names)
length(sample_names) == ncol(raw.dat)## [1] TRUE
TRUE thank fuck.
Super super important that you match these correctly.
match() function for the win here.
# we need to append batch ID's to the samplenames - sorry but has to happen for unique colnames.
SDRF$samp_names <- paste(SDRF$Assay.Name, SDRF$Characteristics..batch., sep=":")
SDRF$samp_names## [1] "GSM923242:b2" "GSM923241:b2" "GSM923240:b2" "GSM923239:b2" "GSM923238:b2"
## [6] "GSM923237:b2" "GSM923236:b2" "GSM923211:b1" "GSM923211:b2" "GSM923210:b1"
## [11] "GSM923209:b1" "GSM923209:b2" "GSM923208:b1" "GSM923207:b1" "GSM923206:b1"
## [16] "GSM923206:b2" "GSM923205:b1" "GSM923195:b1" "GSM923194:b1" "GSM923193:b1"
## [21] "GSM923193:b2" "GSM923186:b1" "GSM923165:b1"
# below is most important part to make sure columns match file contents
SDRF <- SDRF[match(file_names, SDRF$Array.Data.File),]
colnames(raw.dat) <- SDRF$samp_names
## the below is correct, check it against the head -n 4 *.txt file command above
head(raw.dat[1:5, 1:5])## GSM923165:b1 GSM923186:b1 GSM923193:b1 GSM923193:b2 GSM923194:b1
## ILMN_1653768 388.184 336.900 345.349 206.512 383.756
## ILMN_1668377 353.682 374.261 344.038 248.290 341.217
## ILMN_1824784 369.376 371.367 402.025 251.595 359.835
## ILMN_2066534 417.759 317.597 427.092 273.403 423.359
## ILMN_1731640 375.803 418.458 404.662 341.895 315.882
finally done pissing around with formatting - fuck those guys.
Fairly sure that ExpressionSet objects are S4 objects (ew) but the essentially hold ‘slots’ for data. no reason we cant make our own one.
SDRF <- Biobase::AnnotatedDataFrame(SDRF)
rownames(SDRF) <- colnames(raw.dat)
my_set <- Biobase::ExpressionSet(assayData = as.matrix(raw.dat), phenoData = SDRF)
my_set## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 47231 features, 23 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM923165:b1 GSM923186:b1 ... GSM923242:b2 (23 total)
## varLabels: Source.Name Comment..Sample_source_name. ... samp_names
## (41 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
its missing a few slots but has everything we need to do stuff.
Check sample distributions, fix via normalization, filter if we can.
Ok we have raw data so I fully expect the boxplot to look shit
sml <- as.character(as.numeric(as.factor(SDRF@data$Characteristics..class.)))
gs <- factor(sml)
groups <- SDRF@data$Characteristics..class. # HER2+ER+ sample?
stupid_orientation <- unique(groups)
stupid_orientation <- c(stupid_orientation[2], stupid_orientation[3], stupid_orientation[4], stupid_orientation[1])
levels(gs) <- stupid_orientation
my_set@phenoData$group <- gs
ord <- order(gs)
palette <- palette(c("#1B9E77", "#7570B3", "#E7298A", "#A6761D"))
par(mar=c(7,4,2,1))
title <- "GSE37614 Log2 Raw Signals"
boxplot(log2(exprs(my_set[,ord])), boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
legend("topleft", stupid_orientation, fill=palette(), bty="n")shit indeed!
Usually do this with a dedicated package for microarrays (unfortunately not compatible with our object) but again no reason we cant do it from scratch.
quantile_normalisation <- function(df){
df_rank <- apply(df,2,rank,ties.method="min")
df_sorted <- data.frame(apply(df, 2, sort))
df_mean <- apply(df_sorted, 1, mean)
index_to_mean <- function(my_index, my_mean){
return(my_mean[my_index])
}
df_final <- apply(df_rank, 2, index_to_mean, my_mean=df_mean)
rownames(df_final) <- rownames(df)
return(df_final)
}
qnorm <- quantile_normalisation(raw.dat)par(mar=c(7,4,2,1))
title <- "GSE37614 Log2 Quantile Norm McDonald"
boxplot(log2(qnorm[,ord]), boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
legend("topleft", stupid_orientation, fill=palette(), bty="n")nice B-)
we can bin the raw data and replace it with the quantile normalized data.
exprs(my_set) <- qnorm
## check that actually saved
par(mar=c(7,4,2,1))
title <- "GSE37614 Log2 Quantile Norm McDonald"
boxplot(log2(exprs(my_set[,ord])), boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
legend("topleft", stupid_orientation, fill=palette(), bty="n")plot the same as the last one when we switch in
log2(exprs(my_set[,ord])) so yep it did save to the eset
obj properly.
Sanity check here to see if our normlized expression distribution looks like the one you produced with the preprocessed GEO data. I’m happy enough that it follows the same dist. The only difference is your plots y axis max goes to 0.0006 - mine goes to 0.008.
ex <- exprs(my_set)
par(mar=c(4,4,2,1))
title <- paste ("Normalized Expression Density", sep ="")
limma::plotDensities(ex, group=gs, main=title, legend ="topright")Little peak on the left we want to take care of.
# Probe filtering
medians <- rowMedians(ex)
hist(log2(medians + 1), 150, col = "cornsilk1", freq = FALSE,
main = "Histogram of the median intensities",
border = "antiquewhite4", xlab = "Median intensities")Couldn’t get the min threshold to work for some reason.
probes <- rownames(my_set)
library(illuminaHumanv4.db)## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
##
## first, second
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
##
## shift
## Loading required package: org.Hs.eg.db
##
##
annotation <- AnnotationDbi::select(illuminaHumanv4.db,
keys = probes,
columns = c("SYMBOL", "GENENAME"),
keytype = "PROBEID")## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
annotation <- subset(annotation, !is.na(SYMBOL))
## resolve multi maps.
library(dplyr)##
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ann_grouped <- group_by(annotation, PROBEID)
ann_sum <- dplyr::summarize(ann_grouped, no_of_matches = n_distinct(SYMBOL))
ann_flt <- filter(ann_sum, no_of_matches > 1)
remove_id <- (probes %in% ann_flt$PROBEID)
table(remove_id)## remove_id
## FALSE TRUE
## 45526 1705
slightly concerning here, bad that we have 45526 genes in our dataset. needs to come down to ~15000-19000
I’ll try to do this manually because I’m really hampered by not having a legitimate expressionset object.
annotations <- read.table("/data/projects/cmurray/A-GEOD-10558.adf.txt", header=T, sep="\t")
mart <- biomaRt::useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
attr <- biomaRt::listAttributes(mart)
genbank_map <- biomaRt::getBM(attributes = c("refseq_mrna", "hgnc_symbol", "gene_biotype"), mart = mart, useCache = FALSE)
genbank_map <- genbank_map[!(is.na(genbank_map$refseq_mrna) | genbank_map$refseq_mrna==""),]
genbank_map <- subset(genbank_map, genbank_map$gene_biotype == "protein_coding")
# pop off the version ID
annotations$genbank <- gsub("\\.\\d+$", "", annotations$genbank)
# merge
master <- merge(genbank_map, annotations, by.x="refseq_mrna", by.y="genbank")now we have a master df with probe IDs and the gene symbols for protein coding genes. I feel a bit better now about performing the subset.
Tbh fuck the expression set, I dont think we will need it going forward.
master_sub <- master[,c(4,2)]
master_sub <- master_sub[!(is.na(master_sub$hgnc_symbol) | master_sub$hgnc_symbol==""),]
# remove multi mapping probes
master_sub <- master_sub %>% distinct(hgnc_symbol, .keep_all = T)
dim(master_sub)## [1] 17953 2
thank christ, that took me ages ngl. i feel much better with this amount of probes that have 1-1 mapping with a gene
qnorm <- subset(qnorm, rownames(qnorm) %in% master_sub$PROBE)
dim(qnorm)## [1] 17943 23
fix the discrepancy in dimensions so we can use gene names going forward.
master_sub <- master_sub[match(rownames(qnorm), master_sub$PROBE),]
rownames(qnorm) <- master_sub$hgnc_symbol
head(qnorm[1:3,1:3])## GSM923165:b1 GSM923186:b1 GSM923193:b1
## TMEM74 363.8960 281.6070 362.3042
## TMSB15B 318.9158 388.6855 337.8624
## FSTL5 197.0318 203.0733 169.8640
medians <- rowMedians(qnorm)
hist(log2(medians + 1), 150, col = "cornsilk1", freq = FALSE,
main = "Histogram of the median intensities",
border = "antiquewhite4", xlab = "Median intensities")still need to get rid of that bastarding peak - cant have a bimodal distribution.
medians <- log2(rowMedians(qnorm) + 1)
qnorm <- as.data.frame(qnorm)
qnorm$medians <- medians
qnorm <- subset(qnorm, qnorm$medians > 7.9)
qnorm <- as.matrix(qnorm[,1:(ncol(qnorm)-1)])
hist(log2(qnorm + 1), 150, col = "cornsilk1", freq = FALSE,
main = "Histogram of the median intensities",
border = "antiquewhite4", xlab = "Median intensities")finally
ok ok ok ok time to check out these batch effects - the last plots i showed you did not have filtering I just went straight in
You can see two groups forming here so yeah, batch at large
library(ggpubr)
library(ggplot2)
PCA <- prcomp(t(log2(qnorm)), scale = TRUE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
sd_ratio <- sqrt(percentVar[2] / percentVar[1])
dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
Subtype = my_set$group)
ggscatter(dataGG, x="PC1", y="PC2",
color = "Subtype", palette = c("dodgerblue4", "darkorange2", "chartreuse", "purple"),
title = "PCA plot log-transformed quantile normalized expression data",
subtitle = "Cancer Subtypes",
xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
ellipse = FALSE, star.plot = FALSE, # HER2+ER+ sample n=1 - star, ellipse breaks
ggtheme = theme_bw()) +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
Batch = my_set$Characteristics..batch.)
ggscatter(dataGG, x="PC1", y="PC2",
color = "Batch", palette = c("dodgerblue4", "darkorange2", "chartreuse", "purple"),
title = "PCA plot log-transformed quantile normalized expression data",
subtitle = "Batch Effects (Batch 1, Batch 2)",
xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
ellipse = TRUE, star.plot = TRUE,
ggtheme = theme_bw()) +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
boom - money shot for the ol thesis writeup
Just to prove the same point again, lets use pearsons correlation between the Principal components and the Batch variable and Cancer subtype variable
meta <- my_set@phenoData@data
meta <- meta[,c(4, 5)]
colnames(meta) <- c("Batch", "Cancer_Subtype")
pca <- PCAtools::pca(log2(qnorm + 1), metadata = meta)
PCAtools::eigencorplot(pca, components = PCAtools::getComponents(pca, 1:10),
metavars = c('Batch','Cancer_Subtype'),
col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
cexCorval = 1.2, fontCorval = 2, posLab = 'all', rotLabX = 45,
scale = TRUE,
main = bquote(PC ~ Pearson ~ r^2 ~ clinical ~ correlates),
plotRsquared = TRUE,
corFUN = 'pearson',
corUSE = 'pairwise.complete.obs',
corMultipleTestCorrection = 'BH',
signifSymbols = c('****', '***', '**', '*', ''),
signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))## Warning in PCAtools::eigencorplot(pca, components =
## PCAtools::getComponents(pca, : Batch is not numeric - please check the source
## data as non-numeric variables will be coerced to numeric
## Warning in PCAtools::eigencorplot(pca, components =
## PCAtools::getComponents(pca, : Cancer_Subtype is not numeric - please check the
## source data as non-numeric variables will be coerced to numeric
## eigencorplot shows this is piss poor, batch totally dominates the dataset for now. code is straight forward thankfully
design <- model.matrix(~0 + groups)
rm_batch <- limma::removeBatchEffect(log2(qnorm + 1), batch = my_set@phenoData@data$Characteristics..batch., design = design)PCA <- prcomp(t(log2(rm_batch)), scale = TRUE)
percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1)
sd_ratio <- sqrt(percentVar[2] / percentVar[1])
dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
Subtype = my_set$group)
ggscatter(dataGG, x="PC1", y="PC2",
color = "Subtype", palette = c("dodgerblue4", "darkorange2", "chartreuse", "purple"),
title = "PCA plot log-transformed batch corrected data",
subtitle = "Cancer Subtypes",
xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
ellipse = F, star.plot = F,
ggtheme = theme_bw()) +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
Batch = my_set$Characteristics..batch.)
ggscatter(dataGG, x="PC1", y="PC2",
color = "Batch", palette = c("dodgerblue4", "darkorange2", "chartreuse", "purple"),
title = "PCA plot log-transformed batch corrected data",
subtitle = "Batch Effects: Batch1, Batch2",
xlab = paste0("PC1, VarExp: ", percentVar[1], "%"),
ylab = paste0("PC2, VarExp: ", percentVar[2], "%"),
ellipse = T, star.plot = T,
ggtheme = theme_bw()) +
theme(legend.position = "right") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))# fuck yesQuickly check a diff exp test for sanity
colnames(design)[1:4] <- levels(gs)
fit <- limma::lmFit(qnorm, design)
fit <- limma::eBayes(fit, trend=T, robust=T)
res <- limma::decideTests(fit)
summary(res)## ER+ Her2+ Her2+ER+ TNBC
## Down 0 0 0 0
## NotSig 48 106 858 78
## Up 14846 14788 14036 14816
? i dont know enough about limma tbh - seems very high
cts <- paste(unique(groups), c(tail(groups, -1), head(groups, 1)), sep="-")
#cont.matrix <- limma::makeContrasts(contrasts=cts, levels=design)Error in limma::makeContrasts(contrasts = cts, levels = design) : The levels must by syntactically valid names in R, see help(make.names). Non-valid names: ER+,Her2+,Her2+ER+
complains about the +
colnames(design) <- gsub("\\+", "", colnames(design))
cts <- gsub("\\+", "", cts)
cont.matrix <- limma::makeContrasts(contrasts=cts, levels=design)
fit2 <- limma::contrasts.fit(fit, cont.matrix)## Warning in limma::contrasts.fit(fit, cont.matrix): row names of contrasts don't
## match col names of coefficients
fit2 <- limma::eBayes(fit2, 0.01)## Warning in .ebayes(fit = fit, proportion = proportion, stdev.coef.lim =
## stdev.coef.lim, : Estimation of var.prior failed - set to default value
#Create the top table for the comparison (coef) we want
top <- limma::topTable(fit2, coef=2, adjust="fdr", sort.by="B", number = Inf)
nrow(top)## [1] 14894
#Top Table of significant genes only
up_reg <- subset(top, logFC > 0.25 & P.Value < 0.05)
nrow(up_reg)## [1] 335
up_reg_names <- rownames(up_reg)
e <- qnorm
sig_exprs <- e[rownames(e) %in% up_reg_names,]
sig_exprs <- t(sig_exprs)
sig_exprs <- scale(sig_exprs, center=T, scale = T)
sig_exprs <- t(sig_exprs)
annot_col <- data.frame(row.names = colnames(qnorm), Group = factor(my_set$group))
cols <- c("red", "blue", "green", "black")
names(cols) <- unique(groups)
annot_colors <- list(Group = cols)
library(viridis)## Loading required package: viridisLite
pheatmap::pheatmap(sig_exprs, cluster_cols = T, cluster_rows = T, show_rownames = F,
show_colnames = T, annotation_col = annot_col,
annotation_colors = annot_colors,
col = hcl.colors(100, "Purple-Green",rev=F),
cutree_cols = 4)I was generating rubbish because I was not specifying the coefficient for topTable correctly. make sure you know exactly the contrasts you are extracting with the coef= parameter (Im useless at limma soz cant help)