ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT
Load required packages
library("tidyverse")
library("GenomeInfoDb")
library("deconstructSigs")
library("BSgenome.Hsapiens.UCSC.hg19")
library("RColorBrewer")
library("nnls")
library("SignatureEstimation")
library("reshape2")
library("siglasso")
library("parallel")
library("grid")
library("gridExtra")
Create mutation signature functions
run_deconstructSigs = function(x, out.dir = ".", signatures.ref = signatures.exome.cosmic.v3.may2019, all.colors = NULL, sig.type = 'SBS', bsg = NULL, ...){
# x = data.frame with columns CHROM, POS, REF, ALT and, optionally, Sample
# signatures.ref = one of signatures.nature2013, signatures.exome.cosmic.v3.may2019,
# signatures.genome.cosmic.v3.may2019, signatures.dbs.cosmic.v3.may2019,
# signatures.cosmic
# ... = arguments to pass to whichSignatures()
if(! "Sample" %in% colnames(x)){
x$Sample = "Sample"
samples = "Sample"
} else {
samples = unique(x$Sample)
}
# Same colours as before but different signature names
if(is.null(all.colors[1])){
all.colors <- c("#023FA5", "#023FA5", "#7D87B9", "#BEC1D4",
"#D6BCC0", "#BB7784", "gold", "#4A6FE3", "#8595E1", "#B5BBE3",
"#E6AFB9", "#E07B91", "#D33F6A", "#11C638", "#8DD593",
"#C6DEC7", "#EAD3C6", "#F0B98D", "#EF9708", "#0FCFC0",
"#9CDED6", "#D5EAE7", "#F3E1EB", "#F6C4E1", "#F79CD4",
"#866097", "#008941", "#A30059", "#F6C4E1", "#F79CD4",
"#866097", "#008941", "#A30059", "#008080", "#8B0000",
"#F4A460", "#663399")#, "#706563")
}
old.sigs <- c("Signature.1", "Signature.1A", "Signature.1B",
"Signature.2", "Signature.3", "Signature.4", "Signature.5",
"Signature.6", "Signature.7", "Signature.8", "Signature.9",
"Signature.10", "Signature.11", "Signature.12", "Signature.13",
"Signature.14", "Signature.15", "Signature.16", "Signature.17",
"Signature.18", "Signature.19", "Signature.20", "Signature.21",
"Signature.R1", "Signature.R2", "Signature.R3", "Signature.U1",
"Signature.U2", "Signature.22", "Signature.23", "Signature.24",
"Signature.25", "Signature.26", "Signature.27", "Signature.28",
"Signature.29", "Signature.30", "unknown")
sigs.input = mut.to.sigs.input(mut.ref = x,
sample.id = "Sample",
chr = "CHROM",
pos = "POS",
ref = "REF",
alt = "ALT",
sig.type = sig.type,
bsg = bsg)
for(sample in samples){
tryCatch(expr = {
cat(paste0("Processing ", sample, "...\n"))
if(length(samples) == 1 & sample == "Sample"){
results.dir = out.dir
} else {
results.dir = file.path(out.dir, sample)
}
prefix = paste(sample,deparse(substitute(signatures.ref)), sep="_")
if(!dir.exists(results.dir)){dir.create(results.dir, recursive = T)}
error = file(file.path(results.dir, paste0(prefix, "_error.txt")), open="wt")
sink(error, type="message")
sigs = whichSignatures(tumor.ref = sigs.input,
signatures.ref = signatures.ref,
sample.id = sample,
contexts.needed = TRUE,
...)
#pdf(file.path(results.dir, paste0(prefix, "_plots.pdf")))
# Plotting
png(file.path(results.dir, paste0(prefix, "_signature.png")),
width=1200, height=1000, units="px", pointsize=20)
plotSignatures(sigs)
dev.off()
new.sigs = setdiff(colnames(sigs$weights)[sigs$weights[1,] > 0], old.sigs)
png(file.path(results.dir, paste0(prefix, "_pie.png")),
width=960, height=960, units="px", pointsize=24)
if(length(new.sigs) > 0){
makePie(sigs, add.color=unique(all.colors)[1:length(new.sigs)])
} else {
makePie(sigs)
}
dev.off()
# Save output
write.table(x %>% dplyr::filter(Sample == sample) %>%
dplyr::filter(!duplicated(paste(CHROM, POS, REF, ALT))),
file = file.path(results.dir, paste0(prefix, "_unique_mutations.txt")),
sep="\t", quote=F, row.names=F, col.names=T)
write.table(data.frame(Signature = colnames(sigs$weights),
Percent = round(unlist(sigs$weights)*100, digits=2)),
file = file.path(results.dir, paste0(prefix, "_percentages.txt")),
quote=F, sep="\t", col.names=T, row.names=F)
write.table(sigs.input[sample,],
file = file.path(results.dir, paste0(prefix, "_trinucleotides.txt")),
row.names=F, col.names=T, sep="\t", quote=F)
}, finally = {
cat(paste0("Finished ", sample, "\n"))
})
}
cat("Finished!\n")
}
id.signature.path = "sigProfiler_ID_signatures.csv"
signatures.id.cosmic.v3.may2019 = as.data.frame(t(read.delim(id.signature.path,
sep=",", header=T, as.is=T, row.names=1)))
# Normalises variants to SBS, DBS and indels only (removes substitutions >= 3 BP in length)
norm_sbs_to_dbs = function(x, chr="CHROM", pos="POS",
ref="REF", alt="ALT", sample = "Sample"){
# colnames CHROM, POS, REF, ALT, (optional) Sample
if(! sample %in% colnames(x)){
x[,sample] = "Sample"
rm.sample = TRUE # do not return sample name, assume all variants same sample
} else {
rm.sample = FALSE
}
orig.cols = c(sample, chr, pos, ref, alt)
x = x[,orig.cols]
cols.use = c("Sample", "CHROM", "POS", "REF", "ALT")
colnames(x) = cols.use
x = x %>% arrange(Sample, CHROM, POS) %>%
filter(!duplicated(paste(Sample, CHROM, POS, REF, ALT))) %>%
mutate(Already.DBS = nchar(REF) == 2 & nchar(ALT) == 2)
indels = x %>% filter(nchar(REF) != nchar(ALT))
substitutions = x %>% filter((nchar(REF) == 1 & nchar(ALT) == 1) | Already.DBS)
dbs = substitutions %>% filter(nchar(REF) == 1 & nchar(ALT) == 1) %>%
group_by(Sample, CHROM) %>%
mutate(diff1 = abs(c(head(POS, -1) - tail(POS, -1), 0)), diff2 = abs(c(0, tail(POS, -1) - head(POS, -1)))) %>%
filter((diff1 == 1 | diff2 == 1) & ! (diff1 == 1 & diff2 == 1)) %>% # remove anything > 2 substitutions as probable indel or bad mapping
mutate(diff1 = abs(c(head(POS, -1) - tail(POS, -1), 0)), diff2 = abs(c(0, tail(POS, -1) - head(POS, -1)))) %>%
filter(diff1 == 1 | diff2 == 1) %>%
summarise(POS=min(POS), REF=paste0(REF, collapse=""), ALT = paste0(ALT, collapse="")) %>% ungroup()
sbs = substitutions %>% filter(nchar(REF) == 1 & nchar(ALT) == 1) %>%
group_by(Sample, CHROM) %>%
mutate(diff1 = abs(c(head(POS, -1) - tail(POS, -1), 0)), diff2 = abs(c(0, tail(POS, -1) - head(POS, -1)))) %>%
filter(diff1 != 1 & diff2 != 1) %>% ungroup()# remove anything with adjacent variants
if(sum(substitutions$Already.DBS) > 0){
dbs = rbind(dbs, substitutions[which(substitutions$Already.DBS),cols.use])
}
res = NULL
if(nrow(indels) > 0){
res = rbind(res, indels[,cols.use])
}
if(nrow(dbs) > 0){
res = rbind(res, dbs[,cols.use])
}
if(nrow(sbs) > 0){
res = rbind(res, sbs[,cols.use])
}
res = as.data.frame(res) %>% arrange(Sample, CHROM, POS)
res = res[,cols.use]
colnames(res) = orig.cols
return(res)
}
# Assigns InDel mutation type (see cosmic indel signatures)
calc.id.type = function(r,a,c){
ri = (nchar(c) %/% 2)
if(nchar(a) == 1){
# deletion
id.length = nchar(r) - nchar(a)
if(substr(c, ri, ri+id.length) != r){
return(NA)
}
id.type = "del"
id.seq = substr(r, 2, nchar(r))
if(substr(r,2,2) %in% c("A", "G")){
c = rev.strand(c)
id.seq = rev.strand(id.seq)
ri = ri - (id.length - 1)
while(substr(c, ri - id.length + 1, ri) == id.seq){
# left normalise reverse strand
ri = ri - id.length
}
#r = substr(c, ri, ri + id.length)
}
} else {
id.length = nchar(a) - nchar(r)
id.type = "ins"
id.seq = substr(a,2, nchar(a))
if(substr(id.seq,1,1) %in% c("A", "G")){
c = rev.strand(c)
ri = ri + 1
id.seq = rev.strand(id.seq)
}
}
while(substr(c, ri - id.length + 1, ri) == id.seq){
# left normalise
ri = ri - id.length
}
if(ri < 1){
ri = ri + id.length
}
n.reps = 0
while(substr(c, ri+1+id.length*(n.reps), ri+id.length*(n.reps+1)) == id.seq){
n.reps = n.reps + 1
}
if(id.type == "del"){
n.reps = n.reps - 1 # remove del itself
if(id.length == 1){
if(substr(id.seq, 1, 1) == "C"){
sig.index = 1 + min(n.reps, 5)
} else {
sig.index = 7 + min(n.reps, 5)
}
} else {
if(n.reps == 0){
# check microhomology
n.hom = 0
# forward
while(substr(c, ri+1+id.length, ri+1+id.length+n.hom) == substr(id.seq, 1, 1+n.hom)){
n.hom = n.hom + 1
}
# reverse
n.hom.r = 0
while(substr(c, ri - n.hom.r, ri) == substr(id.seq, id.length - n.hom.r, id.length)){
n.hom.r = n.hom.r + 1
}
n.hom = max(n.hom, n.hom.r)
if(n.hom == 0){
sig.index = 25 + min(n.reps, 5) + 6*(min(5, id.length)-2)
} else {
sig.index = 73 + min(n.hom, 5) - 1 + sum(0:(min(id.length, 5) - 2))
}
} else {
sig.index = 25 + min(n.reps, 5) + 6*(min(5, id.length)-2)
}
}
} else {
if(id.length == 1){
if(substr(id.seq, 1, 1) == "C"){
sig.index = 13 + min(n.reps, 5)
} else {
sig.index = 19 + min(n.reps, 5)
}
} else {
sig.index = 49 + min(n.reps, 5) + 6*(min(5, id.length) - 2)
}
}
return(sig.index)
}
rev.strand = function(x){
rev.bases = c("A" = "T", "T" = "A", "G" = "C", "C" = "G", "N" = "N")
y = rev(rev.bases[unlist(strsplit(x, ""))])
if(length(x) == 1){return(paste(y, collapse=""))} else {return(y)}
}
# Creates input mutation matrices for each mutation type
mut.to.sigs.input.alltypes = function(mut.ref, sample.id = "Sample", chr = "chr", pos = "pos",
ref = "ref", alt = "alt", bsg = NULL, sig.types = c("SBS", "DBS", "ID"), dbs_table = dbs_possible) {
results = lapply(sig.types, function(sig.type){
if(sig.type == "ID"){
# InDel stuff here
samples = factor(mut.ref[,sample.id])
mut.ref = mut.ref[nchar(mut.ref[,ref]) != 1 | nchar(mut.ref[,alt]) != 1,]
res = matrix(0, nrow = length(levels(samples)), ncol=83)
rownames(res) = levels(samples)
if (exists("mut.ref", mode = "list")) {
mut.full <- mut.ref
} else {
if (file.exists(mut.ref)) {
mut.full <- utils::read.table(mut.ref, sep = "\t",
header = TRUE, as.is = FALSE, check.names = FALSE)
}
else {
stop("mut.ref is neither a file nor a loaded data frame")
}
}
if(nrow(mut.full) == 0){
return(res)
}
mut <- mut.full[, c(sample.id, chr, pos, ref, alt)]
mut[, chr] <- factor(mut[, chr])
levels(mut[, chr]) <- sub("^([0-9XY])", "chr\\1", levels(mut[,
chr]))
levels(mut[, chr]) <- sub("^MT", "chrM", levels(mut[,
chr]))
levels(mut[, chr]) <- sub("^(GL[0-9]+).[0-9]", "chrUn_\\L\\1",
levels(mut[, chr]), perl = T)
if (is.null(bsg)) {
bsg = BSgenome.Hsapiens.UCSC.hg19::Hsapiens
}
unknown.regions <- levels(mut[, chr])[which(!(levels(mut[,chr]) %in% GenomeInfoDb::seqnames(bsg)))]
if (length(unknown.regions) > 0) {
unknown.regions <- paste(unknown.regions, collapse = ", ")
warning(paste("Check chr names -- not all match bsg object:\n",
unknown.regions, sep = " "))
mut <- mut[mut[, chr] %in% GenomeInfoDb::seqnames(bsg),
]
}
mut$context = BSgenome::getSeq(bsg,
mut[, chr], mut[, pos] - 6*pmax(1, nchar(mut[,ref]) - 1, nchar(mut[,alt]) - 1) + 1,
mut[, pos] + 6*pmax(1, nchar(mut[,ref]) - 1, nchar(mut[,alt]) - 1) + 1, as.character = T)
for(i in 1:nrow(mut)){
s = mut[i,sample.id]
j = match(s, levels(samples))
p = mut[i,pos]
r = mut[i,ref]
a = mut[i,alt]
#c = strsplit(mut[i,"context"], "")[[1]]
c = mut[i,"context"]
n = calc.id.type(r, a, c)
if(is.na(n)){
cat(paste0("Variant ", paste(mut[i,chr], p, r, a, sep="-"), " does not match reference! It will be ignored.\n"))
} else {
res[j,n] = res[j,n] + 1
}
}
return(res)
} else {
return(mut.to.sigs.input(mut.ref, sample.id, chr, pos, ref, alt, bsg, sig.type, dbs_table))
}
})
results = setNames(results, sig.types)
return(results)
}
# Non-negative least squares method for SignatureEstimation
decomposeNNLS = function(m, P, ...){
exposures = coef(nnls(P, m))
exposures = exposures/sum(exposures)
return(exposures)
}
# DeconstructSigs wrapper for SignatureEstimation
decomposeDeSig = function(m, P, ...){
m = as.data.frame(t(m))
rownames(m) = "Sample"
P = as.data.frame(t(P))
colnames(P) = colnames(m)
y = whichSignatures(tumor.ref = m, signatures.ref = P)
exposures = as.numeric(y$weights)
#exposures = exposures/sum(exposures)
return(exposures)
}
# SigLASSO wrapper for SignatureEstimation
decomposeSiglasso = function(m, P, ...){
counts = list(...)[["counts"]]
if(!is.null(counts)){
m = m*counts
}
m = matrix(m, ncol=1)
success = FALSE # randomly fails sometimes
while(!success){
tryCatch(expr = {
y = siglasso(sample_spectrum = m, signature = P, plot=F)
success = TRUE
}, error = function(e){},
finally = {
next
})
}
exposures = y[,1]
#exposures = exposures/sum(exposures)
return(exposures)
}
# Multiprocess version of SignatureEstimation::bootstrapSigExposures
bootstrapSigExposuresMP = function (m, P, R, mutation.count = NULL, decomposition.method = decomposeQP, num.threads = 8,
...) {
P = as.matrix(P)
if (length(m) != nrow(P))
stop("Length of vector 'm' and number of rows of matrix 'P' must be the same.")
if (any(names(m) != rownames(P)))
stop("Elements of vector 'm' and rows of matrix 'P' must have the same names (mutations types).")
if (ncol(P) == 1)
stop("Matrices 'P' must have at least 2 columns (signatures).")
if (is.null(mutation.count)) {
if (all(SignatureEstimation:::is.wholenumber(m)))
mutation.count = sum(m)
else stop("Please specify the parameter 'mutation.count' in the function call or provide mutation counts in parameter 'm'.")
}
m = m/sum(m)
K = length(m)
replicateMP = function(n, expr, num.threads = 8){
as.matrix(do.call(cbind, mclapply(integer(n), eval.parent(substitute(function(...) expr)),
mc.cores = num.threads)))
}
extra.args = list(...)
exposures = replicateMP(R, {
mutations_sampled = sample(seq(K), mutation.count, replace = TRUE,
prob = m)
m_sampled = as.numeric(table(factor(mutations_sampled,
levels = seq(K))))
m_sampled = m_sampled/sum(m_sampled)
do.call(decomposition.method, c(list(m=m_sampled, P=P), extra.args))
}, num.threads = num.threads)
rownames(exposures) = colnames(P)
colnames(exposures) = paste0("Replicate_", seq(R))
errors = apply(exposures, 2, function(e) SignatureEstimation:::FrobeniusNorm(m,
P, e))
names(errors) = colnames(exposures)
return(list(exposures = exposures, errors = errors))
}
# Specify COSMICv2 & 3 signature sets (incl.OvCa-specific COSMICv3 signatures)
all.signatures = list("DBS" = list("DBS.COSMIC.v3.may2019" = signatures.dbs.cosmic.v3.may2019,
"DBS.COSMIC.v3.may2019.HGSOvCa" = signatures.dbs.cosmic.v3.may2019[c(2,4,5,6,9),]),
"SBS" = list("SBS.COSMIC" = signatures.cosmic,
"SBS.COSMIC.exome.v3.may2019" = signatures.exome.cosmic.v3.may2019,
"SBS.COSMIC.genome.v3.may2019" = signatures.genome.cosmic.v3.may2019,
"SBS.COSMIC.exome.v3.may2019.HGSOvCa" = signatures.exome.cosmic.v3.may2019[c(1,2,3,5,11,17,23,31,40,44,45,46),],
"SBS.COSMIC.genome.v3.may2019.HGSOvCa" = signatures.genome.cosmic.v3.may2019[c(1,2,3,5,11,17,23,31,40,44,45,46),]),
"ID" = list("ID.COSMIC.v3.may2019" = signatures.id.cosmic.v3.may2019,
"ID.COSMIC.v3.may2019.HGSOvCa" = signatures.id.cosmic.v3.may2019[c(1,2,4,5,6,8,9),]))
SBS.COSMIC.exome.v3.may2019.HGSOvCa = signatures.exome.cosmic.v3.may2019[c(1,2,3,5,11,17,23,31,40,44,45,46),]
# Main function
# data = data.frame with sample, chr, pos, ref, alt columns specified
# sig.types = mutation types to test
# signature.list = list[sig.types] of signature matrices
# bootstrap.reps = number of bootstrap resampling repetitions for calculating variance
# plot.file = output file for plots (pdf)
# bsg = BSGenome object (default is hg19) for fetching reference contexts
# num.threads = number of processes for bootstrap parallelisation
# method.colours = colours for signature methods in plot, default is ggplot colours
# Returns list with signature scores and errors (and extra table for bootstraps) plus input matrices
run_sigs = function(data, sample = "Sample",
chr="CHROM", pos="POS", ref="REF", alt="ALT",
sig.types = c("SBS", "DBS", "ID"),
signature.list = all.signatures,
decomposition.methods = c("QP", "SA", "DeconstructSigs",
"SigLASSO", "NNLS"),
bootstrap.reps = 0, bsg=NULL,
plot=TRUE, plot.file = NULL,
num.threads = 8,
method.colours = NULL
){
all.decomposition.methods = list("QP" = decomposeQP,
"SA" = decomposeSA,
"DeconstructSigs" = decomposeDeSig,
"SigLASSO" = decomposeSiglasso,
"NNLS" = decomposeNNLS)
decomposition.methods = all.decomposition.methods[decomposition.methods]
if(!sample %in% colnames(data)){data[,sample] = "_"}
data = norm_sbs_to_dbs(data, chr=chr, pos=pos, ref=ref, alt=alt, sample = sample)
data[,"Sample"] = data[,sample]
sample = "Sample" # deconstructSigs mut.to.sigs.input doesn't work properly for DBS with different sample.id column for some reason
input.matrices = mut.to.sigs.input.alltypes(mut.ref = data,
chr = chr, pos=pos, ref = ref, alt=alt,
sample.id = sample, bsg=bsg,
sig.types = sig.types, dbs_table = dbs_possible)
results = list(input.matrices = input.matrices)
samples = unique(data[,sample])
for(s in samples){
print(s)
for(sig.type in sig.types){
sigs.input = input.matrices[[sig.type]]
if(is.null(sigs.input)){next}
if(!s %in% rownames(sigs.input)){next}
sigs.input = sigs.input[s,,drop=F]
if(nrow(sigs.input) > 0){
if(sum(sigs.input) < 1){
next
}
signatures = signature.list[[sig.type]]
m = as.matrix(t(sigs.input))
for(signature.name in names(signatures)){
signature = signatures[[signature.name]]
P = as.matrix(t(signature))
result.list = lapply(1:length(decomposition.methods), function(i){
decomposition.method = decomposition.methods[[i]]
decomposition.method.name = names(decomposition.methods)[i]
cat(paste0("Method: ", decomposition.method.name,
", Signature: ", signature.name,
", Mutation type: ", sig.type,
"\n"))
if(decomposition.method.name == "SigLASSO"){
if(sig.type != "SBS"){return(NULL)}
counts = sum(m)
y = findSigExposures(m, P,
decomposition.method = decomposition.method,
counts = counts)
} else {
y = findSigExposures(m, P,
decomposition.method = decomposition.method)
}
exposures = data.frame(Signature = rownames(y$exposures),
Exposure = y$exposures[,1],
Method = decomposition.method.name,
Signature.Set = signature.name,
stringsAsFactors = F)
exposures$Method = decomposition.method.name
exposures$Signature.Set = signature.name
exposures$Sample = s
errors = data.frame(Error = y$errors,
Method = decomposition.method.name,
Signature.Set = signature.name,
Sample = s,
stringsAsFactors = F)
return(list(exposures = exposures,
errors = errors))
})
exposures = do.call(rbind, lapply(result.list, function(x){
x[["exposures"]]
}))
errors = do.call(rbind, lapply(result.list, function(x){
x[["errors"]]
}))
results$exposures = rbind(results$exposures, exposures)
results$errors = rbind(results$errors, errors)
# estimate errors by bootstrapping
if(bootstrap.reps > 1){
cat("Bootstrapping...\n")
result.list = lapply(1:length(decomposition.methods), function(i){
decomposition.method = decomposition.methods[[i]]
decomposition.method.name = names(decomposition.methods)[i]
cat(paste0("Method: ", decomposition.method.name,
", Signature: ", signature.name,
", Mutation type: ", sig.type,
"\n"))
if(decomposition.method.name == "SigLASSO"){
if(sig.type != "SBS"){return(NULL)}
counts = sum(m)
y = bootstrapSigExposuresMP(m, P, R=bootstrap.reps,
decomposition.method = decomposition.method,
num.threads = num.threads,
counts=counts)
} else {
y = bootstrapSigExposuresMP(m, P, R=bootstrap.reps,
decomposition.method = decomposition.method,
num.threads = num.threads)
}
exposures = melt(data.frame(Signature = rownames(y$exposures),
y$exposures), value.name = "Exposure")
exposures$Method = decomposition.method.name
exposures$Signature.Set = signature.name
exposures$Sample = s
colnames(exposures)[2] = "Replicate"
errors = data.frame(Replicate = names(y$errors),
Error = y$errors,
Method = decomposition.method.name,
Signature.Set = signature.name,
Sample = s,
stringsAsFactors = F)
return(list(exposures = exposures,
errors = errors))
})
exposures = do.call(rbind, lapply(result.list, function(x){
x[["exposures"]]
}))
errors = do.call(rbind, lapply(result.list, function(x){
x[["errors"]]
}))
results$exposures.bootstrap = rbind(results$exposures.bootstrap, exposures)
results$errors.bootstrap = rbind(results$errors.bootstrap, errors)
}
}
}
}
}
if(plot){
plot_results(results, plot.file, method.colours)
}
return(results)
}
# Plot results
plot_results = function(results, plot.file=NULL, method.colours = NULL, width=16, height=9){
if(!is.null(plot.file)){
pdf(plot.file, width=width, height=height)
}
int_breaks_rounded <- function(x, n = 5) pretty(x, n)[round(pretty(x, n),1) %% 1 == 0] # for integer y axis breaks
for(s in unique(results$exposures$Sample)){
for(sig.type in names(results$input.matrices)){
if(s %in% rownames(results$input.matrices[[sig.type]])){
sigs.input = results$input.matrices[[sig.type]][s,]
theme_common = theme_bw() + theme(panel.grid.major.x = element_blank(),
plot.margin = unit(c(5,1,5,1), "lines"),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.ticks = element_blank())
if(sig.type == "SBS"){
cols = c("#5ABCEB", "#050708", "#D33C32", "#CBCACB", "#ABCD72", "#E7C9C6")
dat = data.frame(x = as.character(1:length(sigs.input)),
y = as.numeric(sigs.input),
top.cols = gsub(".*\\[|\\].*$", "", colnames(sigs.input)),
x.labels = gsub("\\[|>..", "", colnames(sigs.input)),
stringsAsFactors = F)
for(col in c("x", "top.cols")){
dat[,col] = factor(dat[,col], levels = unique(dat[,col]))
}
p = ggplot(dat, aes(x=x,y=y,fill=top.cols)) + geom_col(show.legend = F) + scale_fill_manual(values=cols) + xlab("") + ylab("Mutation Count")
r = c(0, max(dat$y)*1.05)
expand.factor = 0.5
p = p + scale_x_discrete(labels = dat$x.labels) + scale_y_continuous(expand = expand_scale(c(-expand.factor/(1+2*expand.factor), -expand.factor/(1+2*expand.factor))), limits = c(-expand.factor*r[2], r[2]*(1+expand.factor)), breaks=int_breaks_rounded)
p = p + theme_common
top.segs = dat %>% group_by(top.cols) %>% summarise(xstart = as.numeric(head(x,1)) - 0.25, xend=as.numeric(tail(x,1)) + 0.25,y=max(dat$y)*1.10, text.x = x[ceiling(length(x)/2)])
p = p + geom_segment(data = top.segs, mapping =aes(x=xstart,xend=xend, y=y,yend=y, size=2, col=top.cols), show.legend = F) + scale_colour_manual(values=cols)
p = p + geom_text(data = top.segs, mapping = aes(x=text.x, y= max(dat$y)*1.15, label=top.cols))
p = p + coord_cartesian(clip="off")
p = p + ggtitle(paste0(s, " - SBS mutations\n\n\n"))
print(p)
} else if(sig.type == "DBS"){
cols = c("#a6cee3", "#1f78b4",
"#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f",
"#ff7f00", "#cab2d6", "#6a3d9a")
dat = data.frame(x = as.character(1:length(sigs.input)),
y = as.numeric(sigs.input),
top.cols = paste0(gsub(">.*$", "", colnames(sigs.input)), ">NN"),
x.labels = gsub("^.*>", "", colnames(sigs.input)),
stringsAsFactors = F)
for(col in c("x", "top.cols")){
dat[,col] = factor(dat[,col], levels = unique(dat[,col]))
}
p = ggplot(dat, aes(x=x,y=y,fill=top.cols)) + geom_col(show.legend = F) + scale_fill_manual(values=cols) + xlab("") + ylab("Mutation Count")
r = c(0, max(dat$y)*1.05)
expand.factor = 0.5
p = p + scale_x_discrete(labels = dat$x.labels) + scale_y_continuous(expand = expand_scale(c(-expand.factor/(1+2*expand.factor), -expand.factor/(1+2*expand.factor))), limits = c(-expand.factor*r[2], r[2]*(1+expand.factor)), breaks=int_breaks_rounded)
p = p + theme_common
top.segs = dat %>% group_by(top.cols) %>% summarise(xstart = as.numeric(head(x,1)) - 0.25, xend=as.numeric(tail(x,1)) + 0.25, y=max(dat$y)*1.10, text.x = x[ceiling(length(x)/2)])
p = p + geom_segment(data = top.segs, mapping =aes(x=xstart,xend=xend, y=y,yend=y, size=2, col=top.cols), show.legend = F) + scale_colour_manual(values=cols)
p = p + geom_text(data = top.segs, mapping = aes(x=text.x, y= max(dat$y)*1.15, label=top.cols))
p = p + coord_cartesian(clip="off")
p = p + ggtitle(paste0(s, " - DBS mutations\n\n\n"))
print(p)
} else {
cols = c("#fdbf6f", "#ff7f00", "#b2df8a", "#33a02c",
colorRampPalette(colors = c("#fb9a99", "#e31a1c"))(4),
colorRampPalette(colors = c("#a6cee3", "#1f78b4"))(4),
colorRampPalette(colors = c("#cab2d6", "#6a3d9a"))(4))
dat = data.frame(x = as.character(1:length(sigs.input)),
y = as.numeric(sigs.input),
top.text1 = c(rep("1bp deletion", 12),
rep("1bp insertion", 12),
rep(">1bp deletions at repeats\n(Deletion length)", 24),
rep(">1bp insertions at repeats\n(Insertion length)", 24),
rep("Deletions with microhomology\n(Deletion length)", 11)),
top.text2 = c(rep("C", 6),
rep("T", 6),
rep("C", 6),
rep("T", 6),
rep("2", 6),
rep("3", 6),
rep("4", 6),
rep("5+", 6),
rep("2", 6),
rep("3", 6),
rep("4", 6),
rep("5+", 6),
rep("2", 1),
rep("3", 2),
rep("4", 3),
rep("5+", 5)),
x.labels = "",
bottom.text1 = c(rep(c("1", "2", "3", "4", "5", "6+"), 2),
rep(c("0", "1", "2", "3", "4", "5+"), 2),
rep(c("1", "2", "3", "4", "5", "6+"), 4),
rep(c("0", "1", "2", "3", "4", "5+"), 4),
c(1, 1:2, 1:3, 1:4, "5+")),
bottom.text2 = c(rep("Homopolymer length", 12),
rep("Homopolymer length", 12),
rep("Number of repeat units", 24),
rep("Number of repeat units", 24),
rep("Microhomology length", 11)),
stringsAsFactors = F)
dat$top.cols = paste(dat$top.text1, dat$top.text2)
for(col in c("x", "top.cols")){
dat[,col] = factor(dat[,col], levels = unique(dat[,col]))
}
top.segs = dat %>% group_by(top.cols) %>% summarise(xstart = as.numeric(head(x,1)) - 0.25, xend=as.numeric(tail(x,1)) + 0.25,
y=max(dat$y)*1.10, y2=-(0.05*max(dat$y)),
text.x = x[ceiling(length(x)/2)], text=top.text2[1], text.col = ifelse(text %in% c("T", "5+"), "white", "black"))
p = ggplot() + geom_col(data=dat, mapping = aes(x=x,y=y,fill=top.cols), show.legend = FALSE) + scale_fill_manual(values=cols) + xlab("") + ylab("Mutation Count")
r = c(0, max(dat$y)*1.05)
expand.factor = 0.5
p = p + scale_x_discrete(labels = dat$x.labels) + scale_y_continuous(expand = expand_scale(c(-expand.factor/(1+2*expand.factor), -expand.factor/(1+2*expand.factor))), limits = c(-expand.factor*r[2], r[2]*(1+expand.factor)), breaks=int_breaks_rounded)
p = p + theme_common
p = p + geom_segment(data = top.segs, mapping =aes(x=xstart,xend=xend, y=y,yend=y, size=2, col=top.cols), show.legend = FALSE)
p = p + geom_segment(data = top.segs, mapping =aes(x=xstart,xend=xend, y=y2,yend=y2, size=2, col=top.cols), show.legend = FALSE) + scale_colour_manual(values=cols)
p = p + geom_text(data = top.segs, mapping = aes(x=text.x, y=y, label=text), col=top.segs$text.col,hjust=0.5, nudge_x = ifelse((top.segs$xend - top.segs$xstart - 0.5) %% 2, 0.5, 0))
p = p + geom_text(data = dat, mapping = aes(x=x, y=-(0.1*max(dat$y)), label=bottom.text1), hjust=0.5)
annot2 = dat %>% group_by(bottom.text2, top.text1) %>% summarise(y=-0.15*max(dat$y), y2=1.17*max(dat$y), x=x[ceiling(length(x)/2)]) %>% ungroup()
p = p + geom_text(data = annot2, mapping=aes(x=x, y=y, label=bottom.text2))
p = p + geom_text(data = annot2, mapping=aes(x=x, y=y2, label=top.text1))
p = p + coord_cartesian(clip="off")
p = p + ggtitle(paste0(s, " - InDel mutations\n\n\n\n"))
print(p)
}
}
}
for(sig.set in unique(results$exposures$Signature.Set)){
all.sig.names = unique(results$exposures$Signature)
x = results$exposures %>%
filter(Signature.Set == sig.set,
Sample == s) %>%
filter(Signature %in% Signature[Exposure > 0]) %>%
mutate(Signature = factor(Signature, levels = intersect(all.sig.names, Signature)))
if("exposures.bootstrap" %in% names(results)){
# boxplot instead
y = results$exposures.bootstrap %>%
filter(Signature.Set == sig.set,
Sample == s) %>%
mutate(Signature = factor(Signature, levels=intersect(all.sig.names, Signature)))
x = x %>% mutate(Signature = factor(as.character(Signature), levels=levels(y$Signature)))
p = ggplot(y, aes(x=Signature, y=Exposure, fill=Method)) + stat_boxplot(geom='errorbar') + geom_boxplot(outlier.shape=NA) +
geom_boxplot(aes(color = Method),
fatten = NULL, fill = NA, coef = 0, outlier.shape=NA,
show.legend = F) +
scale_y_continuous(expand = expand_scale(c(0, 0.05)))
p = p + geom_point(data = x,
mapping = aes(x = Signature, y = Exposure, fill = Method, group = Method),
shape=23, position=position_dodge(width=0.75), show.legend = F) + guides(col=FALSE)
} else {
p = ggplot(x, aes(x=Signature, y=Exposure, fill=Method)) +
geom_col(position="dodge") +
scale_y_continuous(expand = expand_scale(c(0, 0.05)))
}
p = p + facet_grid(. ~ Signature, scales = "free_x")
p = p + theme_classic() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
theme(strip.background = element_blank(), strip.text.x = element_blank()) +
xlab("") + ggtitle(paste0(ifelse(s != "_", paste0(s, " - "), ""), gsub("\\.", " ", sig.set)))
if(!is.null(method.colours)){
p = p + scale_fill_manual(name="Method", values = method.colours) + scale_colour_manual(name="Method", values=method.colours)
}
print(p)
if("errors.bootstrap" %in% names(results)){
p = results$errors.bootstrap %>% filter(Signature.Set == sig.set, Sample ==s) %>%
ggplot(aes(x=Method, y=Error, fill=Method)) +
stat_boxplot(geom="errorbar") + geom_boxplot(show.legend = F) + theme_classic()
} else {
p = results$errors %>% filter(Signature.Set == sig.set, Sample==s) %>%
ggplot(aes(x=Method, y=Error, col=Method)) + geom_point(show.legend = F) + theme_classic()
}
if(!is.null(method.colours)){
p = p + scale_fill_manual(name="Method", values = method.colours) + scale_colour_manual(name="Method", values=method.colours)
}
print(p + ylab("Decomposition Error"))
}
}
if(!is.null(plot.file)){
dev.off()
}
}
plot_pies = function(results, signature.sets = NULL, samples = NULL, method="DeconstructSigs", plot.file=NULL,
cols = c("#023FA5", "#023FA5", "#7D87B9", "#BEC1D4",
"#D6BCC0", "#BB7784", "gold", "#4A6FE3", "#8595E1", "#B5BBE3",
"#E6AFB9", "#E07B91", "#D33F6A", "#11C638", "#8DD593",
"#C6DEC7", "#EAD3C6", "#F0B98D", "#EF9708", "#0FCFC0",
"#9CDED6", "#D5EAE7", "#F3E1EB", "#F6C4E1", "#F79CD4",
"#866097", "#008941", "#A30059", "#F6C4E1", "#F79CD4",
"#866097", "#008941", "#A30059", "#008080", "#8B0000",
"#F4A460", "#663399")){
blank_theme <- theme_minimal()+
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.border = element_blank(),
panel.grid=element_blank(),
axis.ticks = element_blank(),
plot.title=element_text(size=14, face="bold"),
axis.text.x = element_blank()
)
if(is.null(signature.sets[1])){
signature.sets = unique(results$exposures$Signature.Set)
}
if(is.null(samples)){
samples = unique(results$exposures$Sample)
}
if(!method %in% results$exposures$Method){
warning(paste0("Method: '", method, "' is not in these results"))
return(NULL)
}
if(!is.null(plot.file)){
pdf(plot.file, height=5, width=5*length(signature.sets))
}
for(sample in samples){
ps = lapply(signature.sets, function(signature.set){
signatures = unique(results$exposures %>% filter(Signature.Set == signature.set) %>% pull(Signature))
if(length(signatures) > length(cols)){
signature.cols = setNames(colorRampPalette(colors = cols)(length(signatures)), signatures)
} else {
signature.cols = setNames(cols[1:length(signatures)], signatures)
}
p = results$exposures %>% filter(Method == method, Sample == sample, Signature.Set == signature.set) %>%
filter(Exposure > 0) %>% ggplot(aes(x="", y=Exposure, fill=Signature)) + geom_bar(stat="identity", width=1) +
scale_fill_manual(values=signature.cols, name="") +
coord_polar("y", start=0) + blank_theme +
ggtitle(signature.set)
return(p)
})
ps[["nrow"]] = 1
do.call(grid.arrange, ps)
}
if(!is.null(plot.file)){dev.off()}
}
Import individual tumour exome somatic variants file and filter as before (see https://rpubs.com/deepsubs/thesis_tumour_exome_variants)
max.callers = 3
f <- read.table("tumour_sample_germline_exome_somatic_caller_file.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="", quote="") %>%
dplyr::rename("Tumour_Sample"="X.Tumour_Sample") %>%
mutate(n.callers = setNames(sapply(strsplit(unique(Identified), "-"),
function(callers){
ifelse("Intersection" %in% callers, max.callers, sum(!grepl("^filter", callers)))
}), unique(Identified))[Identified])
g <- read.delim("tumour_sample_germline_paired_exomes_HAP.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>%
dplyr::rename("Tumour_Sample"="X.Tumour_Sample")
sort(unique(f$Tumour_Sample))
sort(unique(g$Tumour_Sample))
ViP_list <- read.delim("ViP Samples & Candidate Genes.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>%
arrange(Sample)
tumour_type <- select(ViP_list,Sample,Tumour.Type) %>% distinct() %>% rename("Sample"="Normal_Sample")
f$Normal_Sample <- as.character(f$Normal_Sample)
g$Normal_Sample <- as.character(g$Normal_Sample)
f <- left_join(f,tumour_type,by="Normal_Sample",copy=FALSE)
g0 <- left_join(g,tumour_type,by="Normal_Sample",copy=FALSE) %>% filter(CANONICAL%in%"YES")
gene_list <- select(ViP_list,Sample,SYMBOL) %>% filter(Sample%in%(f$Normal_Sample))
genes <- gene_list$SYMBOL
tumour_purity <- read.delim("Tumour Purity.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>%
select(Tumour_Sample,Purity,No_Mutations)
f <- left_join(f,tumour_purity,by="Tumour_Sample",copy=FALSE)
g0 <- left_join(g0,tumour_purity,by="Tumour_Sample",copy=FALSE)
f$QUAL <- as.numeric(f$QUAL) %>% replace_na(0)
f$TUMOUR.PMCAD <- as.numeric(f$TUMOUR.PMCAD) %>% replace_na(0)
f$TUMOUR.PMCDP <- as.numeric(f$TUMOUR.PMCDP) %>% replace_na(0)
f$TUMOUR.PMCFREQ <- as.numeric(f$TUMOUR.PMCFREQ) %>% replace_na(0)
f$NORMAL.PMCAD <- as.numeric(f$NORMAL.PMCAD) %>% replace_na(0)
f$NORMAL.PMCDP <- as.numeric(f$NORMAL.PMCDP) %>% replace_na(0)
f$NORMAL.PMCFREQ <- as.numeric(f$NORMAL.PMCFREQ) %>% replace_na(0)
f$GnomAD_v3_AF <- as.numeric(f$GnomAD_v3_AF) %>% replace_na(0)
f0 <- filter(f,!Identified%in%c("FilteredInAll")) %>%
filter(n.callers>=2) %>%
filter(Consequence_Rank<7)
f1<-filter(f0,(NORMAL.PMCAD <= 2) & (NORMAL.PMCDP >= 10) & (NORMAL.PMCFREQ < 0.01))
f2<-filter(f1,TUMOUR.PMCAD >= 5)
f3<-filter(f2,TUMOUR.PMCDP>=20)
f4<-filter(f3,TUMOUR.PMCFREQ >= (f3$Purity*0.5*(2/3)))
f5<-filter(f4,GnomAD_v2.1_non_cancer_AF <= 0.0001)
f6<-filter(f5,GnomAD_v3_AF <= 0.0001)
samples<-unique(f$Tumour_Sample)
Extract remaining unique variants and columns used for mutation signature analysis, and rename sample column
f7<-select(f6,c(CHROM,POS,REF,ALT,Variant_Type))
unique(f7$Variant_Type)
f7$Sample<-f6$Tumour_Sample
Run mutation signature functions (DeconstructSigs, SigLASSO, SA, QP, NNLS, SignatureEstimation), including count of number of variants used and filtering metrics, and output results
mut.ref<-f7
mut.ref$CHROM<-unlist(lapply(mut.ref$CHROM,function(x) paste("chr",x,sep="")))
mut.ref_edit<-mut.ref[,c("Sample","CHROM","POS","REF","ALT")]
for (sample in samples){
setwd("Tumour Mutation Signatures")
mut.ref2<-mut.ref_edit[mut.ref_edit$Sample==sample,]
mut.ref2<-unique(mut.ref2)
mutNum <- nrow(mut.ref2)
options(scipen = 999)
s <- paste("The number of variants in sample", sample, "used for mutation signature analysis is", mutNum)
t <- paste("The minimum TUMOUR.PMCAD in sample", sample, "used for filtering is", min(unique(f2$TUMOUR.PMCAD)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f6$TUMOUR.PMCAD)))
u <- paste("The minimum TUMOUR.PMCDP in sample", sample, "used for filtering is", min(unique(f3$TUMOUR.PMCDP)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f6$TUMOUR.PMCDP)))
v <- paste("The minimum TUMOUR.PMCFREQ in sample", sample, "used for filtering is", min(unique(f4$TUMOUR.PMCFREQ)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f6$TUMOUR.PMCFREQ)))
w <- paste("The maximuum GnomAD_v2.1_non_cancer_AF in sample", sample, "used for filtering is", max(unique(f5$GnomAD_v2.1_non_cancer_AF)), "and the maximum figure for the variants used for mutation signature analysis is", max(unique(f6$GnomAD_v2.1_non_cancer_AF)))
x <- paste("The maximum GnomAD_v3_AF in sample", sample, "used for filtering and for the variants used for mutation signature analysis is", max(unique(f6$GnomAD_v3_AF)))
parameters <- print(c(s,t,u,v,w,x))
options(scipen = 0)
sigs.input = run_deconstructSigs(mut.ref2)
sigs.input = run_deconstructSigs(mut.ref2,
signatures.ref = SBS.COSMIC.exome.v3.may2019.HGSOvCa)
sigs.input = run_deconstructSigs(mut.ref2,
signatures.ref = signatures.cosmic)
setwd(sample)
mut.ref3 = mut.ref2[,c(3:5)]
mut.ref.chr = tibble(gsub("[chr]","", mut.ref2$CHROM))
bind_cols(mut.ref.chr,mut.ref3) %>% write_tsv(path=paste(sample,"unique_mutations_forSignal.tsv",sep="_"),col_names=FALSE)
results = run_sigs(data=mut.ref2, bootstrap.reps = 100, plot = TRUE, plot.file = paste(sample,"_signatures.pdf",sep=""))
write.table(results$exposures, file=paste(sample,"signatures_exposures.tsv",sep="_"), sep='\t', col.names=T, row.names=F, quote=F)
write.table(results$errors, file=paste(sample,"signatures_errors.tsv",sep="_"), sep='\t', col.names=T, row.names=F, quote=F)
write.table(results$exposures.bootstrap, file=paste(sample,"signatures_exposures_bootstraps.tsv",sep="_"), sep='\t', col.names=T, row.names=F, quote=F)
write.table(results$errors.bootstrap, file=paste(sample,"signatures_errors.bootstaps.tsv",sep="_"), sep='\t', col.names=T, row.names=F, quote=F)
write.table(parameters,file=paste(sample,"_parameters.txt",sep=""), quote=F,row.names = F,sep="\t")
}
---
title: "Thesis Tumour Exome Mutational Signature Analysis Script"
output: html_notebook
---

ALL IMPORTED FILES MUST BE IN THE SAME DIRECTORY AS THIS SCRIPT

Load required packages
```{r}
library("tidyverse")
library("GenomeInfoDb")
library("deconstructSigs")
library("BSgenome.Hsapiens.UCSC.hg19")
library("RColorBrewer")
library("nnls")
library("SignatureEstimation")
library("reshape2")
library("siglasso")
library("parallel")
library("grid")
library("gridExtra")
```

Create mutation signature functions
```{r}
run_deconstructSigs = function(x, out.dir = ".", signatures.ref = signatures.exome.cosmic.v3.may2019, all.colors = NULL, sig.type = 'SBS', bsg = NULL, ...){
  
  # x = data.frame with columns CHROM, POS, REF, ALT and, optionally, Sample
  # signatures.ref = one of signatures.nature2013, signatures.exome.cosmic.v3.may2019,
  #                  signatures.genome.cosmic.v3.may2019, signatures.dbs.cosmic.v3.may2019,
  #                  signatures.cosmic
  # ... = arguments to pass to whichSignatures()
  if(! "Sample" %in% colnames(x)){
    x$Sample = "Sample"
    samples = "Sample"
  } else {
    samples = unique(x$Sample)
  }
  
  # Same colours as before but different signature names
  if(is.null(all.colors[1])){
    all.colors <- c("#023FA5", "#023FA5", "#7D87B9", "#BEC1D4", 
                    "#D6BCC0", "#BB7784", "gold", "#4A6FE3", "#8595E1", "#B5BBE3", 
                    "#E6AFB9", "#E07B91", "#D33F6A", "#11C638", "#8DD593", 
                    "#C6DEC7", "#EAD3C6", "#F0B98D", "#EF9708", "#0FCFC0", 
                    "#9CDED6", "#D5EAE7", "#F3E1EB", "#F6C4E1", "#F79CD4", 
                    "#866097", "#008941", "#A30059", "#F6C4E1", "#F79CD4", 
                    "#866097", "#008941", "#A30059", "#008080", "#8B0000", 
                    "#F4A460", "#663399")#, "#706563")
  }
  
  old.sigs <- c("Signature.1", "Signature.1A", "Signature.1B", 
                "Signature.2", "Signature.3", "Signature.4", "Signature.5", 
                "Signature.6", "Signature.7", "Signature.8", "Signature.9", 
                "Signature.10", "Signature.11", "Signature.12", "Signature.13", 
                "Signature.14", "Signature.15", "Signature.16", "Signature.17", 
                "Signature.18", "Signature.19", "Signature.20", "Signature.21", 
                "Signature.R1", "Signature.R2", "Signature.R3", "Signature.U1", 
                "Signature.U2", "Signature.22", "Signature.23", "Signature.24", 
                "Signature.25", "Signature.26", "Signature.27", "Signature.28", 
                "Signature.29", "Signature.30", "unknown")
  
  sigs.input = mut.to.sigs.input(mut.ref = x,
                                 sample.id = "Sample",
                                 chr = "CHROM",
                                 pos = "POS",
                                 ref = "REF",
                                 alt = "ALT",
                                 sig.type = sig.type,
                                 bsg = bsg)
  
  
  
  
  for(sample in samples){
    tryCatch(expr = {
      cat(paste0("Processing ", sample, "...\n"))
      if(length(samples) == 1 & sample == "Sample"){
        results.dir = out.dir
      } else {
        results.dir = file.path(out.dir, sample)
      }
      
      prefix = paste(sample,deparse(substitute(signatures.ref)), sep="_")
      
      if(!dir.exists(results.dir)){dir.create(results.dir, recursive = T)}
      error = file(file.path(results.dir, paste0(prefix, "_error.txt")), open="wt")
      sink(error, type="message")
      
      sigs = whichSignatures(tumor.ref = sigs.input,
                             signatures.ref = signatures.ref,
                             sample.id = sample,
                             contexts.needed = TRUE,
                             ...)
      
      #pdf(file.path(results.dir, paste0(prefix, "_plots.pdf")))
      
      # Plotting
      png(file.path(results.dir, paste0(prefix, "_signature.png")),
          width=1200, height=1000, units="px", pointsize=20)
      plotSignatures(sigs)
      dev.off()
      
      new.sigs = setdiff(colnames(sigs$weights)[sigs$weights[1,] > 0], old.sigs)
      png(file.path(results.dir, paste0(prefix, "_pie.png")),
          width=960, height=960, units="px", pointsize=24)
      if(length(new.sigs) > 0){
        makePie(sigs, add.color=unique(all.colors)[1:length(new.sigs)])
      } else {
        makePie(sigs)
      }
      dev.off()
      
      # Save output
      write.table(x %>% dplyr::filter(Sample == sample) %>%
                    dplyr::filter(!duplicated(paste(CHROM, POS, REF, ALT))),
                  file = file.path(results.dir, paste0(prefix, "_unique_mutations.txt")),
                  sep="\t", quote=F, row.names=F, col.names=T)
      
      
      write.table(data.frame(Signature = colnames(sigs$weights),
                             Percent = round(unlist(sigs$weights)*100, digits=2)),
                  file = file.path(results.dir, paste0(prefix, "_percentages.txt")),
                  quote=F, sep="\t", col.names=T, row.names=F)
      
      write.table(sigs.input[sample,],
                  file = file.path(results.dir, paste0(prefix, "_trinucleotides.txt")),
                  row.names=F, col.names=T, sep="\t", quote=F)
      
    }, finally = {
      cat(paste0("Finished ", sample, "\n"))
    })
    
  }
  cat("Finished!\n")
}

id.signature.path = "sigProfiler_ID_signatures.csv"

signatures.id.cosmic.v3.may2019 = as.data.frame(t(read.delim(id.signature.path,
                                                             sep=",", header=T, as.is=T, row.names=1)))


# Normalises variants to SBS, DBS and indels only (removes substitutions >= 3 BP in length)
norm_sbs_to_dbs = function(x, chr="CHROM", pos="POS",
                           ref="REF", alt="ALT", sample = "Sample"){
  # colnames CHROM, POS, REF, ALT, (optional) Sample
  
  if(! sample %in% colnames(x)){
    x[,sample] = "Sample"
    rm.sample = TRUE # do not return sample name, assume all variants same sample
  } else {
    rm.sample = FALSE
  }
  
  orig.cols = c(sample, chr, pos, ref, alt)
  x = x[,orig.cols]
  cols.use = c("Sample", "CHROM", "POS", "REF", "ALT")
  colnames(x) = cols.use
  
  x = x %>% arrange(Sample, CHROM, POS) %>%
    filter(!duplicated(paste(Sample, CHROM, POS, REF, ALT))) %>%
    mutate(Already.DBS = nchar(REF) == 2 & nchar(ALT) == 2)
  
  
  indels = x %>% filter(nchar(REF) != nchar(ALT))
  substitutions = x %>% filter((nchar(REF) == 1 & nchar(ALT) == 1) | Already.DBS)
  
  dbs = substitutions %>% filter(nchar(REF) == 1 & nchar(ALT) == 1) %>%
    group_by(Sample, CHROM) %>%
    mutate(diff1 = abs(c(head(POS, -1) - tail(POS, -1), 0)), diff2 = abs(c(0, tail(POS, -1) - head(POS, -1)))) %>%
    filter((diff1 == 1 | diff2 == 1) & ! (diff1 == 1 & diff2 == 1)) %>% # remove anything > 2 substitutions as probable indel or bad mapping
    mutate(diff1 = abs(c(head(POS, -1) - tail(POS, -1), 0)), diff2 = abs(c(0, tail(POS, -1) - head(POS, -1)))) %>%
    filter(diff1 == 1 | diff2 == 1) %>%
    summarise(POS=min(POS), REF=paste0(REF, collapse=""), ALT = paste0(ALT, collapse="")) %>% ungroup()
  
  sbs = substitutions %>% filter(nchar(REF) == 1 & nchar(ALT) == 1) %>%
    group_by(Sample, CHROM) %>%
    mutate(diff1 = abs(c(head(POS, -1) - tail(POS, -1), 0)), diff2 = abs(c(0, tail(POS, -1) - head(POS, -1)))) %>%
    filter(diff1 != 1 & diff2 != 1)  %>% ungroup()# remove anything with adjacent variants
  
  if(sum(substitutions$Already.DBS) > 0){
    dbs = rbind(dbs, substitutions[which(substitutions$Already.DBS),cols.use])
  }
  
  
  
  res = NULL
  if(nrow(indels) > 0){
    res = rbind(res, indels[,cols.use])
  }
  if(nrow(dbs) > 0){
    res = rbind(res, dbs[,cols.use])
  }
  if(nrow(sbs) > 0){
    res = rbind(res, sbs[,cols.use])
  }
  res = as.data.frame(res) %>% arrange(Sample, CHROM, POS)
  res = res[,cols.use]
  colnames(res) = orig.cols
  return(res)
  
}

# Assigns InDel mutation type (see cosmic indel signatures)
calc.id.type = function(r,a,c){
  ri = (nchar(c) %/% 2)
  if(nchar(a) == 1){
    # deletion
    id.length = nchar(r) - nchar(a)
    if(substr(c, ri, ri+id.length) != r){
      return(NA)
    }
    
    id.type = "del"
    id.seq = substr(r, 2, nchar(r))
    if(substr(r,2,2) %in% c("A", "G")){
      c = rev.strand(c)
      id.seq = rev.strand(id.seq)
      ri = ri - (id.length - 1)
      while(substr(c, ri - id.length + 1, ri) == id.seq){
        # left normalise reverse strand
        ri = ri - id.length
      }
      #r = substr(c, ri, ri + id.length)
    }
    
    
    
  } else {
    id.length = nchar(a) - nchar(r)
    id.type = "ins"
    id.seq = substr(a,2, nchar(a))
    if(substr(id.seq,1,1) %in% c("A", "G")){
      c = rev.strand(c)
      ri = ri + 1
      id.seq = rev.strand(id.seq)
      
    }
  }
  while(substr(c, ri - id.length + 1, ri) == id.seq){
    # left normalise
    ri = ri - id.length
  }
  if(ri < 1){
    ri = ri + id.length
  }
  
  
  
  n.reps = 0
  while(substr(c, ri+1+id.length*(n.reps), ri+id.length*(n.reps+1)) == id.seq){
    n.reps = n.reps + 1
  }
  
  if(id.type == "del"){
    n.reps = n.reps - 1 # remove del itself
    if(id.length == 1){
      if(substr(id.seq, 1, 1) == "C"){
        sig.index = 1 + min(n.reps, 5)
      } else {
        sig.index = 7 + min(n.reps, 5)
      }
    } else {
      if(n.reps == 0){
        # check microhomology
        n.hom = 0
        # forward
        while(substr(c, ri+1+id.length, ri+1+id.length+n.hom) == substr(id.seq, 1, 1+n.hom)){
          n.hom = n.hom + 1
        }
        # reverse
        n.hom.r = 0
        while(substr(c, ri - n.hom.r, ri) == substr(id.seq, id.length - n.hom.r, id.length)){
          n.hom.r = n.hom.r + 1
        }
        n.hom = max(n.hom, n.hom.r)
        
        if(n.hom == 0){
          sig.index = 25 + min(n.reps, 5) + 6*(min(5, id.length)-2)
        } else {
          sig.index = 73 + min(n.hom, 5) - 1 + sum(0:(min(id.length, 5) - 2))
        }
      } else {
        sig.index = 25 + min(n.reps, 5) + 6*(min(5, id.length)-2)
      }
    }
  } else {
    if(id.length == 1){
      if(substr(id.seq, 1, 1) == "C"){
        sig.index = 13 + min(n.reps, 5)
      } else {
        sig.index = 19 + min(n.reps, 5)
      }
    } else {
      sig.index = 49 + min(n.reps, 5) + 6*(min(5, id.length) - 2)
    }
  }
  
  return(sig.index)
}


rev.strand = function(x){
  rev.bases = c("A" = "T", "T" = "A", "G" = "C", "C" = "G", "N" = "N")
  y = rev(rev.bases[unlist(strsplit(x, ""))])
  if(length(x) == 1){return(paste(y, collapse=""))} else {return(y)}
}

# Creates input mutation matrices for each mutation type
mut.to.sigs.input.alltypes = function(mut.ref, sample.id = "Sample", chr = "chr", pos = "pos", 
                      ref = "ref", alt = "alt", bsg = NULL, sig.types = c("SBS", "DBS", "ID"), dbs_table = dbs_possible) {
  results = lapply(sig.types, function(sig.type){
    if(sig.type == "ID"){
      # InDel stuff here
      samples = factor(mut.ref[,sample.id])
      mut.ref = mut.ref[nchar(mut.ref[,ref]) != 1 | nchar(mut.ref[,alt]) != 1,]
      
      
      
      res = matrix(0, nrow = length(levels(samples)), ncol=83)
      rownames(res) = levels(samples)
      
      
      if (exists("mut.ref", mode = "list")) {
        mut.full <- mut.ref
      } else {
        if (file.exists(mut.ref)) {
          mut.full <- utils::read.table(mut.ref, sep = "\t", 
                                        header = TRUE, as.is = FALSE, check.names = FALSE)
        }
        else {
          stop("mut.ref is neither a file nor a loaded data frame")
        }
      }
      
      if(nrow(mut.full) == 0){
        return(res)
      }
      
      mut <- mut.full[, c(sample.id, chr, pos, ref, alt)]
      mut[, chr] <- factor(mut[, chr])
      levels(mut[, chr]) <- sub("^([0-9XY])", "chr\\1", levels(mut[, 
                                                                   chr]))
      levels(mut[, chr]) <- sub("^MT", "chrM", levels(mut[, 
                                                          chr]))
      levels(mut[, chr]) <- sub("^(GL[0-9]+).[0-9]", "chrUn_\\L\\1", 
                                levels(mut[, chr]), perl = T)
      if (is.null(bsg)) {
        bsg = BSgenome.Hsapiens.UCSC.hg19::Hsapiens
      }
      unknown.regions <- levels(mut[, chr])[which(!(levels(mut[,chr]) %in% GenomeInfoDb::seqnames(bsg)))]
      if (length(unknown.regions) > 0) {
        unknown.regions <- paste(unknown.regions, collapse = ", ")
        warning(paste("Check chr names -- not all match bsg object:\n", 
                      unknown.regions, sep = " "))
        mut <- mut[mut[, chr] %in% GenomeInfoDb::seqnames(bsg), 
                   ]
      }
      mut$context = BSgenome::getSeq(bsg,
                                     mut[, chr], mut[, pos] - 6*pmax(1, nchar(mut[,ref]) - 1, nchar(mut[,alt]) - 1) + 1,
                                     mut[, pos] + 6*pmax(1, nchar(mut[,ref]) - 1, nchar(mut[,alt]) - 1) + 1, as.character = T)

      
      
      
      for(i in 1:nrow(mut)){
        
        s = mut[i,sample.id]
        j = match(s, levels(samples))
        p = mut[i,pos]
        r = mut[i,ref]
        a = mut[i,alt]
        #c = strsplit(mut[i,"context"], "")[[1]]
        c = mut[i,"context"]
        
        n = calc.id.type(r, a, c)
        if(is.na(n)){
          cat(paste0("Variant ", paste(mut[i,chr], p, r, a, sep="-"), " does not match reference! It will be ignored.\n"))
        } else {
          res[j,n] = res[j,n] + 1
        }
        
        
      }
      return(res)
    } else {
      return(mut.to.sigs.input(mut.ref, sample.id, chr, pos, ref, alt, bsg, sig.type, dbs_table))
    }
  })
  results = setNames(results, sig.types)
  return(results)
}

# Non-negative least squares method for SignatureEstimation
decomposeNNLS = function(m, P, ...){
  exposures = coef(nnls(P, m))
  exposures = exposures/sum(exposures)
  return(exposures)
}

# DeconstructSigs wrapper for SignatureEstimation
decomposeDeSig = function(m, P, ...){
  
  m = as.data.frame(t(m))
  rownames(m) = "Sample"
  P = as.data.frame(t(P))
  colnames(P) = colnames(m)
  y = whichSignatures(tumor.ref = m, signatures.ref = P)
  exposures = as.numeric(y$weights)
  #exposures = exposures/sum(exposures)
  return(exposures)
}

# SigLASSO wrapper for SignatureEstimation
decomposeSiglasso = function(m, P, ...){
  counts = list(...)[["counts"]]
  if(!is.null(counts)){
    m = m*counts
  }
  m = matrix(m, ncol=1)
  success = FALSE # randomly fails sometimes
  while(!success){
    tryCatch(expr = {
      y = siglasso(sample_spectrum = m, signature = P, plot=F)
      success = TRUE
    }, error = function(e){},
    finally = {
      next
    })
  }
  
  exposures = y[,1]
  #exposures = exposures/sum(exposures)
  return(exposures)
}

# Multiprocess version of SignatureEstimation::bootstrapSigExposures
bootstrapSigExposuresMP = function (m, P, R, mutation.count = NULL, decomposition.method = decomposeQP, num.threads = 8,
          ...) {
  P = as.matrix(P)
  if (length(m) != nrow(P)) 
    stop("Length of vector 'm' and number of rows of matrix 'P' must be the same.")
  if (any(names(m) != rownames(P))) 
    stop("Elements of vector 'm' and rows of matrix 'P' must have the same names (mutations types).")
  if (ncol(P) == 1) 
    stop("Matrices 'P' must have at least 2 columns (signatures).")
  if (is.null(mutation.count)) {
    if (all(SignatureEstimation:::is.wholenumber(m))) 
      mutation.count = sum(m)
    else stop("Please specify the parameter 'mutation.count' in the function call or provide mutation counts in parameter 'm'.")
  }
  m = m/sum(m)
  K = length(m)
  
  
  replicateMP = function(n, expr, num.threads = 8){
    as.matrix(do.call(cbind, mclapply(integer(n), eval.parent(substitute(function(...) expr)),
                                      mc.cores = num.threads)))
  }
  extra.args = list(...)
  exposures = replicateMP(R, {
    mutations_sampled = sample(seq(K), mutation.count, replace = TRUE, 
                               prob = m)
    m_sampled = as.numeric(table(factor(mutations_sampled, 
                                        levels = seq(K))))
    m_sampled = m_sampled/sum(m_sampled)
    
    do.call(decomposition.method, c(list(m=m_sampled, P=P), extra.args))
  }, num.threads = num.threads)
  
  rownames(exposures) = colnames(P)
  colnames(exposures) = paste0("Replicate_", seq(R))
  errors = apply(exposures, 2, function(e) SignatureEstimation:::FrobeniusNorm(m, 
                                                         P, e))
  names(errors) = colnames(exposures)
  return(list(exposures = exposures, errors = errors))
}


# Specify COSMICv2 & 3 signature sets (incl.OvCa-specific COSMICv3 signatures)
all.signatures = list("DBS" = list("DBS.COSMIC.v3.may2019" = signatures.dbs.cosmic.v3.may2019,
                                   "DBS.COSMIC.v3.may2019.HGSOvCa" = signatures.dbs.cosmic.v3.may2019[c(2,4,5,6,9),]),
                      "SBS" = list("SBS.COSMIC" = signatures.cosmic,
                                   "SBS.COSMIC.exome.v3.may2019" = signatures.exome.cosmic.v3.may2019,
                                   "SBS.COSMIC.genome.v3.may2019" = signatures.genome.cosmic.v3.may2019,
                                   "SBS.COSMIC.exome.v3.may2019.HGSOvCa" = signatures.exome.cosmic.v3.may2019[c(1,2,3,5,11,17,23,31,40,44,45,46),],
                                   "SBS.COSMIC.genome.v3.may2019.HGSOvCa" = signatures.genome.cosmic.v3.may2019[c(1,2,3,5,11,17,23,31,40,44,45,46),]),
                      "ID" = list("ID.COSMIC.v3.may2019" = signatures.id.cosmic.v3.may2019,
                                  "ID.COSMIC.v3.may2019.HGSOvCa" = signatures.id.cosmic.v3.may2019[c(1,2,4,5,6,8,9),]))
SBS.COSMIC.exome.v3.may2019.HGSOvCa = signatures.exome.cosmic.v3.may2019[c(1,2,3,5,11,17,23,31,40,44,45,46),]

# Main function
# data = data.frame with sample, chr, pos, ref, alt columns specified
# sig.types = mutation types to test
# signature.list = list[sig.types] of signature matrices
# bootstrap.reps = number of bootstrap resampling repetitions for calculating variance
# plot.file = output file for plots (pdf)
# bsg = BSGenome object (default is hg19) for fetching reference contexts
# num.threads = number of processes for bootstrap parallelisation
# method.colours = colours for signature methods in plot, default is ggplot colours
# Returns list with signature scores and errors (and extra table for bootstraps) plus input matrices
run_sigs = function(data, sample = "Sample",
                    chr="CHROM", pos="POS", ref="REF", alt="ALT",
                    sig.types = c("SBS", "DBS", "ID"),
                    signature.list = all.signatures,
                    decomposition.methods = c("QP", "SA", "DeconstructSigs",
                                              "SigLASSO", "NNLS"),
                    bootstrap.reps = 0, bsg=NULL,
                    plot=TRUE, plot.file = NULL,
                    num.threads = 8,
                    method.colours = NULL
                    ){
  all.decomposition.methods = list("QP" = decomposeQP,
                               "SA" = decomposeSA,
                               "DeconstructSigs" = decomposeDeSig,
                               "SigLASSO" = decomposeSiglasso,
                               "NNLS" = decomposeNNLS)
  decomposition.methods = all.decomposition.methods[decomposition.methods]
  
  
  
  
  if(!sample %in% colnames(data)){data[,sample] = "_"}
  data = norm_sbs_to_dbs(data, chr=chr, pos=pos, ref=ref, alt=alt, sample = sample)
  data[,"Sample"] = data[,sample]
  sample = "Sample" # deconstructSigs mut.to.sigs.input doesn't work properly for DBS with different sample.id column for some reason
  input.matrices = mut.to.sigs.input.alltypes(mut.ref = data,
                      chr = chr, pos=pos, ref = ref, alt=alt,
                      sample.id = sample, bsg=bsg,
                      sig.types = sig.types, dbs_table = dbs_possible)
  
  results = list(input.matrices = input.matrices)
  samples = unique(data[,sample])
  for(s in samples){
    print(s)
    for(sig.type in sig.types){
      sigs.input = input.matrices[[sig.type]]
      if(is.null(sigs.input)){next}
      if(!s %in% rownames(sigs.input)){next}
      sigs.input = sigs.input[s,,drop=F]
      if(nrow(sigs.input) > 0){
        if(sum(sigs.input)  < 1){
          next
        }
        signatures = signature.list[[sig.type]]
        m = as.matrix(t(sigs.input))
        for(signature.name in names(signatures)){
          signature = signatures[[signature.name]]
          P = as.matrix(t(signature))
          
          result.list = lapply(1:length(decomposition.methods), function(i){
            decomposition.method = decomposition.methods[[i]]
            decomposition.method.name = names(decomposition.methods)[i]
            
            cat(paste0("Method: ", decomposition.method.name,
                       ", Signature: ", signature.name,
                       ", Mutation type: ", sig.type,
                       "\n"))
            if(decomposition.method.name == "SigLASSO"){
              if(sig.type != "SBS"){return(NULL)}
              counts = sum(m)
              y = findSigExposures(m, P,
                                   decomposition.method = decomposition.method,
                                   counts = counts)
            } else {
              y = findSigExposures(m, P,
                                   decomposition.method = decomposition.method)
            }
            
            exposures = data.frame(Signature = rownames(y$exposures),
                                   Exposure = y$exposures[,1],
                                   Method = decomposition.method.name,
                                   Signature.Set = signature.name,
                                   stringsAsFactors = F)
            exposures$Method = decomposition.method.name
            exposures$Signature.Set = signature.name
            exposures$Sample = s
            
            errors = data.frame(Error = y$errors,
                                Method = decomposition.method.name,
                                Signature.Set = signature.name,
                                Sample = s,
                                stringsAsFactors = F)
            return(list(exposures = exposures,
                        errors = errors))
          })
          
          exposures = do.call(rbind, lapply(result.list, function(x){
            x[["exposures"]]
          }))
          errors = do.call(rbind, lapply(result.list, function(x){
            x[["errors"]]
          }))
          results$exposures = rbind(results$exposures, exposures)
          results$errors = rbind(results$errors, errors)
          
          # estimate errors by bootstrapping
          if(bootstrap.reps > 1){
            cat("Bootstrapping...\n")
            result.list = lapply(1:length(decomposition.methods), function(i){
              decomposition.method = decomposition.methods[[i]]
              decomposition.method.name = names(decomposition.methods)[i]
              cat(paste0("Method: ", decomposition.method.name,
                         ", Signature: ", signature.name,
                         ", Mutation type: ", sig.type,
                         "\n"))
              if(decomposition.method.name == "SigLASSO"){
                if(sig.type != "SBS"){return(NULL)}
                counts = sum(m)
                y = bootstrapSigExposuresMP(m, P, R=bootstrap.reps,
                                          decomposition.method = decomposition.method,
                                          num.threads = num.threads,
                                          counts=counts)
              } else {
                y = bootstrapSigExposuresMP(m, P, R=bootstrap.reps,
                                          decomposition.method = decomposition.method,
                                          num.threads = num.threads)
              }
              
              exposures = melt(data.frame(Signature = rownames(y$exposures),
                                          y$exposures), value.name = "Exposure")
              exposures$Method = decomposition.method.name
              exposures$Signature.Set = signature.name
              exposures$Sample = s
              colnames(exposures)[2] = "Replicate"
              errors = data.frame(Replicate = names(y$errors),
                                  Error = y$errors,
                                  Method = decomposition.method.name,
                                  Signature.Set = signature.name,
                                  Sample = s,
                                  stringsAsFactors = F)
              return(list(exposures = exposures,
                          errors = errors))
            })
            exposures = do.call(rbind, lapply(result.list, function(x){
              x[["exposures"]]
            }))
            errors = do.call(rbind, lapply(result.list, function(x){
              x[["errors"]]
            }))
            results$exposures.bootstrap = rbind(results$exposures.bootstrap, exposures)
            results$errors.bootstrap = rbind(results$errors.bootstrap, errors)
          }
          
        }
      }
    }  
  }
  
  
  if(plot){
    plot_results(results, plot.file, method.colours)
  }
  
  return(results)
}


# Plot results
plot_results = function(results, plot.file=NULL, method.colours = NULL, width=16, height=9){
  if(!is.null(plot.file)){
    pdf(plot.file, width=width, height=height)
  }
  int_breaks_rounded <- function(x, n = 5)  pretty(x, n)[round(pretty(x, n),1) %% 1 == 0] # for integer y axis breaks
  for(s in unique(results$exposures$Sample)){
    for(sig.type in names(results$input.matrices)){
      if(s %in% rownames(results$input.matrices[[sig.type]])){
        sigs.input = results$input.matrices[[sig.type]][s,]
      
        theme_common = theme_bw() + theme(panel.grid.major.x = element_blank(),
                                          plot.margin = unit(c(5,1,5,1), "lines"),
                                          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
                                          axis.ticks = element_blank())
        if(sig.type == "SBS"){
          
          cols = c("#5ABCEB", "#050708", "#D33C32", "#CBCACB", "#ABCD72", "#E7C9C6")
          dat = data.frame(x = as.character(1:length(sigs.input)),
                           y = as.numeric(sigs.input),
                           top.cols = gsub(".*\\[|\\].*$", "", colnames(sigs.input)),
                           x.labels = gsub("\\[|>..", "", colnames(sigs.input)),
                           stringsAsFactors = F)
          for(col in c("x", "top.cols")){
            dat[,col] = factor(dat[,col], levels = unique(dat[,col]))
          }
          p = ggplot(dat, aes(x=x,y=y,fill=top.cols)) + geom_col(show.legend = F) + scale_fill_manual(values=cols) + xlab("") + ylab("Mutation Count")
          r = c(0, max(dat$y)*1.05)
          expand.factor = 0.5
          p = p + scale_x_discrete(labels = dat$x.labels) + scale_y_continuous(expand = expand_scale(c(-expand.factor/(1+2*expand.factor), -expand.factor/(1+2*expand.factor))), limits = c(-expand.factor*r[2], r[2]*(1+expand.factor)), breaks=int_breaks_rounded)
          p = p + theme_common
          top.segs = dat %>% group_by(top.cols) %>% summarise(xstart = as.numeric(head(x,1)) - 0.25, xend=as.numeric(tail(x,1)) + 0.25,y=max(dat$y)*1.10, text.x = x[ceiling(length(x)/2)])
          p = p + geom_segment(data = top.segs, mapping =aes(x=xstart,xend=xend, y=y,yend=y, size=2, col=top.cols), show.legend = F) + scale_colour_manual(values=cols)
          p = p + geom_text(data = top.segs, mapping = aes(x=text.x, y= max(dat$y)*1.15, label=top.cols))
          p = p + coord_cartesian(clip="off")
          p = p + ggtitle(paste0(s, " - SBS mutations\n\n\n"))
          print(p)
          
        } else if(sig.type == "DBS"){
          cols = c("#a6cee3", "#1f78b4", 
                   "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", 
                   "#ff7f00", "#cab2d6", "#6a3d9a")
          dat = data.frame(x = as.character(1:length(sigs.input)),
                           y = as.numeric(sigs.input),
                           top.cols = paste0(gsub(">.*$", "", colnames(sigs.input)), ">NN"),
                           x.labels = gsub("^.*>", "", colnames(sigs.input)),
                           stringsAsFactors = F)
          for(col in c("x", "top.cols")){
            dat[,col] = factor(dat[,col], levels = unique(dat[,col]))
          }
          p = ggplot(dat, aes(x=x,y=y,fill=top.cols)) + geom_col(show.legend = F) + scale_fill_manual(values=cols) + xlab("") + ylab("Mutation Count")
          r = c(0, max(dat$y)*1.05)
          expand.factor = 0.5
          p = p + scale_x_discrete(labels = dat$x.labels) + scale_y_continuous(expand = expand_scale(c(-expand.factor/(1+2*expand.factor), -expand.factor/(1+2*expand.factor))), limits = c(-expand.factor*r[2], r[2]*(1+expand.factor)), breaks=int_breaks_rounded)
          p = p + theme_common
          top.segs = dat %>% group_by(top.cols) %>% summarise(xstart = as.numeric(head(x,1)) - 0.25, xend=as.numeric(tail(x,1)) + 0.25, y=max(dat$y)*1.10, text.x = x[ceiling(length(x)/2)])
          p = p + geom_segment(data = top.segs, mapping =aes(x=xstart,xend=xend, y=y,yend=y, size=2, col=top.cols), show.legend = F) + scale_colour_manual(values=cols)
          p = p + geom_text(data = top.segs, mapping = aes(x=text.x, y= max(dat$y)*1.15, label=top.cols))
          p = p + coord_cartesian(clip="off")
          p = p + ggtitle(paste0(s, " - DBS mutations\n\n\n"))
          print(p)
        } else {
          cols = c("#fdbf6f", "#ff7f00", "#b2df8a", "#33a02c",
                   colorRampPalette(colors = c("#fb9a99", "#e31a1c"))(4),
                   colorRampPalette(colors = c("#a6cee3", "#1f78b4"))(4),
                   colorRampPalette(colors = c("#cab2d6", "#6a3d9a"))(4))
          
          dat = data.frame(x = as.character(1:length(sigs.input)),
                           y = as.numeric(sigs.input),
                           top.text1 = c(rep("1bp deletion", 12),
                                         rep("1bp insertion", 12),
                                         rep(">1bp deletions at repeats\n(Deletion length)", 24),
                                         rep(">1bp insertions at repeats\n(Insertion length)", 24),
                                         rep("Deletions with microhomology\n(Deletion length)", 11)),
                           top.text2 = c(rep("C", 6),
                                         rep("T", 6),
                                         rep("C", 6),
                                         rep("T", 6),
                                         rep("2", 6),
                                         rep("3", 6),
                                         rep("4", 6),
                                         rep("5+", 6),
                                         rep("2", 6),
                                         rep("3", 6),
                                         rep("4", 6),
                                         rep("5+", 6),
                                         rep("2", 1),
                                         rep("3", 2),
                                         rep("4", 3),
                                         rep("5+", 5)),
                           x.labels = "",
                           bottom.text1 = c(rep(c("1", "2", "3", "4", "5", "6+"), 2),
                                            rep(c("0", "1", "2", "3", "4", "5+"), 2),
                                            rep(c("1", "2", "3", "4", "5", "6+"), 4),
                                            rep(c("0", "1", "2", "3", "4", "5+"), 4),
                                            c(1, 1:2, 1:3, 1:4, "5+")),
                           bottom.text2 = c(rep("Homopolymer length", 12),
                                            rep("Homopolymer length", 12),
                                            rep("Number of repeat units", 24),
                                            rep("Number of repeat units", 24),
                                            rep("Microhomology length", 11)),
                           stringsAsFactors = F)
          dat$top.cols = paste(dat$top.text1, dat$top.text2)
          for(col in c("x", "top.cols")){
            dat[,col] = factor(dat[,col], levels = unique(dat[,col]))
          }
          top.segs = dat %>% group_by(top.cols) %>% summarise(xstart = as.numeric(head(x,1)) - 0.25, xend=as.numeric(tail(x,1)) + 0.25,
                                                              y=max(dat$y)*1.10, y2=-(0.05*max(dat$y)),
                                                              text.x = x[ceiling(length(x)/2)], text=top.text2[1], text.col = ifelse(text %in% c("T", "5+"), "white", "black"))
          
          
          p = ggplot() + geom_col(data=dat, mapping = aes(x=x,y=y,fill=top.cols), show.legend = FALSE) + scale_fill_manual(values=cols) + xlab("") + ylab("Mutation Count")
          r = c(0, max(dat$y)*1.05)
          expand.factor = 0.5
          p = p + scale_x_discrete(labels = dat$x.labels) + scale_y_continuous(expand = expand_scale(c(-expand.factor/(1+2*expand.factor), -expand.factor/(1+2*expand.factor))), limits = c(-expand.factor*r[2], r[2]*(1+expand.factor)), breaks=int_breaks_rounded)
          p = p + theme_common
          p = p + geom_segment(data = top.segs, mapping =aes(x=xstart,xend=xend, y=y,yend=y, size=2, col=top.cols), show.legend = FALSE) 
          p = p + geom_segment(data = top.segs, mapping =aes(x=xstart,xend=xend, y=y2,yend=y2, size=2, col=top.cols), show.legend = FALSE) + scale_colour_manual(values=cols)
          p = p + geom_text(data = top.segs, mapping = aes(x=text.x, y=y, label=text), col=top.segs$text.col,hjust=0.5, nudge_x = ifelse((top.segs$xend - top.segs$xstart - 0.5) %% 2, 0.5, 0))
          p = p + geom_text(data = dat, mapping = aes(x=x, y=-(0.1*max(dat$y)), label=bottom.text1), hjust=0.5)
          annot2 = dat %>% group_by(bottom.text2, top.text1) %>% summarise(y=-0.15*max(dat$y), y2=1.17*max(dat$y), x=x[ceiling(length(x)/2)]) %>% ungroup()
          p = p + geom_text(data = annot2, mapping=aes(x=x, y=y, label=bottom.text2))
          p = p + geom_text(data = annot2, mapping=aes(x=x, y=y2, label=top.text1))
          
          p = p + coord_cartesian(clip="off")
          p = p + ggtitle(paste0(s, " - InDel mutations\n\n\n\n"))
          
          print(p)
        }
      }
    }
    for(sig.set in unique(results$exposures$Signature.Set)){
      all.sig.names = unique(results$exposures$Signature)
      x = results$exposures %>%
        filter(Signature.Set == sig.set,
               Sample == s) %>%
        filter(Signature %in% Signature[Exposure > 0]) %>% 
        mutate(Signature = factor(Signature, levels = intersect(all.sig.names, Signature)))
      if("exposures.bootstrap" %in% names(results)){
        # boxplot instead
        y = results$exposures.bootstrap %>% 
          filter(Signature.Set == sig.set,
                 Sample == s) %>% 
          mutate(Signature = factor(Signature, levels=intersect(all.sig.names, Signature)))
        x = x %>% mutate(Signature = factor(as.character(Signature), levels=levels(y$Signature)))
        p = ggplot(y, aes(x=Signature, y=Exposure, fill=Method)) + stat_boxplot(geom='errorbar') + geom_boxplot(outlier.shape=NA) +
          geom_boxplot(aes(color = Method),
                       fatten = NULL, fill = NA, coef = 0, outlier.shape=NA,
                       show.legend = F) +
          scale_y_continuous(expand = expand_scale(c(0, 0.05)))
        p = p + geom_point(data = x,
                           mapping = aes(x = Signature, y = Exposure, fill = Method, group = Method),
                           shape=23, position=position_dodge(width=0.75), show.legend = F) + guides(col=FALSE)
        
        
      } else {
        
        p = ggplot(x, aes(x=Signature, y=Exposure, fill=Method)) +
          geom_col(position="dodge") +
          scale_y_continuous(expand = expand_scale(c(0, 0.05)))
      }
      
      p = p + facet_grid(. ~ Signature, scales = "free_x") 
      
      p = p + theme_classic() +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
        theme(strip.background = element_blank(), strip.text.x = element_blank()) + 
        xlab("") + ggtitle(paste0(ifelse(s != "_", paste0(s, " - "), ""), gsub("\\.", " ", sig.set)))
      if(!is.null(method.colours)){
        p = p + scale_fill_manual(name="Method", values = method.colours) + scale_colour_manual(name="Method", values=method.colours)
      }
      print(p)
      
      if("errors.bootstrap" %in% names(results)){
        p = results$errors.bootstrap %>% filter(Signature.Set == sig.set, Sample ==s) %>%
          ggplot(aes(x=Method, y=Error, fill=Method)) +
          stat_boxplot(geom="errorbar") + geom_boxplot(show.legend = F) + theme_classic()
      } else {
        p = results$errors %>% filter(Signature.Set == sig.set, Sample==s) %>%
          ggplot(aes(x=Method, y=Error, col=Method)) + geom_point(show.legend = F) + theme_classic()
      }
      if(!is.null(method.colours)){
        p = p + scale_fill_manual(name="Method", values = method.colours) + scale_colour_manual(name="Method", values=method.colours)
      }
      print(p + ylab("Decomposition Error"))
    }
  }
  if(!is.null(plot.file)){
    dev.off()
  }
}

plot_pies = function(results, signature.sets = NULL, samples = NULL, method="DeconstructSigs", plot.file=NULL,
                     cols = c("#023FA5", "#023FA5", "#7D87B9", "#BEC1D4", 
                              "#D6BCC0", "#BB7784", "gold", "#4A6FE3", "#8595E1", "#B5BBE3", 
                              "#E6AFB9", "#E07B91", "#D33F6A", "#11C638", "#8DD593", 
                              "#C6DEC7", "#EAD3C6", "#F0B98D", "#EF9708", "#0FCFC0", 
                              "#9CDED6", "#D5EAE7", "#F3E1EB", "#F6C4E1", "#F79CD4", 
                              "#866097", "#008941", "#A30059", "#F6C4E1", "#F79CD4", 
                              "#866097", "#008941", "#A30059", "#008080", "#8B0000", 
                              "#F4A460", "#663399")){
  blank_theme <- theme_minimal()+
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      panel.border = element_blank(),
      panel.grid=element_blank(),
      axis.ticks = element_blank(),
      plot.title=element_text(size=14, face="bold"),
      axis.text.x = element_blank()
    )
  
  if(is.null(signature.sets[1])){
    signature.sets = unique(results$exposures$Signature.Set)
  }
  if(is.null(samples)){
    samples = unique(results$exposures$Sample)
  }
  if(!method %in% results$exposures$Method){
    warning(paste0("Method: '", method, "' is not in these results"))
    return(NULL)
  }
  
  if(!is.null(plot.file)){
    pdf(plot.file, height=5, width=5*length(signature.sets))
  }
  for(sample in samples){
    ps = lapply(signature.sets, function(signature.set){
      signatures = unique(results$exposures %>% filter(Signature.Set == signature.set) %>% pull(Signature))
      if(length(signatures) > length(cols)){
        signature.cols = setNames(colorRampPalette(colors = cols)(length(signatures)), signatures)
      } else {
        signature.cols = setNames(cols[1:length(signatures)], signatures)
      }
      p = results$exposures %>% filter(Method == method, Sample == sample, Signature.Set == signature.set) %>%
        filter(Exposure > 0) %>% ggplot(aes(x="", y=Exposure, fill=Signature)) + geom_bar(stat="identity", width=1) +
        scale_fill_manual(values=signature.cols, name="") +
        coord_polar("y", start=0) + blank_theme +
        ggtitle(signature.set)
      return(p)
    })
    ps[["nrow"]] = 1
    do.call(grid.arrange, ps)
  }
  if(!is.null(plot.file)){dev.off()}
}
```

Import individual tumour exome somatic variants file and filter as before (see https://rpubs.com/deepsubs/thesis_tumour_exome_variants)
```{r}
max.callers = 3
f <- read.table("tumour_sample_germline_exome_somatic_caller_file.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="", quote="") %>%
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample") %>% 
  mutate(n.callers = setNames(sapply(strsplit(unique(Identified), "-"), 
                                     function(callers){
                                       ifelse("Intersection" %in% callers, max.callers, sum(!grepl("^filter", callers)))
                                       }), unique(Identified))[Identified])
g <- read.delim("tumour_sample_germline_paired_exomes_HAP.tsv", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  dplyr::rename("Tumour_Sample"="X.Tumour_Sample")
sort(unique(f$Tumour_Sample))
sort(unique(g$Tumour_Sample))
ViP_list <- read.delim("ViP Samples & Candidate Genes.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>% 
  arrange(Sample)
tumour_type <- select(ViP_list,Sample,Tumour.Type) %>% distinct() %>% rename("Sample"="Normal_Sample")
f$Normal_Sample <- as.character(f$Normal_Sample)
g$Normal_Sample <- as.character(g$Normal_Sample)
f <- left_join(f,tumour_type,by="Normal_Sample",copy=FALSE)
g0 <- left_join(g,tumour_type,by="Normal_Sample",copy=FALSE) %>% filter(CANONICAL%in%"YES")
gene_list <- select(ViP_list,Sample,SYMBOL) %>% filter(Sample%in%(f$Normal_Sample))
genes <- gene_list$SYMBOL
tumour_purity <- read.delim("Tumour Purity.txt", header=TRUE, stringsAsFactors=FALSE, sep="\t", comment.char="") %>%
  select(Tumour_Sample,Purity,No_Mutations)
f <- left_join(f,tumour_purity,by="Tumour_Sample",copy=FALSE)
g0 <- left_join(g0,tumour_purity,by="Tumour_Sample",copy=FALSE)

f$QUAL <- as.numeric(f$QUAL) %>% replace_na(0)
f$TUMOUR.PMCAD <- as.numeric(f$TUMOUR.PMCAD) %>% replace_na(0)
f$TUMOUR.PMCDP <- as.numeric(f$TUMOUR.PMCDP) %>% replace_na(0)
f$TUMOUR.PMCFREQ <- as.numeric(f$TUMOUR.PMCFREQ) %>% replace_na(0)
f$NORMAL.PMCAD <- as.numeric(f$NORMAL.PMCAD) %>% replace_na(0)
f$NORMAL.PMCDP <- as.numeric(f$NORMAL.PMCDP) %>% replace_na(0)
f$NORMAL.PMCFREQ <- as.numeric(f$NORMAL.PMCFREQ) %>% replace_na(0)
f$GnomAD_v3_AF <- as.numeric(f$GnomAD_v3_AF) %>% replace_na(0)

f0 <- filter(f,!Identified%in%c("FilteredInAll")) %>% 
    filter(n.callers>=2) %>%
    filter(Consequence_Rank<7)

f1<-filter(f0,(NORMAL.PMCAD <= 2) & (NORMAL.PMCDP >= 10) & (NORMAL.PMCFREQ < 0.01))
f2<-filter(f1,TUMOUR.PMCAD >= 5)
f3<-filter(f2,TUMOUR.PMCDP>=20)
f4<-filter(f3,TUMOUR.PMCFREQ >= (f3$Purity*0.5*(2/3)))
f5<-filter(f4,GnomAD_v2.1_non_cancer_AF <= 0.0001)
f6<-filter(f5,GnomAD_v3_AF <= 0.0001)

samples<-unique(f$Tumour_Sample)
```

Extract remaining unique variants and columns used for mutation signature analysis, and rename sample column
```{r}
f7<-select(f6,c(CHROM,POS,REF,ALT,Variant_Type))
unique(f7$Variant_Type)
f7$Sample<-f6$Tumour_Sample
```

Run mutation signature functions (DeconstructSigs, SigLASSO, SA, QP, NNLS, SignatureEstimation), including count of number of variants used and filtering metrics, and output results
```{r}
mut.ref<-f7
mut.ref$CHROM<-unlist(lapply(mut.ref$CHROM,function(x) paste("chr",x,sep=""))) 
mut.ref_edit<-mut.ref[,c("Sample","CHROM","POS","REF","ALT")]

for (sample in samples){
      setwd("Tumour Mutation Signatures")
      mut.ref2<-mut.ref_edit[mut.ref_edit$Sample==sample,]
      mut.ref2<-unique(mut.ref2)
      mutNum <- nrow(mut.ref2)
      options(scipen = 999)
      s <- paste("The number of variants in sample", sample, "used for mutation signature analysis is", mutNum)
      t <- paste("The minimum TUMOUR.PMCAD in sample", sample, "used for filtering is", min(unique(f2$TUMOUR.PMCAD)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f6$TUMOUR.PMCAD)))
      u <- paste("The minimum TUMOUR.PMCDP in sample", sample, "used for filtering is", min(unique(f3$TUMOUR.PMCDP)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f6$TUMOUR.PMCDP)))
      v <- paste("The minimum TUMOUR.PMCFREQ in sample", sample, "used for filtering is", min(unique(f4$TUMOUR.PMCFREQ)), "and the minimum figure for the variants used for mutation signature analysis is", min(unique(f6$TUMOUR.PMCFREQ)))
      w <- paste("The maximuum GnomAD_v2.1_non_cancer_AF in sample", sample, "used for filtering is", max(unique(f5$GnomAD_v2.1_non_cancer_AF)), "and the maximum figure for the variants used for mutation signature analysis is", max(unique(f6$GnomAD_v2.1_non_cancer_AF)))
      x <- paste("The maximum GnomAD_v3_AF in sample", sample, "used for filtering and for the variants used for mutation signature analysis is", max(unique(f6$GnomAD_v3_AF)))
      parameters <- print(c(s,t,u,v,w,x))
      options(scipen = 0)
      sigs.input = run_deconstructSigs(mut.ref2)
      sigs.input = run_deconstructSigs(mut.ref2,
                                       signatures.ref = SBS.COSMIC.exome.v3.may2019.HGSOvCa)
      sigs.input = run_deconstructSigs(mut.ref2,
                                       signatures.ref = signatures.cosmic)
      setwd(sample)
      mut.ref3 = mut.ref2[,c(3:5)]
      mut.ref.chr = tibble(gsub("[chr]","", mut.ref2$CHROM))
      bind_cols(mut.ref.chr,mut.ref3) %>% write_tsv(path=paste(sample,"unique_mutations_forSignal.tsv",sep="_"),col_names=FALSE)
      results = run_sigs(data=mut.ref2, bootstrap.reps = 100, plot = TRUE, plot.file = paste(sample,"_signatures.pdf",sep=""))
      write.table(results$exposures, file=paste(sample,"signatures_exposures.tsv",sep="_"), sep='\t', col.names=T, row.names=F, quote=F)
      write.table(results$errors, file=paste(sample,"signatures_errors.tsv",sep="_"), sep='\t', col.names=T, row.names=F, quote=F)
      write.table(results$exposures.bootstrap, file=paste(sample,"signatures_exposures_bootstraps.tsv",sep="_"), sep='\t', col.names=T, row.names=F, quote=F)
      write.table(results$errors.bootstrap, file=paste(sample,"signatures_errors.bootstaps.tsv",sep="_"), sep='\t', col.names=T, row.names=F, quote=F)
      write.table(parameters,file=paste(sample,"_parameters.txt",sep=""), quote=F,row.names = F,sep="\t")
    }
```
