STEP 0: Import all data and fix formats.

library(dplyr)
library(data.table)

setwd() # fix this according to your system #

allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$effect <- gsub("3UTR.*", "3UTR", allmutations$effect) 
allmutations$SIFT <- gsub("\\(.*", "", allmutations$SIFT)
allmutations$PolyPhen <- gsub("\\(.*", "", allmutations$PolyPhen)
allmutations$Amino_Acid_Change <- gsub("[^0-9]", "", allmutations$Amino_Acid_Change) 
allmutations$Amino_Acid_Change <- as.numeric(allmutations$Amino_Acid_Change)

allmutations$effect <- gsub("Frame_Shift_Del", "Frameshift", allmutations$effect) # simple recode effect #
allmutations$effect <- gsub("Frame_Shift_Ins", "Frameshift", allmutations$effect)
allmutations$effect <- gsub("In_Frame_Del", "Deletion", allmutations$effect)
allmutations$effect <- gsub("In_Frame_Ins", "Insertion", allmutations$effect)
allmutations$effect <- gsub("Flank", "Others", allmutations$effect)
allmutations$effect <- gsub("Intron","Others", allmutations$effect)
allmutations$effect <- gsub("larged","D", allmutations$effect)
allmutations$effect <- gsub("_Mutation","", allmutations$effect)
allmutations$effect <- gsub("RNA", "Others", allmutations$effect)
allmutations$effect <- gsub("_S", " s", allmutations$effect)
allmutations$effect <- gsub("Translation start site", "Others", allmutations$effect)
allmutations$effect <- gsub("UTR", "Others", allmutations$effect)
allmutations$effect <- gsub("[^A-z]", "", allmutations$effect)

allmutations$PolyPhen <- gsub("^$","unknown",allmutations$PolyPhen) # recoding polyphen & SIFT #
allmutations$SIFT <- gsub("^$","unknown",allmutations$SIFT)

conv <- fread("uniprot_matrisome.txt", header=F, sep="\t") # load conversion from genes to protein names #
conv <- conv[,c(1:2)]
colnames(conv) <- c("gene","Uniprot_ID")

mat <- fread("matrisome_hs.txt", header = T, sep="\t") 
conv <- merge(conv, mat, by.x="gene", by.y="Gene Symbol")
conv <- as.data.frame(conv)

#Adding a column that states whether a gene is matrisome or not #
allmutations$is.matrisome <- ifelse(allmutations$gene %in% mat$`Gene Symbol`, "matrisome", "rest.of.genome")
cl <- fread("TCGAclindata.txt", header=T,sep="\t")
cl <- cl[,c(1,3)]
cl <- unique(merge(cl, allmutations, by="sample"))
cl <- subset(cl, cl$effect != "Silent")

STEP 1: A general overview of the data included in the study so far. We have a total of 2277979 non-silent entries from 9075 patients and 32 tumor types. Of these, only ~ 6.6% occur in the matrisome, but considering that the matrisome is 1027 genes in total (vs. 20228 other genes) the frequency of mutation “per gene” is ~ 147 in the matrisome vs. ~105 in the rest of the genome, in line with our previous report [https://pubmed.ncbi.nlm.nih.gov/32722287/]. Regardless, there is no difference between the % of the different types of mutation in matrisome vs. rest of the genome, again as reported.

length(unique(cl$sample))
length(unique(cl$`cancer type abbreviation`))
allmutations <- cl

tab1 <- table(allmutations$is.matrisome)
tab1 <- (tab1/2277979)*100
length(unique(mat$`Gene Symbol`))
sb <- subset(allmutations, allmutations$is.matrisome != "matrisome")
length(unique(sb$gene))
tab1 <- table(allmutations$is.matrisome)
pie(tab1, col = c("#fdae61","#4575b4"))
tab1[1] <- tab1[1]/1027
tab1[2] <- tab1[2]/20259
barplot(tab1, col = c("#fdae61","#4575b4"), ylim = c(0,200))
tab1 <- table(allmutations$is.matrisome, allmutations$effect) 
tab2 <- (tab1/rowSums(tab1))*100
chisq.test(tab2) #not significant#

library(ggplot2)
vec <- c("#fdae61","#4575b4", "#313695", "#92c5de", "#d73027","#fee090", "#e0f3f8","#f46d43", "#abd9e9","#a50026") # vector for colours #
df <- as.data.frame(tab2)
colnames(df) <- c("type", "mutation", "frequency (%)")
df$type <- gsub("rest.of.genome", "nonmatrisome", df$type)
ggplot(df, aes(x=mutation, y=`frequency (%)`, fill=type)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity") + 
     theme_classic() + 
     scale_fill_manual(values = vec) + 
     coord_flip()

tab1 <- table(allmutations$is.matrisome, allmutations$SIFT)
tab2 <- (tab1/rowSums(tab1))*100
chisq.test(tab2) #not significant#
df <- as.data.frame(tab2)
colnames(df) <- c("type", "mutation", "frequency (%)")
df$type <- gsub("rest.of.genome", "nonmatrisome", df$type)
ggplot(df, aes(x=mutation, y=`frequency (%)`, fill=type)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity") + 
     theme_classic() + 
     scale_fill_manual(values = vec) + 
     coord_flip()

tab1 <- table(allmutations$is.matrisome, allmutations$PolyPhen)
tab2 <- (tab1/rowSums(tab1))*100
chisq.test(tab2) #not significant#
df <- as.data.frame(tab2)
colnames(df) <- c("type", "mutation", "frequency (%)")
df$type <- gsub("rest.of.genome", "nonmatrisome", df$type)
ggplot(df, aes(x=mutation, y=`frequency (%)`, fill=type)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity") + 
     theme_classic() + 
     scale_fill_manual(values = vec) + 
     coord_flip()

STEP 2: Intersect mutation data with PTM sites data and identify mutations which potentially disrupt PTMs in tumors. In the first chunk, all PTM sites are loaded and integrated with mutation sites.

# PTMs - phosphorylation #

library(R.utils)
# gunzip("Phosphorylation_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

phosdata <- fread("Phosphorylation_site_dataset.txt", header=T, sep="\t") # load PTM #
phosdata$PTM <- c("phosphorylation")
phosdata$MOD_RSD <- gsub("[^0-9]", "", phosdata$MOD_RSD)
phosdata$MOD_RSD <- as.numeric(phosdata$MOD_RSD)

# PTMs - acetylation #

# gunzip("Acetylation_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

acdata <- fread("Acetylation_site_dataset.txt", header=T, sep="\t") # load PTM #
acdata$PTM <- c("acetylation")
acdata$MOD_RSD <- gsub("[^0-9]", "", acdata$MOD_RSD)
acdata$MOD_RSD <- as.numeric(acdata$MOD_RSD)

# PTMs - methylation #

# gunzip("Methylation_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

metdata <- fread("Methylation_site_dataset.txt", header=T, sep="\t") # load PTM #
metdata$PTM <- c("methylation")
metdata$MOD_RSD <- gsub("[^0-9]", "", metdata$MOD_RSD)
metdata$MOD_RSD <- as.numeric(metdata$MOD_RSD)

# PTMs - O-glycosylation #

# gunzip("O-GalNAc_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #
# gunzip("O-GlcNAc_site_dataset.gz") # Same as on top #

Ogal <- fread("O-GalNAc_site_dataset.txt", header=T, sep="\t") # load PTM #
Ogal$MOD_RSD <- gsub("[^0-9]", "", Ogal$MOD_RSD)
Ogal$MOD_RSD <- as.numeric(Ogal$MOD_RSD)

Oglc <- fread("O-GlcNAc_site_dataset.txt", header=T, sep="\t")
Oglc$MOD_RSD <- gsub("[^0-9]", "", Oglc$MOD_RSD)
Oglc$MOD_RSD <- as.numeric(Oglc$MOD_RSD)

OG <- rbind(Ogal, Oglc)
OG$PTM <- c("O-glycosylation")

# PTMs - sumoylation #

# gunzip("Sumoylation_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

sumodata <- fread("Sumoylation_site_dataset.txt", header=T, sep="\t") # load PTM #
sumodata$PTM <- c("sumoylation")
sumodata$MOD_RSD <- gsub("[^0-9]", "", sumodata$MOD_RSD)
sumodata$MOD_RSD <- as.numeric(sumodata$MOD_RSD)

# PTMs - ubiquitylation #

# gunzip("Ubiquitination_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

ubidata <- fread("Ubiquitination_site_dataset.txt", header=T, sep="\t") # load PTM #
ubidata$PTM <- c("ubiquitylation")
ubidata$MOD_RSD <- gsub("[^0-9]", "", ubidata$MOD_RSD)
ubidata$MOD_RSD <- as.numeric(ubidata$MOD_RSD)

# PTMs - N-glycosylation #

Nglyc <- fread("N-GlycositeAtlas_HumanAll.txt", sep="\t", header=T) # data from N-GlycositeAtlas, converted into txt file and deletion of first row #
Nglyc <- Nglyc[,2:5]
colnames(Nglyc) <- c("Uniprot_ID", "gene", "protein", "Amino_Acid_Change")
Nglyc$PTM <- c("N-glycosylation")
Nglyc$Amino_Acid_Change <- as.numeric(Nglyc$Amino_Acid_Change)

# PTMs - hydroxylation #

h <- fread("Hydroxylation sites_Uniprot.txt", sep="\t", header=T) # data from uniprot #
colnames(h) <- c("Uniprot_ID", "gene", "specification", "Amino_Acid_Change", "validation", "PTM")
h$specification <- NULL

## Combine all PTMs ##

allPTMs <- rbind(acdata, metdata) # combining all mutated PTM sites #
allPTMs <- rbind(allPTMs, OG)
allPTMs <- rbind(allPTMs, phosdata)
allPTMs <- rbind(allPTMs, sumodata)
allPTMs <- rbind(allPTMs, ubidata)
allPTMs <- allPTMs[, c(1:3, 5, 15)]
colnames(allPTMs) <- c("gene", "protein", "Uniprot_ID", "Amino_Acid_Change", "PTM")

allPTMs <- rbind(allPTMs, h, fill=T) 
allPTMs <- rbind(allPTMs, Nglyc, fill=T)
allPTMs$validation <- NULL

allPTMs <- as.data.frame(allPTMs)
allPTMs <- unique(allPTMs)
allPTMs$gene <- toupper(allPTMs$gene) # the file contains upper and lower case gene names, some wouldn't match with mutations otherwise #

hum <- fread("uniprot-proteome_UP000005640.tab",header=T,sep="\t") #need to cut to human proteins only, some of the IDs included are form other species!#
allPTMs <- subset(allPTMs, allPTMs$Uniprot_ID %in% hum$Entry)
allmutations_PTMs <- merge(allmutations, allPTMs, by.x=c("gene", "Amino_Acid_Change"), by.y=c("gene", "Amino_Acid_Change"))

fwrite(allmutations_PTMs, "All PTM mutations.txt", sep="\t", row.names = F, quote=F)
fwrite(allmutations, "All mutations.txt", sep="\t", row.names = F, quote=F)

In the second chunk, we start to analyze the breakdowns and genomic patterns of disruptive PTM mutations (PTMmut). In total, there are 42733 PTMmut (~1.87% of all mutations) from 6303 patients and 32 cancers, of which 1811 from the matrisome. The ratio of PTMmut/total mut is thus 1.2% for the matrisome and 1.9% for the rest of the genome. Note that the matrisome has, on average, lower PTMmut frequency per unit length as compared to the rest of the genome, oppositely to what observed for other mutations.

#PTMmut vs non-PTMmut#
dim(allmutations_PTMs)
(nrow(allmutations_PTMs)/nrow(allmutations))*100
length(unique(allmutations_PTMs$sample))
length(unique(allmutations_PTMs$`cancer type abbreviation`))
nrow(subset(allmutations_PTMs, allmutations_PTMs$is.matrisome != "rest.of.genome"))

r1 <- (nrow(subset(allmutations_PTMs, allmutations_PTMs$is.matrisome != "rest.of.genome"))/nrow(subset(allmutations, allmutations$is.matrisome != "rest.of.genome")))*100
r2 <- (nrow(subset(allmutations_PTMs, allmutations_PTMs$is.matrisome == "rest.of.genome"))/nrow(subset(allmutations, allmutations$is.matrisome == "rest.of.genome")))*100
r1
r2

tab1 <- c((nrow(allmutations_PTMs)/nrow(allmutations))*100,100-((nrow(allmutations_PTMs)/nrow(allmutations))*100))
names(tab1) <- c("PTMmut", "other mutations")
pie(tab1, col = c("red","#4575b4"))

#N of PTMmut by gene length#
library(goseq)
g <- unique(allmutations_PTMs$gene)
l<- getlength(g,'hg19','geneSymbol')
df <- data.frame(g=g,l=l)
tab1 <- as.data.frame(table(allmutations_PTMs$gene))
df <- merge(df,tab1,by.x="g",by.y="Var1")
df$r <- df$Freq/df$l
df$source <- ifelse(df$g %in% mat$`Gene Symbol`, "matrisome", "rest.of.genome")
df <- na.omit(df)
tab1 <- as.data.frame(table(allmutations$gene))
df <- merge(df,tab1,by.x="g",by.y="Var1")
df$Freq.y <- df$Freq.y-df$Freq.x
df$r2 <- df$Freq.y/df$l
df <- na.omit(df)
v <- aggregate(df$r,list(df$source),mean)
v2 <- aggregate(df$r2,list(df$source),mean) #all non-PTM-muts#
v <- merge(v,v2,by="Group.1")

df <- data.frame(g=g,l=l)
tab1 <- as.data.frame(table(allmutations$gene, allmutations$`cancer type abbreviation`))
df <- merge(df,tab1,by.x="g",by.y="Var1")
df$r <- df$Freq/df$l
df <- na.omit(df)
v <- aggregate(df$r,list(df$Var2),mean)
v$ord <- c(1:nrow(v))
v <- v[order(-v$ord),]
v$ord <- c(1:nrow(v))

ggplot(v, aes(x=ord, y=x)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity", fill="#4575b4") + 
     theme_classic() +  
     coord_flip() +
     scale_x_continuous(breaks = c(1:32),labels = as.character(v$Group.1)) +
     xlab("") + ylab("mean frequency of mutations per gene length")

df <- data.frame(g=g,l=l)
tab1 <- as.data.frame(table(allmutations_PTMs$gene, allmutations_PTMs$`cancer type abbreviation`))
df <- merge(df,tab1,by.x="g",by.y="Var1")
df$r <- df$Freq/df$l
df <- na.omit(df)
v <- aggregate(df$r,list(df$Var2),mean)
v$ord <- c(1:nrow(v))
v <- v[order(-v$ord),]
v$ord <- c(1:nrow(v))

ggplot(v, aes(x=ord, y=x)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity", fill="#fdae61") + 
     theme_classic() +  
     coord_flip() +
     scale_fill_manual(values="#fdae61") +
     scale_x_continuous(breaks = c(1:32),labels = as.character(v$Group.1)) +
     xlab("") + ylab("mean frequency of PTMmut per gene length")

Transitions and transversions are also equal in absolute, in line with previous reports. In comparison, also, transitions and transversions seem to have similar mutational effect, the only noticeable difference being the reduced amount of transversions among splice-site mutations.

m <- subset(allmutations_PTMs, !nchar(allmutations_PTMs$reference) > 1)
m <- subset(m, !nchar(m$alt) > 1)
m <- subset(m, !m$reference == "-")
m <- subset(m, !m$alt == "-")
m$mut.type <- paste0(m$reference,m$alt)
m$mut.type <- ifelse(m$mut.type == "AC", "transversions", ifelse(m$mut.type == "CA", "transversions", ifelse(m$mut.type == "GT", "transversions", ifelse(m$mut.type == "TG", "transversions", "transitions"))))
chisq.test(table(m$is.matrisome,m$mut.type)) #P-value = 0.5431
tab1 <- as.data.frame(table(m$is.matrisome,m$mut.type))

library(ggsci)
ggplot(tab1, aes(x=Var1, y=Freq, fill=Var2)) +
     geom_bar(position="fill" , stat="identity") + 
     scale_fill_aaas() + 
     theme_classic() + xlab("") 

m <- subset(allmutations, !nchar(allmutations$reference) > 1)
m <- subset(m, !nchar(m$alt) > 1)
m <- subset(m, !m$reference == "-")
m <- subset(m, !m$alt == "-")
m$mut.type <- paste0(m$reference,m$alt)
m$mut.type <- ifelse(m$mut.type == "AC", "transversions", ifelse(m$mut.type == "CA", "transversions", ifelse(m$mut.type == "GT", "transversions", ifelse(m$mut.type == "TG", "transversions", "transitions"))))
chisq.test(table(m$is.matrisome,m$mut.type)) #P-value = 0.01251, but if using %'s [tb <- table(m$is.matrisome,m$mut.type) ; tb <- (tb/rowSums(tb))*100] then there clearly are no differences!
tab1 <- as.data.frame(table(m$is.matrisome,m$mut.type))

ggplot(tab1, aes(x=Var1, y=Freq, fill=Var2)) +
     geom_bar(position="fill" , stat="identity") + 
     scale_fill_aaas() + 
     theme_classic() + xlab("")

m <- subset(allmutations_PTMs, !nchar(allmutations_PTMs$reference) > 1)
m <- subset(m, !nchar(m$alt) > 1)
m <- subset(m, !m$reference == "-")
m <- subset(m, !m$alt == "-")
m$mut.type <- paste0(m$reference,m$alt)
m$mut.type <- ifelse(m$mut.type == "AC", "transversions", ifelse(m$mut.type == "CA", "transversions", ifelse(m$mut.type == "GT", "transversions", ifelse(m$mut.type == "TG", "transversions", "transitions"))))

m.1 <- subset(m, m$is.matrisome == "matrisome")
chisq.test(table(m.1$mut.type,m.1$effect)) #P-value = 4.708e-06
tab1 <- as.data.frame(table(m.1$mut.type,m.1$effect))
m.2 <- subset(m, m$is.matrisome != "matrisome")
chisq.test(table(m.2$mut.type,m.2$effect)) #P-value = 2.2e-16
tab2 <- as.data.frame(table(m.2$mut.type,m.2$effect))
tab1$source <- "matrisome"
tab2$source <- "rest.of.genome"
tab <- bind_rows(tab1,tab2)
tab$source <- paste0(tab$source,"_",tab$Var1)
tab <- subset(tab, tab$Var2 != "Others") #there is jut one "Others" mutation here, to be removed to make columns compatible later#
res <- list()
for(i in unique(tab$Var2)){
  z <- subset(tab, tab$Var2 == i)
  df <- as.data.frame(z$Freq)
  names(df) <- unique(z$Var2)
  res[[i]] <- df
}
res <- bind_cols(res)
rownames(res) <- unique(tab$source)

ggplot(tab, aes(x=source, y=Freq, fill=Var2)) +
     geom_bar(position="fill" , stat="identity") + 
     scale_fill_aaas() + 
     theme_classic() + theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +   xlab("")

m.1 <- merge(mat,m.1,by.x="Gene Symbol",by.y="gene")
m.1a <- subset(m.1, m.1$mut.type == "transitions")
m.1b <- subset(m.1, m.1$mut.type == "transversions")
t1 <- as.data.frame(table(m.1a$Category))
t1$type="transitions"
t2 <- as.data.frame(table(m.1b$Category))
t2$type="transversions"

tab1 <- rbind(t1,t2) #P-value : 0.4122#

ggplot(tab1, aes(x=type, y=Freq, fill=Var1)) +
     geom_bar(position="fill" , stat="identity") + 
     scale_fill_d3() + 
     theme_classic() + theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

At a glance, comparison of disruptive PTMmut in matrisome vs rest of the genome shows accumulation of events potentially affecting hydroxylation, N- and O-glycosylation in the matrisome, which is conversely devoid of methylation, sumoylation and ubiquitylation event. Acetylation and phoshporylation events seem comparable. However, the baseline frequency of PTM events in the genome is different, so we need to account for such differences before comparing. When we do, we get a measure we call “burden” (actually, the global burden) and we notice that the matrisome accumulates more PTMmut than the rest of the genome in almost all categories, with the most evident differences in hydroxylation, O-glycosylation and acetylation.

#overall PTMs and PTMmut in matrisome vs. non-matrisome#
allPTMs$source <- ifelse(allPTMs$gene %in% mat$`Gene Symbol`, "matrisome","rest.of.genome")
tab1 <- table(allPTMs$source,allPTMs$PTM)
tab2 <- (tab1/rowSums(tab1))*100 
chisq.test(tab2) # p-value = 0.000149 -> significant! #
df <- as.data.frame(tab2)
colnames(df) <- c("type", "PTM", "frequency (%)")

ggplot(df, aes(x=type, y=`frequency (%)`, fill=PTM)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_d3() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

tab1 <- table(allmutations_PTMs$is.matrisome,allmutations_PTMs$PTM)
tab2 <- (tab1/rowSums(tab1))*100 
chisq.test(tab2) # p-value = 0.0002194 -> significant! #
df <- as.data.frame(tab2)
colnames(df) <- c("type", "PTM", "frequency (%)")

ggplot(df, aes(x=type, y=`frequency (%)`, fill=PTM)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_d3() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

#normalization of N of PTMmut by N Ptm sites (burden), matrisome vs. non-matrisome#
res <- list()
for(i in unique(allmutations_PTMs$`cancer type abbreviation`)){
  print(paste0("analyzing cancer: ",i))
  z <- subset(allmutations_PTMs, allmutations_PTMs$`cancer type abbreviation` == i)
  z <- unique(z[,c(1,2,17)])
  z <- as.data.frame(table(z$gene,z$PTM))
  z2 <- subset(allPTMs, allPTMs$gene %in% z$Var1)
  z2 <- as.data.frame(table(z2$gene,z2$PTM))
  m <- merge(z,z2,by=c("Var1","Var2"),all.y=T)
  m$tot <- rowSums(m[,c(3,4)])
  #m <- subset(m,m$tot > 0)
  m$burden <- round((m$Freq.x/m$tot)*100,2)
  m$burden[is.na(m$burden)] <- 0
  m$cancer <- i
  res[[i]] <- m
}
res <- bind_rows(res)
res$source <- ifelse(res$Var1 %in% mat$`Gene Symbol`, "matrisome", "rest of genome")
v <- aggregate(res$burden,list(res$Var2, res$source),mean)
v2 <- data.frame(Group.1="hydroxylation",Group.2="rest of genome",x=0)
v <- bind_rows(v,v2)

ggplot(v, aes(x=Group.1, y=x, fill=Group.2)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity") + 
     theme_classic() + 
     scale_fill_manual(values = vec) + 
     theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +      xlab("") + ylab("")

res2 <- list()
for(i in unique(res$Var2)){
  z <- subset(res, res$Var2==i)
  x <- subset(z,z$source == "matrisome")
  y <- subset(z, z$source != "matrisome")
  tt <- t.test(x$burden,y$burden)
  p <- paste0("p value for ",i," : ",format.pval(tt$p.value))
  res2[[i]] <- p
} #hydroxylation cannot be counted, we proxy at p-value < 2.22e-16#

v2 <- list()
for(i in unique(v$Group.2)){
  z <- subset(v, v$Group.2 == i)
  df <- data.frame(source=i, t(z$x))
  names(df)[2:ncol(df)] <- z$Group.1
  v2[[i]] <- df
}
v2 <- bind_rows(v2)
v2 <- v2[,c(1:9)]
rownames(v2) <- v2[,1]
v2[,1] <- NULL
v2[is.na(v2)] <- 0.00000000001
v2 <- v2[1,]/v2[2,]
v2

Finally, we recalculate the burdens (exact by mutation), merge with the PTMmut table and export it.

res <- list()
for(i in unique(allmutations_PTMs$`cancer type abbreviation`)){
  print(paste0("analyzing cancer: ",i))
  z <- subset(allmutations_PTMs, allmutations_PTMs$`cancer type abbreviation` == i)
  z <- unique(z[,c(1,2,17)])
  g <- z$gene
  z$gene <- paste0(z$gene,"_",z$Amino_Acid_Change)
  z <- as.data.frame(table(z$gene,z$PTM))
  z2 <- subset(allPTMs, allPTMs$gene %in% g)
  z2 <- as.data.frame(table(z2$gene,z2$PTM))
  z$nVar1 <- gsub("\\_.*","",z$Var1)
  names(z)[1] <- "id"
  names(z)[4] <- "Var1"
  m <- merge(z,z2,by=c("Var1","Var2"),all.y=T)
  m[is.na(m)]<- 0
  m$tot <- rowSums(m[,c(4,5)])
  m <- subset(m,m$tot > 0)
  m$burden <- round((m$Freq.x/m$tot)*100,2)
  m$cancer <- i
  m <- subset(m,m$burden > 0)
  res[[i]] <- m
}
res <- bind_rows(res)
allmutations_PTMs$id <- paste0(allmutations_PTMs$gene,"_",allmutations_PTMs$Amino_Acid_Change)

fin <- merge(allmutations_PTMs,res,by.x=c("cancer type abbreviation","id"),by.y=c("cancer","id"))
fin <- as.data.frame(fin)
fin <- fin[,c(1,4,3,4:18,24)]
fin[,4] <- NULL
fin <- fin[,c(4,1,3,2,5:ncol(fin))]
fwrite(fin,"TABLE 1.txt",sep="\t",row.names = F,quote = F)

Before moving on, we calculate the correlation between the burden and the N of mutations. Expectedly, these two measures are generally opposite and breaking down by tumor type and matrisome category confirms that all significant correlations found are negative.

fin2 <- merge(fin,mat,by.x="gene",by.y="Gene Symbol")
fin2 <- unique(fin2)
res <- list()
res2 <- list()
for(i in unique(fin2$`cancer type abbreviation`)){
  print(paste0("analyzing tumor: ",i))
  z <- subset(fin2, fin2$`cancer type abbreviation` == i)
  z <- na.omit(z)
  for(w in unique(z$Category)){
    print(paste0("*** now analyzing matrisome category: ",w))
    df <- subset(z, z$Category == w)
    x <- df[, c(1,18)]
    y <- as.data.frame(table(df$gene))
    m <- na.omit(m)
    m <- merge(x,y,by.x="gene",by.y="Var1")
    if(nrow(m)>=4){
        ct <- cor.test(m$burden,m$Freq)
        m$tumor <- i
        m$category <- w
        m$corr <- ct$estimate
        m$pval <- ct$p.value}else{
          m$tumor <- i
          m$category <- w
          m$corr <- 0
          m$pval <- 1
        }
    ct <- NULL
    res[[w]] <- m
  }
  res2[[i]] <- bind_rows(res)
}
res2 <- bind_rows(res2)
res2$eval <- ifelse(res2$pval < 0.05, "significant", "NS")
res2$dir <- ifelse(res2$corr > 0 , "positive", "negative")

for(i in unique(res2$category)){
    z <- subset(res2, res2$category == i)
    lm <- lm(z$burden~z$Freq)
    s <- summary(lm)
    df <- data.frame(category=i, lr=s$coefficients[2,1], pval=format.pval(s$coefficients[2,4]))
    print(df)
}

res3 <- unique(res2[,c(4:9)])
table(res3$category,res3$eval,res3$dir)

ggplot(res2, aes(x=burden, y=Freq, color=factor(category))) +
  geom_point() + 
  scale_color_d3() + 
  theme_minimal() + 
  geom_smooth(method = "lm")

tab <- as.data.frame(table(res3$category,res3$eval,res3$dir))
df <- aggregate(tab$Freq,by=list(tab$Var3, tab$Var2),sum)
pie(c(df[1,3],df[3,3]),col = c("blue","red"))
pie(c(df[2,3],df[4,3]),col = c("blue","red"))

STEP 3: PTMmut in the tumor matrisome at the gene level. Here we focus on the matrisome, comparing frequency of mutations between different matrisome categories across cancers etc. Aggregating data by matrisome category shows a clear redistribution of PTMmut types, so that, e.g., hydroxylation is 52% of all PTMmut in collagens (and hydroxy-PTMmut in collagens are 74% of all PTMmut for that PTM) while phoshorylation is 72% of all PTMmut in secreted factors and 21.4% of all phoshpo-PTMmut.

fin2 <- unique(merge(fin,mat,by.x="gene",by.y="Gene Symbol"))
m <- subset(fin2, fin2$is.matrisome == "matrisome")
chisq.test(table(m$Division, m$PTM)) #p-value < 2.2e-16 significant!#
df <- as.data.frame(table(m$Division, m$PTM))

vec <- c("darkkhaki","cadetblue3","skyblue4","peachpuff3","navajowhite4","sandybrown","cadetblue4","lightblue3")

ggplot(df, aes(x=Var1, y=Freq, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_manual(values = vec) + 
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

chisq.test(table(m$Category, m$PTM)) #p-value < 2.2e-16 significant!#
df <- as.data.frame(table(m$Category, m$PTM))

ggplot(df, aes(x=Var1, y=Freq, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_manual(values = vec) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

#PTMmut, % of tot by matrisome category
tab <- table(fin2$Category,fin2$PTM)
tab1 <- round((tab/rowSums(tab))*100,1)
tab1

#PTMmut, % of tot by total in that type of PTM
v <- apply(tab,2,sum)
tab2 <- apply(tab,2,function(x){round((x/sum(x)*100),1)})

The trends seem very similar pan-cancer, with SKCM, STAD and UCEC clearly leading the way. There seem to be no tissue-of-origin effects, though various local differences exist.

tab <- as.data.frame(table(fin2$`cancer type abbreviation`,fin2$Category,fin2$PTM))

ggplot(tab, aes(x=Var1, y=Var3, size=Freq, fill=Var3)) + 
geom_point(alpha=1, shape=21, color="black") + 
theme_classic() + 
scale_size_area(max_size = 8, name="freq") +
scale_fill_manual(values=vec) +
theme(axis.text.x = element_text(angle=90, size=10, vjust=0.4), axis.text.y = element_text(size=8)) + 
xlab("") + 
ylab("") +
facet_wrap(tab$Var2)

res <- list()
res2 <- list()
for(i in unique(tab$Var2)){
  z <- subset(tab, tab$Var2 == i)
  z$Var2 <- NULL
  for(w in unique(z$Var3)){
    k <- subset(z, z$Var3==w)
    df <- data.frame(matrisome=i,PTM=w,t(k$Freq))
    names(df)[3:ncol(df)] <- as.character(k$Var1)
    res[[w]]<- df
  }
  res2[[i]] <- bind_rows(res)
}
res2 <- bind_rows(res2)
fwrite(res2,"TABLE 2.txt", sep="\t", row.names = F, quote = F)

ndf <- list()
for(i in unique(res2$matrisome)){
  z <- subset(res2, res2$matrisome == i)
  print(paste0("now graphing data for: ",i))
  rownames(z) <- z$PTM
  z <- z[,c(3:ncol(z))]
  for(w in 1:nrow(z)){
    k <- z[w,]
    k <- as.data.frame(t(k))
    k$tumor <- rownames(k)
    ndf[[w]] <- k
  }
  f <- Reduce(merge,ndf)
  f <- reshape2::melt(f)
  
  p <- ggplot(data=f, aes(x=tumor, y=value, fill=variable)) +
  geom_bar(stat="identity") + 
  scale_fill_manual(values=vec) + 
  theme_bw() +
  facet_wrap(~f$variable) + ggtitle(i) + ggtitle(i) + theme(strip.background = element_rect(colour="black", fill="white"), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 4))
  plot(p)
}

Examples of hydroxylation and phosphorylation PTMmut in collagens.

#zoom in: collagens#
prova <- subset(fin2, fin2$Category == "Collagens")
prova <- subset(prova, prova$PTM == "phosphorylation" | prova$PTM == "hydroxylation")
prova <- as.data.frame(table(prova$gene,prova$PTM,prova$`cancer type abbreviation`))
prova <- subset(prova, prova$Freq > 0)

ndf <- list()
ndf2 <- list()
for(i in unique(prova$Var3)){
  z <- subset(prova, prova$Var3 == i)
  for(j in unique(z$Var2)){
    k <- subset(z, z$Var2 == j)
    k <- k[order(-k$Freq),]
    if(nrow(k) >= 10){k <- k[1:10,]}else{k <- k}
    ndf[[j]] <- k
  }
  ndf2[[i]] <- bind_rows(ndf)
}
ndf2 <- bind_rows(ndf2)
ndf2 <- merge(ndf2,mat,by.x="Var1",by.y="Gene Symbol")

ggplot(ndf2, aes(x=Var3, y=Var1, size=Freq, fill=Var3)) + 
geom_point(alpha=1, shape=21, color="black") + 
theme_classic() + 
scale_size_area(max_size = 8, name="freq") +
theme(axis.text.x = element_text(angle=90, size=10, vjust=0.4), axis.text.y = element_text(size=8)) + 
xlab("") + 
ylab("") +
facet_wrap(ndf2$Var2)

Finally, the absolute Top10 per tumor type and a look at eventual “hotspots” across and within the cohorts.

prova <- fin2

# to see the table, for describing it in text:
#prova <- table(prova$gene,prova$`cancer type abbreviation`)
#prova <- ifelse(prova == 0, 0 ,1)
#prova <- as.data.frame(prova)
#prova$tot <- rowSums(prova)
#prova <- prova[order(-prova$tot),]

prova <- as.data.frame(table(prova$gene,prova$`cancer type abbreviation`))
prova <- subset(prova, prova$Freq > 0)

res <- list()
for(i in unique(prova$Var2)){
  print(paste0("now analyzing: ",i))
  z <- subset(prova, prova$Var2 == i)
  z <- z[order(-z$Freq),]
  if(nrow(z) <= 10){z <- z}else{z <- z[1:10,]}
  res[[i]] <- z
}
res <- bind_rows(res)

ggplot(res, aes(x=Var2, y=Var1, size=Freq, fill=Var2)) + 
geom_point(alpha=1, shape=21, color="black") + 
theme_classic() + 
scale_size_area(max_size = 8, name="freq") +
theme(axis.text.x = element_text(angle=90, size=10, vjust=0.4), axis.text.y = element_text(size=8)) + 
xlab("") + 
ylab("") 

prova <- fin2
prova$mut <- paste(prova$gene,prova$Amino_Acid_Change,prova$PTM,sep="_")
prova <- table(prova$mut, prova$`cancer type abbreviation`)
prova <- as.data.frame.matrix(prova)
prova[prova > 0] <- 1 #this allows calculation of absolute occurrences#
prova$tot <- rowSums(prova)
prova <- prova[order(-prova$tot),]
df1 <- data.frame(mut=rownames(prova),n.of.cohorts=prova$tot)
prova <- fin2
prova$mut <- paste(prova$gene,prova$Amino_Acid_Change,prova$PTM,sep="_")
prova <- table(prova$mut, prova$`cancer type abbreviation`)
prova <- as.data.frame.matrix(prova)
prova$tot <- rowSums(prova)
prova <- prova[order(-prova$tot),]
df2 <- data.frame(mut=rownames(prova),n.total=prova$tot)
df <- merge(df1,df2,by="mut",all=T)
df$col <- ifelse(df$n.of.cohorts >= 3 & df$n.total >= 3, 1, 0)
df <- df[order(-df$col, -df$n.of.cohorts),]
df$col <- ifelse(df$col==1, "1", "0")
df$labs <- gsub("_.*","",df$mut)

ggplot(df, aes(n.total, n.of.cohorts, color=col, label=labs)) + geom_point() +
  theme_classic() + 
  scale_color_aaas() +
  geom_vline(xintercept = 3.0, linetype="dotted", color = "grey80") +
  geom_hline(yintercept = 3.0, linetype="dotted", color = "grey80")

df <- subset(df, df$col == 1)
df$col <- NULL
colnames(df) <- c("mutation","number of cohorts","number of total occurrences")
fwrite(df, "TABLE 3.txt", sep="\t", quote = F, row.names = F)

STEP 4: Non-conservation of hydroxylation and glycosylation PTMmut. Why is none of these PTMmut in the hotspot list? Is there a negative selection against them? We check here the distribution of PTMmut type per SIFT/polyphen category and evaluate the ratio of non-silent PTMmut to silent PTMmut (dN/dS), as reported [https://doi.org/10.1186/s13059-018-1434-0].

prova <- fin2
tab <- table(prova$PTM,prova$SIFT)
tab <- round((tab/rowSums(tab))*100,0)
#for SIFT, we move the low-confidence findings to the "unknown" group#
tab <- tab[,c(1,3,5)]
tab[,3] <- 100-(tab[,1]+tab[,2])
cc <- chisq.test(tab)
tab <- reshape2::melt(tab)

ggplot(tab, aes(x=Var1, y=value, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_npg() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("") + ggtitle(paste0("matrisome_SIFT","\n",cc$p.value))

allp <- subset(allmutations_PTMs, allmutations_PTMs$is.matrisome == "rest.of.genome")
tab <- table(allp$PTM,allp$SIFT)
tab <- round((tab/rowSums(tab))*100,0)
#for SIFT, we move the low-confidence findings to the "unknown" group#
tab <- tab[,c(1,3,5)]
tab[,3] <- 100-(tab[,1]+tab[,2])
cc <- chisq.test(tab)
tab <- reshape2::melt(tab)

ggplot(tab, aes(x=Var1, y=value, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_npg() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("") + ggtitle(paste0("rest of genome_SIFT","\n",cc$p.value))

tab <- table(prova$PTM,prova$PolyPhen)
tab <- round((tab/rowSums(tab))*100,0)
#for Polyphen, we merge the "damaging" groups together#
tab[,2] <- tab[,1]+tab[,2]
tab <- tab[,c(1,2,4)]
colnames(tab)[2] <- "damaging"
cc <- chisq.test(tab)
tab <- reshape2::melt(tab)
levels(tab$Var2) <- c("damaging","benign","unknown")

ggplot(tab, aes(x=Var1, y=value, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_npg() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("") + ggtitle(paste0("matrisome_PolyPhen","\n",cc$p.value))

tab <- table(allp$PTM,allp$PolyPhen)
tab <- round((tab/rowSums(tab))*100,0)
#for Polyphen, we merge the "damaging" groups together#
tab[,2] <- tab[,1]+tab[,2]
tab <- tab[,c(1,2,4)]
colnames(tab)[2] <- "damaging"
cc <- chisq.test(tab)
tab <- reshape2::melt(tab)
levels(tab$Var2) <- c("damaging","benign","unknown")

ggplot(tab, aes(x=Var1, y=value, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_npg() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("") + ggtitle(paste0("rest of genome_PolyPhen","\n",cc$p.value))

#to calculate the dN/dS ratio, the initial allmutations file must be recreated since we had previously removed all silent mutations!#
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$effect <- gsub("3UTR.*", "3UTR", allmutations$effect) 
allmutations$SIFT <- gsub("\\(.*", "", allmutations$SIFT)
allmutations$PolyPhen <- gsub("\\(.*", "", allmutations$PolyPhen)
allmutations$Amino_Acid_Change <- gsub("[^0-9]", "", allmutations$Amino_Acid_Change) 
allmutations$Amino_Acid_Change <- as.numeric(allmutations$Amino_Acid_Change)
allmutations$effect <- gsub("Frame_Shift_Del", "Frameshift", allmutations$effect) # simple recode effect #
allmutations$effect <- gsub("Frame_Shift_Ins", "Frameshift", allmutations$effect)
allmutations$effect <- gsub("In_Frame_Del", "Deletion", allmutations$effect)
allmutations$effect <- gsub("In_Frame_Ins", "Insertion", allmutations$effect)
allmutations$effect <- gsub("Flank", "Others", allmutations$effect)
allmutations$effect <- gsub("Intron","Others", allmutations$effect)
allmutations$effect <- gsub("larged","D", allmutations$effect)
allmutations$effect <- gsub("_Mutation","", allmutations$effect)
allmutations$effect <- gsub("RNA", "Others", allmutations$effect)
allmutations$effect <- gsub("_S", " s", allmutations$effect)
allmutations$effect <- gsub("Translation start site", "Others", allmutations$effect)
allmutations$effect <- gsub("UTR", "Others", allmutations$effect)
allmutations$effect <- gsub("[^A-z]", "", allmutations$effect)
allmutations$PolyPhen <- gsub("^$","unknown",allmutations$PolyPhen) # recoding polyphen & SIFT #
allmutations$SIFT <- gsub("^$","unknown",allmutations$SIFT)
conv <- fread("uniprot_matrisome.txt", header=F, sep="\t") # load conversion from genes to protein names #
conv <- conv[,c(1:2)]
colnames(conv) <- c("gene","Uniprot_ID")
mat <- fread("matrisome_hs.txt", header = T, sep="\t") 
conv <- merge(conv, mat, by.x="gene", by.y="Gene Symbol")
conv <- as.data.frame(conv)
allmutations$is.matrisome <- ifelse(allmutations$gene %in% mat$`Gene Symbol`, "matrisome", "rest.of.genome")
cl <- fread("TCGAclindata.txt", header=T,sep="\t")
cl <- cl[,c(1,3)]
cl <- unique(merge(cl, allmutations, by="sample"))
allmutations <- cl

res <- list()
res2 <- list()
stp <- 0
pb <- txtProgressBar(min = 0, max = length(unique(allmutations_PTMs$gene)), style = 3)
for(i in unique(allmutations_PTMs$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z1 <- subset(allmutations_PTMs, allmutations_PTMs$gene == i)
  z2 <- subset(allmutations, allmutations$gene == i)
  z2 <- subset(z2, z2$effect =="Silent")
  r1 <- nrow(z1)/nrow(z2)
  z3 <- subset(allmutations, allmutations$gene == i)
  z3 <- subset(z3, z3$effect !="Silent")
  r2 <- (nrow(z3)-nrow(z1))/nrow(z2)
  
  df <- data.frame(gene=i,dN_dS_tot=r2,dN_dS_PTM=r1,PTM=z1$PTM)
  
  res[[i]] <- df
}
res <- bind_rows(res)
res$is.matrisome <- ifelse(res$gene %in% mat$`Gene Symbol`, "matrisome", "rest.of.genome")
res$r <- res$dN_dS_PTM/res$dN_dS_tot

res2 <- list()
for(i in unique(res$PTM)[1:7]){
  x <- subset(res,res$PTM == i & res$is.matrisome == "matrisome")
  y <- subset(res,res$PTM == i & res$is.matrisome != "matrisome")
  tt <- t.test(x$r,y$r)
  df <- data.frame(PTM=i,mean.of.r.matrisome=tt$estimate[1],mean.of.r.rest=tt$estimate[2],pval=tt$p.value,direction=ifelse(tt$estimate[1]>tt$estimate[2],"enriched in matrisome","depleted in matrisome"))
  res2[[i]] <- df
}
res2 <- bind_rows(res2)
m <- reshape2::melt(res2, id.var="PTM")
m <- m[1:14,]

ggplot(m,aes(PTM, as.numeric(value), colour=variable), show.legend=FALSE) +
  geom_boxplot() +
  theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

res2$diff <- res2$mean.of.r.matrisome/mean.of.r.rest
m <- reshape2::melt(res2, id.var="PTM")
m <- m[29:nrow(m),]
ggplot(m,aes(PTM,value,fill=PTM), show.legend=FALSE) +
  geom_bar(stat="identity") +
  theme_bw() + theme() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + scale_fill_d3()

fwrite(res2,"table pvals dN-dS.txt",sep="\t",row.names = F, quote = F)

PTMmut in the tumor matrisome at the protein domain level. Comparing PTMmut and non-PTMmut we expect to find enrichments/depletions at some domains that might explain why some frequent types of PTMs are not in the top-10 list. To this aim, we proceed as follows: 1) map PTMmut and non-PTMmut into protein domains 2) subset to PFAM domains only 3) compare the ratio of PTMmut to non-PTMmut, by domain, in matrisome and non matrisome genes 4) evaluate the 53 domains in common between matrisome and non matrisome for their ratios 5) evaluate the ratios by PTM types in these domains. O-glycosylation and actylation are, in fact, less “tolerated”!

library(ensembldb)
library(EnsDb.Hsapiens.v86)
edb <- EnsDb.Hsapiens.v86
library(magrittr)
pnames <- edb %>% filter(~ symbol == unique(allmutations_PTMs$gene) & tx_biotype =="protein_coding") %>% proteins
z <- select(edb, keys = pnames$protein_id, keytype = "PROTEINID",
            columns = c("PROTEINID", "GENENAME", "PROTDOMSTART", "PROTDOMEND", "PROTEINDOMAINID", "PROTEINDOMAINSOURCE"))
pos <- allmutations_PTMs[,c(1,2,4,14,17)]
pos <- pos[pos$gene %in% z$GENENAME,]
pos <- pos[order(pos$`cancer type abbreviation`),]
library(sqldf)
library(bit64)
res <- list()
res2 <- list()
for(i in unique(pos$'cancer type abbreviation')){
  print(paste0("PTM scanning domains for: ",i))
  k <- unique(subset(pos, pos$'cancer type abbreviation' == i))
  pb <- txtProgressBar(min = 0, max = length(unique(k$gene)), style = 3)
  stp <- 0
  for(q in unique(k$gene)){
    stp <- stp+1
    setTxtProgressBar(pb, stp)
    k2 <- subset(k, k$gene == q)
    nz <- subset(z, z$GENENAME == q)
    db <- sqldf("select * from k2
                left join nz
                on k2.Amino_Acid_Change between nz.PROTDOMSTART and nz.PROTDOMEND")
    db <- na.omit(db)
    res[[q]] <- db
  } 
  res2[[i]] <- bind_rows(res)
}
res2 <- bind_rows(res2)
res2 <- distinct(res2)
dom_ptm <- res2

allmutations_PTMs$mut <- paste0(allmutations_PTMs$gene,"_",allmutations_PTMs$Amino_Acid_Change,"_",allmutations_PTMs$sample)
allmutations$mut <- paste0(allmutations$gene,"_",allmutations$Amino_Acid_Change,"_",allmutations$sample)
pos <- subset(allmutations, !(allmutations$mut %in% allmutations_PTMs$mut))
pos <- pos[,c(8,1,2,10,14)]
pos <- pos[pos$gene %in% z$GENENAME,]
pos <- na.omit(pos)
pos <- pos[order(pos$`cancer type abbreviation`),]
res <- list()
res2 <- list()
for(i in unique(pos$'cancer type abbreviation')){
  print(paste0("NON-PTM scanning domains for: ",i))
  k <- unique(subset(pos, pos$'cancer type abbreviation' == i))
  pb <- txtProgressBar(min = 0, max = length(unique(k$gene)), style = 3)
  stp <- 0
  for(q in unique(k$gene)){
    stp <- stp+1
    setTxtProgressBar(pb, stp)
    k2 <- subset(k, k$gene == q)
    nz <- subset(z, z$GENENAME == q)
    db <- sqldf("select * from k2
                left join nz
                on k2.Amino_Acid_Change between nz.PROTDOMSTART and nz.PROTDOMEND")
    db <- na.omit(db)
    res[[q]] <- db
  } 
  res2[[i]] <- bind_rows(res)
}
res2 <- bind_rows(res2)
res2 <- distinct(res2)
dom_nonptm <- res2

#for a comparison, let's focus on PFAM domains only#
d1 <- subset(dom_ptm, dom_ptm$PROTEINDOMAINSOURCE == "pfam" & dom_ptm$is.matrisome == "matrisome")
d2 <- subset(dom_ptm, dom_ptm$PROTEINDOMAINSOURCE == "pfam" & dom_ptm$is.matrisome != "matrisome")
d3 <- subset(dom_nonptm, dom_nonptm$PROTEINDOMAINSOURCE == "pfam")

res <- list()
stp <- 0
print("checking mutation frequencies in matrisome domains")
pb <- txtProgressBar(min = 0, max = length(unique(d1$PROTEINDOMAINID)), style = 3)
for(i in unique(d1$PROTEINDOMAINID)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(d3, d3$PROTEINDOMAINID == i)
  z <- as.data.table(z)[,.N, by = gene]
  z <- mean(z$N)
  z2 <- subset(d1, d1$PROTEINDOMAINID == i)
  z2 <- as.data.table(z2)[,.N, by = gene]
  z2 <- mean(z2$N)
  res[[i]] <- data.frame(domain=i, freq=(z2/z))
}
res <- bind_rows(res)

res2 <- list()
stp <- 0
print("checking mutation frequencies in non-matrisome domains")
pb <- txtProgressBar(min = 0, max = length(unique(d2$PROTEINDOMAINID)), style = 3)
for(i in unique(d2$PROTEINDOMAINID)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(d3, d3$PROTEINDOMAINID == i)
  z <- as.data.table(z)[,.N, by = gene]
  z <- mean(z$N)
  z2 <- subset(d2, d2$PROTEINDOMAINID == i)
  z2 <- as.data.table(z2)[,.N, by = gene]
  z2 <- mean(z2$N)
  res2[[i]] <- data.frame(domain=i, freq=(z2/z))
}
res2 <- bind_rows(res2)
toex <- merge(res,res2,by="domain",all=T)
toex[is.na(toex)] <- 0
colnames(toex) <- c("PFAM domain","PTMmut/allmut frequency in matrisome", "PTMmut/allmut frequency in rest of genome")
toex$enrich <- ifelse(toex$`PTMmut/allmut frequency in matrisome`> toex$`PTMmut/allmut frequency in rest of genome`,"matrisome","rest of genome")
fwrite(toex,"relative freq of mutations at the domain level.txt",sep="\t",row.names = F, quote = F)

mt <- subset(toex,toex$`PTMmut/allmut frequency in matrisome`> toex$`PTMmut/allmut frequency in rest of genome`)
length(unique(mt$`PFAM domain`)) #128 enriched domains#

m1 <- subset(d1, d1$PROTEINDOMAINID %in% mt$`PFAM domain`)
m2 <- subset(d2, d2$PROTEINDOMAINID %in% mt$`PFAM domain`)
m1 <- distinct(m1[,c("gene","PTM","PROTEINDOMAINID")])
m2 <- distinct(m2[,c("gene","PTM","PROTEINDOMAINID")])
m3 <- distinct(d3[,c("gene","PROTEINDOMAINID")])

res1 <- list()
for(i in unique(m1$PROTEINDOMAINID)){
  z <- subset(m1, m1$PROTEINDOMAINID == i)
  z <- as.data.frame(table(z$PTM))
  z2 <- subset(m3, m3$PROTEINDOMAINID == i)
  z2 <- nrow(z2)
  z$tot <- z2
  z$rel.freq <- z$Freq/z$tot
  res1[[i]] <- z
}
res1 <- bind_rows(res1)
res1 <- aggregate(res1$rel.freq,by=list(res1$Var1),mean)

res2 <- list()
for(i in unique(m2$PROTEINDOMAINID)){
  z <- subset(m2, m2$PROTEINDOMAINID == i)
  z <- as.data.frame(table(z$PTM))
  z2 <- subset(m3, m3$PROTEINDOMAINID == i)
  z2 <- nrow(z2)
  z$tot <- z2
  z$rel.freq <- z$Freq/z$tot
  res2[[i]] <- z
}
res2 <- bind_rows(res2)
res2 <- aggregate(res2$rel.freq,by=list(res2$Var1),mean)

res <- merge(res1,res2,by="Group.1")
res[5,3] <- 1 #proxy
m <- reshape2::melt(res)

ggplot(data=m, aes(x=Group.1, y=value, fill=variable)) +
  geom_bar(stat="identity", position = "dodge") +
  theme_bw() +
  scale_fill_aaas() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Are there correlations between PTMmut location and gene function?

tab <- as.data.frame(fread("relative freq of mutations at the domain level.txt",sep="\t",header = T, dec=","))
library(clusterProfiler)
library(org.Hs.eg.db)
library(cowplot)

tab.mat <- subset(tab, tab[,2]>=2*tab[,3])
sel <- subset(dom_ptm, dom_ptm$PROTEINDOMAINID %in% tab.mat$`PFAM domain`)
gene <- table(sel$gene)
gene <- gene[order(-gene)]

ego <- enrichKEGG(gene)
p1 <- dotplot(ego, showCategory=10, title="matrisome enriched")

tab.mat <- subset(tab, tab[,3]>=2*tab[,2])
sel <- subset(dom_ptm, dom_ptm$PROTEINDOMAINID %in% tab.mat$`PFAM domain`)
gene2 <- table(sel$gene)
gene2 <- gene2[order(-gene2)]

ego <- enrichKEGG(gene2)
p2 <- dotplot(ego, showCategory=10, title="rest of genome enriched")

plot_grid(p1, p2, ncol=2)

ego <- enrichGO(gene          = gene,
                OrgDb         = org.Hs.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05,
        readable      = TRUE)
p1 <- dotplot(ego)

ego <- enrichGO(gene          = gene2,
                OrgDb         = org.Hs.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05,
        readable      = TRUE)
p2 <- dotplot(ego)
plot_grid(p1, p2, ncol=2)

l <- list(mat=gene,rest=gene2)
ck <- compareCluster(geneCluster = l, 
                fun = "enrichGO",
                OrgDb         = org.Hs.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)
dotplot(ck)

And are there matrisome PTMmut associated with protein functions?

reg <- as.data.frame(fread("regions.txt",header=T,sep="\t"))
reg <- reg[,c(1:16)]
reg <- subset(reg, reg$is.matrisome=="matrisome")
reg$`region I` <- ifelse(reg$`region I` == "iside","inside",reg$`region I`)

#of 921 mutations annotated with region information, 286 (286/921, 31%) fall in a functional region#

fmut <- subset(reg, reg$`region I` == "inside")
tfmut <- table(fmut$gene)
tfmut <- tfmut[order(-tfmut)]

Finally, do the proteins targeted by PTMmut interact? To answer this question, we will use PPI data from BioGrid v. 4.2.192.

ppi <- as.data.frame(fread("BIOGRID-ORGANISM-Homo_sapiens-4.2.192.tab3.txt",header=T,sep="\t"))
ppi <- ppi[,c(8,9)]

tab.mat <- subset(tab, tab[,2]>=tab[,3])
tab.mat <- subset(tab, tab$enrich == "matrisome")
sel <- subset(dom_ptm, dom_ptm$PROTEINDOMAINID %in% tab.mat$`PFAM domain`)
sel <- unique(sel$gene)
exp <- data.frame(gene=sel)
fwrite(exp,"gene with PTMmut in enriched domains.txt",sep="\t",row.names = F, quote = F)
ppi.mat <- subset(ppi, ppi$`Official Symbol Interactor A` %in% sel & ppi$`Official Symbol Interactor B` %in% sel)
ppi.mat$type <- ifelse(ppi.mat[,1]==ppi.mat[,2],"homo","hetero")
fwrite(ppi.mat,"PPI interactions in enriched domains.txt",sep="\t",row.names = F, quote = F)

#total would be:
#ppi.mat <- subset(ppi, ppi$`Official Symbol Interactor A` %in% sel | ppi$`Official Symbol Interactor B` %in% sel)

library(igraph)
net <- simplify(graph_from_data_frame(ppi.mat), remove.multiple = F, remove.loops = T)

V(net)$size <- 8
V(net)$frame.color <- "white"
#V(net)$color <- "orange"
E(net)$arrow.mode <- 0
deg <- degree(net, mode = "all")

g <- degree(net)
g <- g[order(-g)]
g <- names(g[1:10])

V(net)$label <- ifelse(V(net)$name %in% g, V(net)$name,"") 
V(net)$label.color <- "black"
V(net)$color <- ifelse(V(net)$label %in% names(tfmut), "red","orange")
l <- layout_with_fr(net)
plot(net, layout=l, vertex.size=deg*0.5) #this is for reference, labels will be added in post-production#
sel <- V(net)$label[V(net)$label %in% names(tfmut)]

V(net)$label <- ""
l <- layout_with_fr(net)
plot(net, layout=l, vertex.size=deg*0.5) 

Backtracking the TPMmut into their regions and domains

z <- fread("regions.txt",sep="\t",header=T)
z <- subset(z, z$is.matrisome != "rest.of.genome")
z <- subset(z, z$`region I` %in% c("inside","iside"))
z <- z[,c(1:18)]
sel <- unique(c(ppi.mat$`Official Symbol Interactor A`,ppi.mat$`Official Symbol Interactor B`))
z2 <- subset(z, z$gene %in% sel)
z2 <- as.data.frame(z2)

res <- list()
for(i in unique(z2$gene)){
  k <- subset(ppi.mat, ppi.mat$`Official Symbol Interactor A` == i | ppi.mat$`Official Symbol Interactor B` == i)
  k <- unique(c(k[,1],k[,2]))
  k <- paste(k, collapse = ",")
  df <- data.frame(gene=i, all.interacting.genes=k)
  res[[i]] <- df
}
res <- bind_rows(res)

m <- merge(z2,res,by="gene")
m$V11 <- NULL
m$V12 <- NULL
m$V17 <- NULL
m$V18 <- NULL

length(unique(m$gene))
fwrite(m, "FINAL MAPPING TAB.txt", sep="\t", row.names = F, quote = F)

net <- simplify(graph_from_data_frame(ppi.mat), remove.multiple = F, remove.loops = T)

V(net)$size <- 8
V(net)$frame.color <- "white"
E(net)$arrow.mode <- 0
V(net)$color <- "orange"
V(net)$color <- ifelse(names(V(net)) %in% m$gene, "red","orange")
V(net)$label <- ""
#n <- c("FN1","COL1A1","MMP2","ADAM10")
#V(net)$label <- ifelse(names(V(net)) %in% n,n,"") #to see labels of most-interesting hubs

deg <- degree(net, mode = "all")

l <- layout_with_fr(net)
plot(net, layout=l, vertex.size=deg*0.5) 

EXTRA IDEAS (from further thinking of, and discussing, the findings):

What is the % of different amino acids impacted by PTMmut?

tab <- fread("TABLE 1.txt",header=T,sep="\t")
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$Amino_Acid_Change <- gsub("p.","",allmutations$Amino_Acid_Change)
allmutations$Amino_Acid_Change <- substring(allmutations$Amino_Acid_Change, 1, 1)
m <- merge(tab, allmutations, by.x=c("sample","gene","chr","start","end"), by.y=c("sample","gene","chr","start","end"))
tab <- table(m$PTM, m$Amino_Acid_Change.y)
tab <- tab[,-1]
tab <- round((tab/rowSums(tab))*100,0)
tab <- as.data.frame.matrix(tab)
tab$PTM <- rownames(tab)
tab <- tab[,c(22,1:21)]
fwrite(tab,"percentage of mutated aminoacids_PTM.txt", sep="\t",row.names = F, quote = F)

Are there patients with concurring/coexisting mutations in potentially interacting proteins?

ppi <- as.data.frame(fread("BIOGRID-ORGANISM-Homo_sapiens-4.2.192.tab3.txt",header=T,sep="\t"))
ppi <- ppi[,c(8,9)]
tab <- as.data.frame(fread("TABLE 1.txt",header=T,sep="\t"))
tab <- subset(tab, tab$is.matrisome == "matrisome")

res <- list()
pb <- txtProgressBar(min = 0, max = length(unique(tab$sample)), style = 3)
stp <- 0
for(i in unique(tab$sample)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(tab, tab$sample %in% i)
  g <- unique(z$gene)
  s1 <- subset(ppi, ppi$`Official Symbol Interactor A` %in% g & ppi$`Official Symbol Interactor B` %in%  g)
  if(nrow(s1)==0){next}else{
    s1 <- subset(s1, s1$`Official Symbol Interactor A` != s1$`Official Symbol Interactor B`)
    if(nrow(s1)==0){next}else{
      s1$patient <- i
      res[[i]] <- s1
    }
  }
}
res <- bind_rows(res)
res <- unique(res)
length(unique(res$patient))
fwrite(res, "TABLE 2A.txt", sep="\t", row.names = F, quote = F)
res1 <- res

tab <- as.data.frame(fread("TABLE 1.txt",header=T,sep="\t"))
tab <- subset(tab, tab$is.matrisome != "matrisome")
res <- list()
pb <- txtProgressBar(min = 0, max = length(unique(tab$sample)), style = 3)
stp <- 0
for(i in unique(tab$sample)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(tab, tab$sample %in% i)
  g <- unique(z$gene)
  s1 <- subset(ppi, ppi$`Official Symbol Interactor A` %in% g & ppi$`Official Symbol Interactor B` %in%  g)
  if(nrow(s1)==0){next}else{
    s1 <- subset(s1, s1$`Official Symbol Interactor A` != s1$`Official Symbol Interactor B`)
    if(nrow(s1)==0){next}else{
      s1$patient <- i
      res[[i]] <- s1
    }
  }
}
res <- bind_rows(res)
res <- unique(res)
length(unique(res$patient))

tab <- as.data.frame(fread("TABLE 1.txt",header=T,sep="\t"))
tab <- subset(tab, tab$is.matrisome == "matrisome")
nrow(tab)
tab <- as.data.frame(fread("TABLE 1.txt",header=T,sep="\t"))
tab <- subset(tab, tab$is.matrisome != "matrisome")
nrow(tab)

m <- matrix(c(nrow(res1),nrow(res),1907,43962),ncol=2,nrow=2) 
chisq.test(m) #p-value < 2.2e-16

Are there mutations in motifs mediating cell/ECM adhesions (GPP, RGD, LDV, collagen motifs - GFPGER, GLPGER, the latter two taken from doi: 10.1074/jbc.M806865200)? First, let’s check PTMmut. Results show that this kind of event is extremely infrequent, so much so as to ask whether other kind of mutations are similarly excluded.

tab <- fread("TABLE 1.txt",header=T,sep="\t")
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$Amino_Acid_Change <- gsub("p.","",allmutations$Amino_Acid_Change)
allmutations$Amino_Acid_Change <- substring(allmutations$Amino_Acid_Change, 1, 1)
m <- merge(tab, allmutations, by.x=c("sample","gene","chr","start","end"), by.y=c("sample","gene","chr","start","end"))
m <- subset(m, m$is.matrisome == "matrisome")
sm <- m

library(EnsDb.Hsapiens.v86)  
library(ensembldb)
library(Biostrings)
library(bit64)

### GPP filter ###

#first, filter all proteins that have no GPP at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GPP",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GPP",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gpp <- res
gpp$motif <- "GPP"

### GVD filter ###

#first, filter all proteins that have no GVD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GVD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GVD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gvd <- res
gvd$motif <- "GVD"
     
### RGD filter ###

#first, filter all proteins that have no RGD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("RGD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("RGD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
rgd <- res
#rgd$motif <- "RGD" #no RGD!

### LDV filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("LDV",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("LDV",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
ldv <- res
#ldv$motif <- "LDV" #no LDV

### GFPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GFPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GFPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gfpger <- res
#gfpger$motif <- "GFPGER" #no GFPGER

### GLPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GLPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GLPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
glpger <- res
#glpger$motif <- "GFPGER" #no GLPGER

### Bind results together ###

res <- bind_rows(gpp, gvd)
res$id <- paste0(res$gene,"_",res$position)
sm$id <- paste0(sm$gene,"_",sm$Amino_Acid_Change.x)
sm <- subset(sm, select = c("gene","PTM","id"))
res <- merge(sm,res,by="id")
res$id <- NULL
res$gene.y <- NULL
names(res)[1] <- "gene"
names(res)[6] <- "is.GVD.plus.minus.3"
fwrite(res, "MOTIF MAPPING OF PTMmut.txt",sep="\t",row.names = F, quote = F)

Check for non-PTMmut.

tab <- as.data.frame(fread("all mutations.txt",header=T,sep="\t"))
tab2 <- as.data.frame(fread("all PTM mutations.txt",header=T,sep="\t"))
tab$id <- paste0(tab$sample,"_",tab$gene,"_",tab$Amino_Acid_Change,"_",tab$`cancer type abbreviation`)
tab2$id <- paste0(tab2$sample,"_",tab2$gene,"_",tab2$Amino_Acid_Change,"_",tab2$`cancer type abbreviation`)
tab <- subset(tab, !(tab$id %in% tab2$id))
tab <- subset(tab, tab$is.matrisome == "matrisome")
tab <- na.omit(tab)
ntab <- fread("TABLE 1.txt",header=T,sep="\t")
ntab <- subset(ntab, ntab$is.matrisome == "matrisome")
ntab <- subset(ntab, select=c("gene","Uniprot_ID"))
m <- distinct(merge(tab,ntab,by="gene"))
m <- m %>% mutate_if(bit64::is.integer64, as.integer)

sm <- m

### GPP filter ###

#first, filter all proteins that have no GPP at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GPP",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GPP",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gpp <- res
gpp$motif <- "GPP"

### GVD filter ###

#first, filter all proteins that have no GVD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GVD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GVD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gvd <- res
#gvd$motif <- "GVD" #no GVD!

### RGD filter ###

#first, filter all proteins that have no RGD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("RGD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("RGD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
rgd <- res
#rgd$motif <- "RGD" #no RGD

### LDV filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("LDV",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("LDV",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
ldv <- res
ldv$motif <- "LDV"

### GFPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GFPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GFPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gfpger <- res
#gfpger$motif <- "GFPGER" #no GFPGER!

### GLPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GLPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GLPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
glpger <- res
#glpger$motif <- "GLPGER" #no GLPGER!



tab <- fread("TABLE 1.txt",header=T,sep="\t")
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$Amino_Acid_Change <- gsub("p.","",allmutations$Amino_Acid_Change)
allmutations$Amino_Acid_Change <- substring(allmutations$Amino_Acid_Change, 1, 1)
m <- merge(tab, allmutations, by.x=c("sample","gene","chr","start","end"), by.y=c("sample","gene","chr","start","end"))
m <- subset(m, m$is.matrisome == "matrisome")
sm <- m

library(EnsDb.Hsapiens.v86)  
library(ensembldb)
library(Biostrings)
library(bit64)

### GPP filter ###

#first, filter all proteins that have no GPP at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GPP",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GPP",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gpp <- res
gpp$motif <- "GPP"

### GVD filter ###

#first, filter all proteins that have no GVD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GVD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GVD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gvd <- res
gvd$motif <- "GVD"
     
### RGD filter ###

#first, filter all proteins that have no RGD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("RGD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("RGD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
rgd <- res
#rgd$motif <- "RGD" #no RGD!

### LDV filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("LDV",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("LDV",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
ldv <- res
#ldv$motif <- "LDV" #no LDV

### GFPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GFPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GFPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gfpger <- res
#gfpger$motif <- "GFPGER" #no LDV

### GLPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GLPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GLPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
glpger <- res
#glpger$motif <- "GFPGER" #no LDV

### Bind results together ###

res <- bind_rows(gpp, ldv)
res$id <- paste0(res$gene,"_",res$position)
sm$id <- paste0(sm$gene,"_",sm$Amino_Acid_Change)
sm <- subset(sm, select = c("gene","id"))
res <- merge(sm,res,by="id")
res$id <- NULL
res$gene.y <- NULL
names(res)[1] <- "gene"
names(res)[5] <- "is.GVD.plus.minus.3"
fwrite(res, "MOTIF MAPPING OF non-PTMmut.txt",sep="\t",row.names = F, quote = F)

Actually, they are both unlikely but PTMmut seem better tolerated in toto! Then, are there sequence features that can tell us something about the mapping mutations?

tab <- fread("TABLE 1.txt",header=T,sep="\t")
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$Amino_Acid_Change <- gsub("p.","",allmutations$Amino_Acid_Change)
allmutations$Amino_Acid_Change <- substring(allmutations$Amino_Acid_Change, 1, 1)
m <- merge(tab, allmutations, by.x=c("sample","gene","chr","start","end"), by.y=c("sample","gene","chr","start","end"))
m <- subset(m, m$is.matrisome == "matrisome")

pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)
    aa <- aa[aa>11]

    for(k in aa){
     min.aa <- k-10
     max.aa <- k+10
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(v))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res$id <- paste0(res$gene,"_",res$position)
m$id <- paste0(sm$gene,"_",sm$Amino_Acid_Change.x)
m <- subset(m, select = c("gene","PTM","id"))
res <- merge(m,res,by="id")
res$id <- NULL
res$gene.y <- NULL
names(res)[1] <- "gene"
names(res)[6] <- "sequence.context.10aa"
ptm <- res

tab <- as.data.frame(fread("all mutations.txt",header=T,sep="\t"))
tab2 <- as.data.frame(fread("all PTM mutations.txt",header=T,sep="\t"))
tab$id <- paste0(tab$sample,"_",tab$gene,"_",tab$Amino_Acid_Change,"_",tab$`cancer type abbreviation`)
tab2$id <- paste0(tab2$sample,"_",tab2$gene,"_",tab2$Amino_Acid_Change,"_",tab2$`cancer type abbreviation`)
tab <- subset(tab, !(tab$id %in% tab2$id))
tab <- subset(tab, tab$is.matrisome == "matrisome")
tab <- na.omit(tab)
ntab <- fread("TABLE 1.txt",header=T,sep="\t")
ntab <- subset(ntab, ntab$is.matrisome == "matrisome")
ntab <- subset(ntab, select=c("gene","Uniprot_ID"))
m <- distinct(merge(tab,ntab,by="gene"))
m <- m %>% mutate_if(bit64::is.integer64, as.integer)
m <- subset(m, m$is.matrisome == "matrisome")

pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa>11]

    for(k in aa){
     min.aa <- k-10
     max.aa <- k+10
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(v))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res$id <- paste0(res$gene,"_",res$position)
m$id <- paste0(m$gene,"_",m$Amino_Acid_Change)
m <- subset(m, select = c("gene","id"))
res <- merge(m,res,by="id")
res$id <- NULL
res$gene.y <- NULL
names(res)[1] <- "gene"
names(res)[5] <- "sequence.context.10aa"
noptm <- res

l <- list()
l2 <- list()
for(i in unique(ptm$PTM)){
  z <- subset(ptm, ptm$PTM == i)
  g <- z$gene
  l[[i]] <- z$sequence.context.10aa
  z <- subset(noptm, noptm$gene %in% g)
  l2[[i]] <- z$sequence.context.10aa
}
names(l) <- unique(ptm$PTM)
names(l2) <- unique(ptm$PTM) #these are, of course, fictional!

ggseqlogo(l, ncol = 4)
ggseqlogo(l2, ncol = 4)
---
title: "Disruptive PTM mutations in the tumor matrisome"
self_contained: yes
output:
  html_notebook: default
  pdf_document: default
  html_document:
    df_print: paged
---

STEP 0: Import all data and fix formats.

```{r, eval=F}

library(dplyr)
library(data.table)

setwd() # fix this according to your system #

allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$effect <- gsub("3UTR.*", "3UTR", allmutations$effect) 
allmutations$SIFT <- gsub("\\(.*", "", allmutations$SIFT)
allmutations$PolyPhen <- gsub("\\(.*", "", allmutations$PolyPhen)
allmutations$Amino_Acid_Change <- gsub("[^0-9]", "", allmutations$Amino_Acid_Change) 
allmutations$Amino_Acid_Change <- as.numeric(allmutations$Amino_Acid_Change)

allmutations$effect <- gsub("Frame_Shift_Del", "Frameshift", allmutations$effect) # simple recode effect #
allmutations$effect <- gsub("Frame_Shift_Ins", "Frameshift", allmutations$effect)
allmutations$effect <- gsub("In_Frame_Del", "Deletion", allmutations$effect)
allmutations$effect <- gsub("In_Frame_Ins", "Insertion", allmutations$effect)
allmutations$effect <- gsub("Flank", "Others", allmutations$effect)
allmutations$effect <- gsub("Intron","Others", allmutations$effect)
allmutations$effect <- gsub("larged","D", allmutations$effect)
allmutations$effect <- gsub("_Mutation","", allmutations$effect)
allmutations$effect <- gsub("RNA", "Others", allmutations$effect)
allmutations$effect <- gsub("_S", " s", allmutations$effect)
allmutations$effect <- gsub("Translation start site", "Others", allmutations$effect)
allmutations$effect <- gsub("UTR", "Others", allmutations$effect)
allmutations$effect <- gsub("[^A-z]", "", allmutations$effect)

allmutations$PolyPhen <- gsub("^$","unknown",allmutations$PolyPhen) # recoding polyphen & SIFT #
allmutations$SIFT <- gsub("^$","unknown",allmutations$SIFT)

conv <- fread("uniprot_matrisome.txt", header=F, sep="\t") # load conversion from genes to protein names #
conv <- conv[,c(1:2)]
colnames(conv) <- c("gene","Uniprot_ID")

mat <- fread("matrisome_hs.txt", header = T, sep="\t") 
conv <- merge(conv, mat, by.x="gene", by.y="Gene Symbol")
conv <- as.data.frame(conv)

#Adding a column that states whether a gene is matrisome or not #
allmutations$is.matrisome <- ifelse(allmutations$gene %in% mat$`Gene Symbol`, "matrisome", "rest.of.genome")
cl <- fread("TCGAclindata.txt", header=T,sep="\t")
cl <- cl[,c(1,3)]
cl <- unique(merge(cl, allmutations, by="sample"))
cl <- subset(cl, cl$effect != "Silent")

```

STEP 1: A general overview of the data included in the study so far. We have a total of 2277979 non-silent entries from 9075 patients and 32 tumor types. Of these, only ~ 6.6% occur in the matrisome, but considering that the matrisome is 1027 genes in total (vs. 20228 other genes) the frequency of mutation "per gene" is ~ 147 in the matrisome vs. ~105 in the rest of the genome, in line with our previous report [https://pubmed.ncbi.nlm.nih.gov/32722287/]. Regardless, there is no difference between the % of the different types of mutation in matrisome vs. rest of the genome, again as reported. 

```{r, eval=F}

length(unique(cl$sample))
length(unique(cl$`cancer type abbreviation`))
allmutations <- cl

tab1 <- table(allmutations$is.matrisome)
tab1 <- (tab1/2277979)*100
length(unique(mat$`Gene Symbol`))
sb <- subset(allmutations, allmutations$is.matrisome != "matrisome")
length(unique(sb$gene))
tab1 <- table(allmutations$is.matrisome)
pie(tab1, col = c("#fdae61","#4575b4"))
tab1[1] <- tab1[1]/1027
tab1[2] <- tab1[2]/20259
barplot(tab1, col = c("#fdae61","#4575b4"), ylim = c(0,200))
tab1 <- table(allmutations$is.matrisome, allmutations$effect) 
tab2 <- (tab1/rowSums(tab1))*100
chisq.test(tab2) #not significant#

library(ggplot2)
vec <- c("#fdae61","#4575b4", "#313695", "#92c5de", "#d73027","#fee090", "#e0f3f8","#f46d43", "#abd9e9","#a50026") # vector for colours #
df <- as.data.frame(tab2)
colnames(df) <- c("type", "mutation", "frequency (%)")
df$type <- gsub("rest.of.genome", "nonmatrisome", df$type)
ggplot(df, aes(x=mutation, y=`frequency (%)`, fill=type)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity") + 
     theme_classic() + 
     scale_fill_manual(values = vec) + 
     coord_flip()

tab1 <- table(allmutations$is.matrisome, allmutations$SIFT)
tab2 <- (tab1/rowSums(tab1))*100
chisq.test(tab2) #not significant#
df <- as.data.frame(tab2)
colnames(df) <- c("type", "mutation", "frequency (%)")
df$type <- gsub("rest.of.genome", "nonmatrisome", df$type)
ggplot(df, aes(x=mutation, y=`frequency (%)`, fill=type)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity") + 
     theme_classic() + 
     scale_fill_manual(values = vec) + 
     coord_flip()

tab1 <- table(allmutations$is.matrisome, allmutations$PolyPhen)
tab2 <- (tab1/rowSums(tab1))*100
chisq.test(tab2) #not significant#
df <- as.data.frame(tab2)
colnames(df) <- c("type", "mutation", "frequency (%)")
df$type <- gsub("rest.of.genome", "nonmatrisome", df$type)
ggplot(df, aes(x=mutation, y=`frequency (%)`, fill=type)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity") + 
     theme_classic() + 
     scale_fill_manual(values = vec) + 
     coord_flip()

```

STEP 2: Intersect mutation data with PTM sites data and identify mutations which potentially disrupt PTMs in tumors. In the first chunk, all PTM sites are loaded and integrated with mutation sites.

```{r, eval=F}

# PTMs - phosphorylation #

library(R.utils)
# gunzip("Phosphorylation_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

phosdata <- fread("Phosphorylation_site_dataset.txt", header=T, sep="\t") # load PTM #
phosdata$PTM <- c("phosphorylation")
phosdata$MOD_RSD <- gsub("[^0-9]", "", phosdata$MOD_RSD)
phosdata$MOD_RSD <- as.numeric(phosdata$MOD_RSD)

# PTMs - acetylation #

# gunzip("Acetylation_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

acdata <- fread("Acetylation_site_dataset.txt", header=T, sep="\t") # load PTM #
acdata$PTM <- c("acetylation")
acdata$MOD_RSD <- gsub("[^0-9]", "", acdata$MOD_RSD)
acdata$MOD_RSD <- as.numeric(acdata$MOD_RSD)

# PTMs - methylation #

# gunzip("Methylation_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

metdata <- fread("Methylation_site_dataset.txt", header=T, sep="\t") # load PTM #
metdata$PTM <- c("methylation")
metdata$MOD_RSD <- gsub("[^0-9]", "", metdata$MOD_RSD)
metdata$MOD_RSD <- as.numeric(metdata$MOD_RSD)

# PTMs - O-glycosylation #

# gunzip("O-GalNAc_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #
# gunzip("O-GlcNAc_site_dataset.gz") # Same as on top #

Ogal <- fread("O-GalNAc_site_dataset.txt", header=T, sep="\t") # load PTM #
Ogal$MOD_RSD <- gsub("[^0-9]", "", Ogal$MOD_RSD)
Ogal$MOD_RSD <- as.numeric(Ogal$MOD_RSD)

Oglc <- fread("O-GlcNAc_site_dataset.txt", header=T, sep="\t")
Oglc$MOD_RSD <- gsub("[^0-9]", "", Oglc$MOD_RSD)
Oglc$MOD_RSD <- as.numeric(Oglc$MOD_RSD)

OG <- rbind(Ogal, Oglc)
OG$PTM <- c("O-glycosylation")

# PTMs - sumoylation #

# gunzip("Sumoylation_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

sumodata <- fread("Sumoylation_site_dataset.txt", header=T, sep="\t") # load PTM #
sumodata$PTM <- c("sumoylation")
sumodata$MOD_RSD <- gsub("[^0-9]", "", sumodata$MOD_RSD)
sumodata$MOD_RSD <- as.numeric(sumodata$MOD_RSD)

# PTMs - ubiquitylation #

# gunzip("Ubiquitination_site_dataset.gz") # After opening the file with editor as a txt file, the first three rows were deleted #

ubidata <- fread("Ubiquitination_site_dataset.txt", header=T, sep="\t") # load PTM #
ubidata$PTM <- c("ubiquitylation")
ubidata$MOD_RSD <- gsub("[^0-9]", "", ubidata$MOD_RSD)
ubidata$MOD_RSD <- as.numeric(ubidata$MOD_RSD)

# PTMs - N-glycosylation #

Nglyc <- fread("N-GlycositeAtlas_HumanAll.txt", sep="\t", header=T) # data from N-GlycositeAtlas, converted into txt file and deletion of first row #
Nglyc <- Nglyc[,2:5]
colnames(Nglyc) <- c("Uniprot_ID", "gene", "protein", "Amino_Acid_Change")
Nglyc$PTM <- c("N-glycosylation")
Nglyc$Amino_Acid_Change <- as.numeric(Nglyc$Amino_Acid_Change)

# PTMs - hydroxylation #

h <- fread("Hydroxylation sites_Uniprot.txt", sep="\t", header=T) # data from uniprot #
colnames(h) <- c("Uniprot_ID", "gene", "specification", "Amino_Acid_Change", "validation", "PTM")
h$specification <- NULL

## Combine all PTMs ##

allPTMs <- rbind(acdata, metdata) # combining all mutated PTM sites #
allPTMs <- rbind(allPTMs, OG)
allPTMs <- rbind(allPTMs, phosdata)
allPTMs <- rbind(allPTMs, sumodata)
allPTMs <- rbind(allPTMs, ubidata)
allPTMs <- allPTMs[, c(1:3, 5, 15)]
colnames(allPTMs) <- c("gene", "protein", "Uniprot_ID", "Amino_Acid_Change", "PTM")

allPTMs <- rbind(allPTMs, h, fill=T) 
allPTMs <- rbind(allPTMs, Nglyc, fill=T)
allPTMs$validation <- NULL

allPTMs <- as.data.frame(allPTMs)
allPTMs <- unique(allPTMs)
allPTMs$gene <- toupper(allPTMs$gene) # the file contains upper and lower case gene names, some wouldn't match with mutations otherwise #

hum <- fread("uniprot-proteome_UP000005640.tab",header=T,sep="\t") #need to cut to human proteins only, some of the IDs included are form other species!#
allPTMs <- subset(allPTMs, allPTMs$Uniprot_ID %in% hum$Entry)
allmutations_PTMs <- merge(allmutations, allPTMs, by.x=c("gene", "Amino_Acid_Change"), by.y=c("gene", "Amino_Acid_Change"))

fwrite(allmutations_PTMs, "All PTM mutations.txt", sep="\t", row.names = F, quote=F)
fwrite(allmutations, "All mutations.txt", sep="\t", row.names = F, quote=F)

```

In the second chunk, we start to analyze the breakdowns and genomic patterns of disruptive PTM mutations (PTMmut). In total, there are 42733 PTMmut (~1.87% of all mutations) from 6303 patients and 32 cancers, of which 1811 from the matrisome. The ratio of PTMmut/total mut is thus 1.2% for the matrisome and 1.9% for the rest of the genome. Note that the matrisome has, on average, lower PTMmut frequency per unit length as compared to the rest of the genome, oppositely to what observed for other mutations. 

```{r, eval=F}

#PTMmut vs non-PTMmut#
dim(allmutations_PTMs)
(nrow(allmutations_PTMs)/nrow(allmutations))*100
length(unique(allmutations_PTMs$sample))
length(unique(allmutations_PTMs$`cancer type abbreviation`))
nrow(subset(allmutations_PTMs, allmutations_PTMs$is.matrisome != "rest.of.genome"))

r1 <- (nrow(subset(allmutations_PTMs, allmutations_PTMs$is.matrisome != "rest.of.genome"))/nrow(subset(allmutations, allmutations$is.matrisome != "rest.of.genome")))*100
r2 <- (nrow(subset(allmutations_PTMs, allmutations_PTMs$is.matrisome == "rest.of.genome"))/nrow(subset(allmutations, allmutations$is.matrisome == "rest.of.genome")))*100
r1
r2

tab1 <- c((nrow(allmutations_PTMs)/nrow(allmutations))*100,100-((nrow(allmutations_PTMs)/nrow(allmutations))*100))
names(tab1) <- c("PTMmut", "other mutations")
pie(tab1, col = c("red","#4575b4"))

#N of PTMmut by gene length#
library(goseq)
g <- unique(allmutations_PTMs$gene)
l<- getlength(g,'hg19','geneSymbol')
df <- data.frame(g=g,l=l)
tab1 <- as.data.frame(table(allmutations_PTMs$gene))
df <- merge(df,tab1,by.x="g",by.y="Var1")
df$r <- df$Freq/df$l
df$source <- ifelse(df$g %in% mat$`Gene Symbol`, "matrisome", "rest.of.genome")
df <- na.omit(df)
tab1 <- as.data.frame(table(allmutations$gene))
df <- merge(df,tab1,by.x="g",by.y="Var1")
df$Freq.y <- df$Freq.y-df$Freq.x
df$r2 <- df$Freq.y/df$l
df <- na.omit(df)
v <- aggregate(df$r,list(df$source),mean)
v2 <- aggregate(df$r2,list(df$source),mean) #all non-PTM-muts#
v <- merge(v,v2,by="Group.1")

df <- data.frame(g=g,l=l)
tab1 <- as.data.frame(table(allmutations$gene, allmutations$`cancer type abbreviation`))
df <- merge(df,tab1,by.x="g",by.y="Var1")
df$r <- df$Freq/df$l
df <- na.omit(df)
v <- aggregate(df$r,list(df$Var2),mean)
v$ord <- c(1:nrow(v))
v <- v[order(-v$ord),]
v$ord <- c(1:nrow(v))

ggplot(v, aes(x=ord, y=x)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity", fill="#4575b4") + 
     theme_classic() +  
     coord_flip() +
     scale_x_continuous(breaks = c(1:32),labels = as.character(v$Group.1)) +
     xlab("") + ylab("mean frequency of mutations per gene length")

df <- data.frame(g=g,l=l)
tab1 <- as.data.frame(table(allmutations_PTMs$gene, allmutations_PTMs$`cancer type abbreviation`))
df <- merge(df,tab1,by.x="g",by.y="Var1")
df$r <- df$Freq/df$l
df <- na.omit(df)
v <- aggregate(df$r,list(df$Var2),mean)
v$ord <- c(1:nrow(v))
v <- v[order(-v$ord),]
v$ord <- c(1:nrow(v))

ggplot(v, aes(x=ord, y=x)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity", fill="#fdae61") + 
     theme_classic() +  
     coord_flip() +
     scale_fill_manual(values="#fdae61") +
     scale_x_continuous(breaks = c(1:32),labels = as.character(v$Group.1)) +
     xlab("") + ylab("mean frequency of PTMmut per gene length")

```

Transitions and transversions are also equal in absolute, in line with previous reports. In comparison, also, transitions and transversions seem to have similar mutational effect, the only noticeable difference being the reduced amount of transversions among splice-site mutations. 

```{r, eval=F}

m <- subset(allmutations_PTMs, !nchar(allmutations_PTMs$reference) > 1)
m <- subset(m, !nchar(m$alt) > 1)
m <- subset(m, !m$reference == "-")
m <- subset(m, !m$alt == "-")
m$mut.type <- paste0(m$reference,m$alt)
m$mut.type <- ifelse(m$mut.type == "AC", "transversions", ifelse(m$mut.type == "CA", "transversions", ifelse(m$mut.type == "GT", "transversions", ifelse(m$mut.type == "TG", "transversions", "transitions"))))
chisq.test(table(m$is.matrisome,m$mut.type)) #P-value = 0.5431
tab1 <- as.data.frame(table(m$is.matrisome,m$mut.type))

library(ggsci)
ggplot(tab1, aes(x=Var1, y=Freq, fill=Var2)) +
     geom_bar(position="fill" , stat="identity") + 
     scale_fill_aaas() + 
     theme_classic() + xlab("") 

m <- subset(allmutations, !nchar(allmutations$reference) > 1)
m <- subset(m, !nchar(m$alt) > 1)
m <- subset(m, !m$reference == "-")
m <- subset(m, !m$alt == "-")
m$mut.type <- paste0(m$reference,m$alt)
m$mut.type <- ifelse(m$mut.type == "AC", "transversions", ifelse(m$mut.type == "CA", "transversions", ifelse(m$mut.type == "GT", "transversions", ifelse(m$mut.type == "TG", "transversions", "transitions"))))
chisq.test(table(m$is.matrisome,m$mut.type)) #P-value = 0.01251, but if using %'s [tb <- table(m$is.matrisome,m$mut.type) ; tb <- (tb/rowSums(tb))*100] then there clearly are no differences!
tab1 <- as.data.frame(table(m$is.matrisome,m$mut.type))

ggplot(tab1, aes(x=Var1, y=Freq, fill=Var2)) +
     geom_bar(position="fill" , stat="identity") + 
     scale_fill_aaas() + 
     theme_classic() + xlab("")

m <- subset(allmutations_PTMs, !nchar(allmutations_PTMs$reference) > 1)
m <- subset(m, !nchar(m$alt) > 1)
m <- subset(m, !m$reference == "-")
m <- subset(m, !m$alt == "-")
m$mut.type <- paste0(m$reference,m$alt)
m$mut.type <- ifelse(m$mut.type == "AC", "transversions", ifelse(m$mut.type == "CA", "transversions", ifelse(m$mut.type == "GT", "transversions", ifelse(m$mut.type == "TG", "transversions", "transitions"))))

m.1 <- subset(m, m$is.matrisome == "matrisome")
chisq.test(table(m.1$mut.type,m.1$effect)) #P-value = 4.708e-06
tab1 <- as.data.frame(table(m.1$mut.type,m.1$effect))
m.2 <- subset(m, m$is.matrisome != "matrisome")
chisq.test(table(m.2$mut.type,m.2$effect)) #P-value = 2.2e-16
tab2 <- as.data.frame(table(m.2$mut.type,m.2$effect))
tab1$source <- "matrisome"
tab2$source <- "rest.of.genome"
tab <- bind_rows(tab1,tab2)
tab$source <- paste0(tab$source,"_",tab$Var1)
tab <- subset(tab, tab$Var2 != "Others") #there is jut one "Others" mutation here, to be removed to make columns compatible later#
res <- list()
for(i in unique(tab$Var2)){
  z <- subset(tab, tab$Var2 == i)
  df <- as.data.frame(z$Freq)
  names(df) <- unique(z$Var2)
  res[[i]] <- df
}
res <- bind_cols(res)
rownames(res) <- unique(tab$source)

ggplot(tab, aes(x=source, y=Freq, fill=Var2)) +
     geom_bar(position="fill" , stat="identity") + 
     scale_fill_aaas() + 
     theme_classic() + theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +   xlab("")

m.1 <- merge(mat,m.1,by.x="Gene Symbol",by.y="gene")
m.1a <- subset(m.1, m.1$mut.type == "transitions")
m.1b <- subset(m.1, m.1$mut.type == "transversions")
t1 <- as.data.frame(table(m.1a$Category))
t1$type="transitions"
t2 <- as.data.frame(table(m.1b$Category))
t2$type="transversions"

tab1 <- rbind(t1,t2) #P-value : 0.4122#

ggplot(tab1, aes(x=type, y=Freq, fill=Var1)) +
     geom_bar(position="fill" , stat="identity") + 
     scale_fill_d3() + 
     theme_classic() + theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

```

At a glance, comparison of disruptive PTMmut in matrisome vs rest of the genome shows accumulation of events potentially affecting hydroxylation, N- and O-glycosylation in the matrisome, which is conversely devoid of methylation, sumoylation and ubiquitylation event. Acetylation and phoshporylation events seem comparable. However, the baseline frequency of PTM events in the genome is different, so we need to account for such differences before comparing. When we do, we get a measure we call "burden" (actually, the global burden) and we notice that the matrisome accumulates more PTMmut than the rest of the genome in almost all categories, with the most evident differences in hydroxylation, O-glycosylation and acetylation. 

```{r, eval=F}

#overall PTMs and PTMmut in matrisome vs. non-matrisome#
allPTMs$source <- ifelse(allPTMs$gene %in% mat$`Gene Symbol`, "matrisome","rest.of.genome")
tab1 <- table(allPTMs$source,allPTMs$PTM)
tab2 <- (tab1/rowSums(tab1))*100 
chisq.test(tab2) # p-value = 0.000149 -> significant! #
df <- as.data.frame(tab2)
colnames(df) <- c("type", "PTM", "frequency (%)")

ggplot(df, aes(x=type, y=`frequency (%)`, fill=PTM)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_d3() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

tab1 <- table(allmutations_PTMs$is.matrisome,allmutations_PTMs$PTM)
tab2 <- (tab1/rowSums(tab1))*100 
chisq.test(tab2) # p-value = 0.0002194 -> significant! #
df <- as.data.frame(tab2)
colnames(df) <- c("type", "PTM", "frequency (%)")

ggplot(df, aes(x=type, y=`frequency (%)`, fill=PTM)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_d3() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

#normalization of N of PTMmut by N Ptm sites (burden), matrisome vs. non-matrisome#
res <- list()
for(i in unique(allmutations_PTMs$`cancer type abbreviation`)){
  print(paste0("analyzing cancer: ",i))
  z <- subset(allmutations_PTMs, allmutations_PTMs$`cancer type abbreviation` == i)
  z <- unique(z[,c(1,2,17)])
  z <- as.data.frame(table(z$gene,z$PTM))
  z2 <- subset(allPTMs, allPTMs$gene %in% z$Var1)
  z2 <- as.data.frame(table(z2$gene,z2$PTM))
  m <- merge(z,z2,by=c("Var1","Var2"),all.y=T)
  m$tot <- rowSums(m[,c(3,4)])
  #m <- subset(m,m$tot > 0)
  m$burden <- round((m$Freq.x/m$tot)*100,2)
  m$burden[is.na(m$burden)] <- 0
  m$cancer <- i
  res[[i]] <- m
}
res <- bind_rows(res)
res$source <- ifelse(res$Var1 %in% mat$`Gene Symbol`, "matrisome", "rest of genome")
v <- aggregate(res$burden,list(res$Var2, res$source),mean)
v2 <- data.frame(Group.1="hydroxylation",Group.2="rest of genome",x=0)
v <- bind_rows(v,v2)

ggplot(v, aes(x=Group.1, y=x, fill=Group.2)) + # side by side barplot #
     geom_bar(position=position_dodge() , stat="identity") + 
     theme_classic() + 
     scale_fill_manual(values = vec) + 
     theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) +      xlab("") + ylab("")

res2 <- list()
for(i in unique(res$Var2)){
  z <- subset(res, res$Var2==i)
  x <- subset(z,z$source == "matrisome")
  y <- subset(z, z$source != "matrisome")
  tt <- t.test(x$burden,y$burden)
  p <- paste0("p value for ",i," : ",format.pval(tt$p.value))
  res2[[i]] <- p
} #hydroxylation cannot be counted, we proxy at p-value < 2.22e-16#

v2 <- list()
for(i in unique(v$Group.2)){
  z <- subset(v, v$Group.2 == i)
  df <- data.frame(source=i, t(z$x))
  names(df)[2:ncol(df)] <- z$Group.1
  v2[[i]] <- df
}
v2 <- bind_rows(v2)
v2 <- v2[,c(1:9)]
rownames(v2) <- v2[,1]
v2[,1] <- NULL
v2[is.na(v2)] <- 0.00000000001
v2 <- v2[1,]/v2[2,]
v2

```

Finally, we recalculate the burdens (exact by mutation), merge with the PTMmut table and export it.

```{r, eval=F}

res <- list()
for(i in unique(allmutations_PTMs$`cancer type abbreviation`)){
  print(paste0("analyzing cancer: ",i))
  z <- subset(allmutations_PTMs, allmutations_PTMs$`cancer type abbreviation` == i)
  z <- unique(z[,c(1,2,17)])
  g <- z$gene
  z$gene <- paste0(z$gene,"_",z$Amino_Acid_Change)
  z <- as.data.frame(table(z$gene,z$PTM))
  z2 <- subset(allPTMs, allPTMs$gene %in% g)
  z2 <- as.data.frame(table(z2$gene,z2$PTM))
  z$nVar1 <- gsub("\\_.*","",z$Var1)
  names(z)[1] <- "id"
  names(z)[4] <- "Var1"
  m <- merge(z,z2,by=c("Var1","Var2"),all.y=T)
  m[is.na(m)]<- 0
  m$tot <- rowSums(m[,c(4,5)])
  m <- subset(m,m$tot > 0)
  m$burden <- round((m$Freq.x/m$tot)*100,2)
  m$cancer <- i
  m <- subset(m,m$burden > 0)
  res[[i]] <- m
}
res <- bind_rows(res)
allmutations_PTMs$id <- paste0(allmutations_PTMs$gene,"_",allmutations_PTMs$Amino_Acid_Change)

fin <- merge(allmutations_PTMs,res,by.x=c("cancer type abbreviation","id"),by.y=c("cancer","id"))
fin <- as.data.frame(fin)
fin <- fin[,c(1,4,3,4:18,24)]
fin[,4] <- NULL
fin <- fin[,c(4,1,3,2,5:ncol(fin))]
fwrite(fin,"TABLE 1.txt",sep="\t",row.names = F,quote = F)

```

Before moving on, we calculate the correlation between the burden and the N of mutations. Expectedly, these two measures are generally opposite and breaking down by tumor type and matrisome category confirms that all significant correlations found are negative.

```{r, eval=F}

fin2 <- merge(fin,mat,by.x="gene",by.y="Gene Symbol")
fin2 <- unique(fin2)
res <- list()
res2 <- list()
for(i in unique(fin2$`cancer type abbreviation`)){
  print(paste0("analyzing tumor: ",i))
  z <- subset(fin2, fin2$`cancer type abbreviation` == i)
  z <- na.omit(z)
  for(w in unique(z$Category)){
    print(paste0("*** now analyzing matrisome category: ",w))
    df <- subset(z, z$Category == w)
    x <- df[, c(1,18)]
    y <- as.data.frame(table(df$gene))
    m <- na.omit(m)
    m <- merge(x,y,by.x="gene",by.y="Var1")
    if(nrow(m)>=4){
        ct <- cor.test(m$burden,m$Freq)
        m$tumor <- i
        m$category <- w
        m$corr <- ct$estimate
        m$pval <- ct$p.value}else{
          m$tumor <- i
          m$category <- w
          m$corr <- 0
          m$pval <- 1
        }
    ct <- NULL
    res[[w]] <- m
  }
  res2[[i]] <- bind_rows(res)
}
res2 <- bind_rows(res2)
res2$eval <- ifelse(res2$pval < 0.05, "significant", "NS")
res2$dir <- ifelse(res2$corr > 0 , "positive", "negative")

for(i in unique(res2$category)){
    z <- subset(res2, res2$category == i)
    lm <- lm(z$burden~z$Freq)
    s <- summary(lm)
    df <- data.frame(category=i, lr=s$coefficients[2,1], pval=format.pval(s$coefficients[2,4]))
    print(df)
}

res3 <- unique(res2[,c(4:9)])
table(res3$category,res3$eval,res3$dir)

ggplot(res2, aes(x=burden, y=Freq, color=factor(category))) +
  geom_point() + 
  scale_color_d3() + 
  theme_minimal() + 
  geom_smooth(method = "lm")

tab <- as.data.frame(table(res3$category,res3$eval,res3$dir))
df <- aggregate(tab$Freq,by=list(tab$Var3, tab$Var2),sum)
pie(c(df[1,3],df[3,3]),col = c("blue","red"))
pie(c(df[2,3],df[4,3]),col = c("blue","red"))

```

STEP 3: PTMmut in the tumor matrisome at the gene level. Here we focus on the matrisome, comparing frequency of mutations between different matrisome categories across cancers etc. Aggregating data by matrisome category shows a clear redistribution of PTMmut types, so that, e.g., hydroxylation is 52% of all PTMmut in collagens (and hydroxy-PTMmut in collagens are 74% of all PTMmut for that PTM) while phoshorylation is 72% of all PTMmut in secreted factors and 21.4% of all phoshpo-PTMmut.

```{r, eval=F}

fin2 <- unique(merge(fin,mat,by.x="gene",by.y="Gene Symbol"))
m <- subset(fin2, fin2$is.matrisome == "matrisome")
chisq.test(table(m$Division, m$PTM)) #p-value < 2.2e-16 significant!#
df <- as.data.frame(table(m$Division, m$PTM))

vec <- c("darkkhaki","cadetblue3","skyblue4","peachpuff3","navajowhite4","sandybrown","cadetblue4","lightblue3")

ggplot(df, aes(x=Var1, y=Freq, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_manual(values = vec) + 
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

chisq.test(table(m$Category, m$PTM)) #p-value < 2.2e-16 significant!#
df <- as.data.frame(table(m$Category, m$PTM))

ggplot(df, aes(x=Var1, y=Freq, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_manual(values = vec) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("")

#PTMmut, % of tot by matrisome category
tab <- table(fin2$Category,fin2$PTM)
tab1 <- round((tab/rowSums(tab))*100,1)
tab1

#PTMmut, % of tot by total in that type of PTM
v <- apply(tab,2,sum)
tab2 <- apply(tab,2,function(x){round((x/sum(x)*100),1)})

```

The trends seem very similar pan-cancer, with SKCM, STAD and UCEC clearly leading the way. There seem to be no tissue-of-origin effects, though various local differences exist.

```{r, eval=F}

tab <- as.data.frame(table(fin2$`cancer type abbreviation`,fin2$Category,fin2$PTM))

ggplot(tab, aes(x=Var1, y=Var3, size=Freq, fill=Var3)) + 
geom_point(alpha=1, shape=21, color="black") + 
theme_classic() + 
scale_size_area(max_size = 8, name="freq") +
scale_fill_manual(values=vec) +
theme(axis.text.x = element_text(angle=90, size=10, vjust=0.4), axis.text.y = element_text(size=8)) + 
xlab("") + 
ylab("") +
facet_wrap(tab$Var2)

res <- list()
res2 <- list()
for(i in unique(tab$Var2)){
  z <- subset(tab, tab$Var2 == i)
  z$Var2 <- NULL
  for(w in unique(z$Var3)){
    k <- subset(z, z$Var3==w)
    df <- data.frame(matrisome=i,PTM=w,t(k$Freq))
    names(df)[3:ncol(df)] <- as.character(k$Var1)
    res[[w]]<- df
  }
  res2[[i]] <- bind_rows(res)
}
res2 <- bind_rows(res2)
fwrite(res2,"TABLE 2.txt", sep="\t", row.names = F, quote = F)

ndf <- list()
for(i in unique(res2$matrisome)){
  z <- subset(res2, res2$matrisome == i)
  print(paste0("now graphing data for: ",i))
  rownames(z) <- z$PTM
  z <- z[,c(3:ncol(z))]
  for(w in 1:nrow(z)){
    k <- z[w,]
    k <- as.data.frame(t(k))
    k$tumor <- rownames(k)
    ndf[[w]] <- k
  }
  f <- Reduce(merge,ndf)
  f <- reshape2::melt(f)
  
  p <- ggplot(data=f, aes(x=tumor, y=value, fill=variable)) +
  geom_bar(stat="identity") + 
  scale_fill_manual(values=vec) + 
  theme_bw() +
  facet_wrap(~f$variable) + ggtitle(i) + ggtitle(i) + theme(strip.background = element_rect(colour="black", fill="white"), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 4))
  plot(p)
}

```

Examples of hydroxylation and phosphorylation PTMmut in collagens.

```{r, eval=F}

#zoom in: collagens#
prova <- subset(fin2, fin2$Category == "Collagens")
prova <- subset(prova, prova$PTM == "phosphorylation" | prova$PTM == "hydroxylation")
prova <- as.data.frame(table(prova$gene,prova$PTM,prova$`cancer type abbreviation`))
prova <- subset(prova, prova$Freq > 0)

ndf <- list()
ndf2 <- list()
for(i in unique(prova$Var3)){
  z <- subset(prova, prova$Var3 == i)
  for(j in unique(z$Var2)){
    k <- subset(z, z$Var2 == j)
    k <- k[order(-k$Freq),]
    if(nrow(k) >= 10){k <- k[1:10,]}else{k <- k}
    ndf[[j]] <- k
  }
  ndf2[[i]] <- bind_rows(ndf)
}
ndf2 <- bind_rows(ndf2)
ndf2 <- merge(ndf2,mat,by.x="Var1",by.y="Gene Symbol")

ggplot(ndf2, aes(x=Var3, y=Var1, size=Freq, fill=Var3)) + 
geom_point(alpha=1, shape=21, color="black") + 
theme_classic() + 
scale_size_area(max_size = 8, name="freq") +
theme(axis.text.x = element_text(angle=90, size=10, vjust=0.4), axis.text.y = element_text(size=8)) + 
xlab("") + 
ylab("") +
facet_wrap(ndf2$Var2)

```

Finally, the absolute Top10 per tumor type and a look at eventual "hotspots" across and within the cohorts.

```{r, eval=F}

prova <- fin2

# to see the table, for describing it in text:
#prova <- table(prova$gene,prova$`cancer type abbreviation`)
#prova <- ifelse(prova == 0, 0 ,1)
#prova <- as.data.frame(prova)
#prova$tot <- rowSums(prova)
#prova <- prova[order(-prova$tot),]

prova <- as.data.frame(table(prova$gene,prova$`cancer type abbreviation`))
prova <- subset(prova, prova$Freq > 0)

res <- list()
for(i in unique(prova$Var2)){
  print(paste0("now analyzing: ",i))
  z <- subset(prova, prova$Var2 == i)
  z <- z[order(-z$Freq),]
  if(nrow(z) <= 10){z <- z}else{z <- z[1:10,]}
  res[[i]] <- z
}
res <- bind_rows(res)

ggplot(res, aes(x=Var2, y=Var1, size=Freq, fill=Var2)) + 
geom_point(alpha=1, shape=21, color="black") + 
theme_classic() + 
scale_size_area(max_size = 8, name="freq") +
theme(axis.text.x = element_text(angle=90, size=10, vjust=0.4), axis.text.y = element_text(size=8)) + 
xlab("") + 
ylab("") 

prova <- fin2
prova$mut <- paste(prova$gene,prova$Amino_Acid_Change,prova$PTM,sep="_")
prova <- table(prova$mut, prova$`cancer type abbreviation`)
prova <- as.data.frame.matrix(prova)
prova[prova > 0] <- 1 #this allows calculation of absolute occurrences#
prova$tot <- rowSums(prova)
prova <- prova[order(-prova$tot),]
df1 <- data.frame(mut=rownames(prova),n.of.cohorts=prova$tot)
prova <- fin2
prova$mut <- paste(prova$gene,prova$Amino_Acid_Change,prova$PTM,sep="_")
prova <- table(prova$mut, prova$`cancer type abbreviation`)
prova <- as.data.frame.matrix(prova)
prova$tot <- rowSums(prova)
prova <- prova[order(-prova$tot),]
df2 <- data.frame(mut=rownames(prova),n.total=prova$tot)
df <- merge(df1,df2,by="mut",all=T)
df$col <- ifelse(df$n.of.cohorts >= 3 & df$n.total >= 3, 1, 0)
df <- df[order(-df$col, -df$n.of.cohorts),]
df$col <- ifelse(df$col==1, "1", "0")
df$labs <- gsub("_.*","",df$mut)

ggplot(df, aes(n.total, n.of.cohorts, color=col, label=labs)) + geom_point() +
  theme_classic() + 
  scale_color_aaas() +
  geom_vline(xintercept = 3.0, linetype="dotted", color = "grey80") +
  geom_hline(yintercept = 3.0, linetype="dotted", color = "grey80")

df <- subset(df, df$col == 1)
df$col <- NULL
colnames(df) <- c("mutation","number of cohorts","number of total occurrences")
fwrite(df, "TABLE 3.txt", sep="\t", quote = F, row.names = F)

```

STEP 4: Non-conservation of hydroxylation and glycosylation PTMmut. Why is none of these PTMmut in the hotspot list? Is there a negative selection against them? We check here the distribution of PTMmut type per SIFT/polyphen category and evaluate the ratio of non-silent PTMmut to silent PTMmut (dN/dS), as reported [https://doi.org/10.1186/s13059-018-1434-0].

```{r, eval=F}

prova <- fin2
tab <- table(prova$PTM,prova$SIFT)
tab <- round((tab/rowSums(tab))*100,0)
#for SIFT, we move the low-confidence findings to the "unknown" group#
tab <- tab[,c(1,3,5)]
tab[,3] <- 100-(tab[,1]+tab[,2])
cc <- chisq.test(tab)
tab <- reshape2::melt(tab)

ggplot(tab, aes(x=Var1, y=value, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_npg() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("") + ggtitle(paste0("matrisome_SIFT","\n",cc$p.value))

allp <- subset(allmutations_PTMs, allmutations_PTMs$is.matrisome == "rest.of.genome")
tab <- table(allp$PTM,allp$SIFT)
tab <- round((tab/rowSums(tab))*100,0)
#for SIFT, we move the low-confidence findings to the "unknown" group#
tab <- tab[,c(1,3,5)]
tab[,3] <- 100-(tab[,1]+tab[,2])
cc <- chisq.test(tab)
tab <- reshape2::melt(tab)

ggplot(tab, aes(x=Var1, y=value, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_npg() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("") + ggtitle(paste0("rest of genome_SIFT","\n",cc$p.value))

tab <- table(prova$PTM,prova$PolyPhen)
tab <- round((tab/rowSums(tab))*100,0)
#for Polyphen, we merge the "damaging" groups together#
tab[,2] <- tab[,1]+tab[,2]
tab <- tab[,c(1,2,4)]
colnames(tab)[2] <- "damaging"
cc <- chisq.test(tab)
tab <- reshape2::melt(tab)
levels(tab$Var2) <- c("damaging","benign","unknown")

ggplot(tab, aes(x=Var1, y=value, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_npg() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("") + ggtitle(paste0("matrisome_PolyPhen","\n",cc$p.value))

tab <- table(allp$PTM,allp$PolyPhen)
tab <- round((tab/rowSums(tab))*100,0)
#for Polyphen, we merge the "damaging" groups together#
tab[,2] <- tab[,1]+tab[,2]
tab <- tab[,c(1,2,4)]
colnames(tab)[2] <- "damaging"
cc <- chisq.test(tab)
tab <- reshape2::melt(tab)
levels(tab$Var2) <- c("damaging","benign","unknown")

ggplot(tab, aes(x=Var1, y=value, fill=Var2)) +
  geom_bar(position="fill" , stat="identity") + 
  theme_classic() + 
  scale_fill_npg() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10)) + 
  xlab("") + ggtitle(paste0("rest of genome_PolyPhen","\n",cc$p.value))

#to calculate the dN/dS ratio, the initial allmutations file must be recreated since we had previously removed all silent mutations!#
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$effect <- gsub("3UTR.*", "3UTR", allmutations$effect) 
allmutations$SIFT <- gsub("\\(.*", "", allmutations$SIFT)
allmutations$PolyPhen <- gsub("\\(.*", "", allmutations$PolyPhen)
allmutations$Amino_Acid_Change <- gsub("[^0-9]", "", allmutations$Amino_Acid_Change) 
allmutations$Amino_Acid_Change <- as.numeric(allmutations$Amino_Acid_Change)
allmutations$effect <- gsub("Frame_Shift_Del", "Frameshift", allmutations$effect) # simple recode effect #
allmutations$effect <- gsub("Frame_Shift_Ins", "Frameshift", allmutations$effect)
allmutations$effect <- gsub("In_Frame_Del", "Deletion", allmutations$effect)
allmutations$effect <- gsub("In_Frame_Ins", "Insertion", allmutations$effect)
allmutations$effect <- gsub("Flank", "Others", allmutations$effect)
allmutations$effect <- gsub("Intron","Others", allmutations$effect)
allmutations$effect <- gsub("larged","D", allmutations$effect)
allmutations$effect <- gsub("_Mutation","", allmutations$effect)
allmutations$effect <- gsub("RNA", "Others", allmutations$effect)
allmutations$effect <- gsub("_S", " s", allmutations$effect)
allmutations$effect <- gsub("Translation start site", "Others", allmutations$effect)
allmutations$effect <- gsub("UTR", "Others", allmutations$effect)
allmutations$effect <- gsub("[^A-z]", "", allmutations$effect)
allmutations$PolyPhen <- gsub("^$","unknown",allmutations$PolyPhen) # recoding polyphen & SIFT #
allmutations$SIFT <- gsub("^$","unknown",allmutations$SIFT)
conv <- fread("uniprot_matrisome.txt", header=F, sep="\t") # load conversion from genes to protein names #
conv <- conv[,c(1:2)]
colnames(conv) <- c("gene","Uniprot_ID")
mat <- fread("matrisome_hs.txt", header = T, sep="\t") 
conv <- merge(conv, mat, by.x="gene", by.y="Gene Symbol")
conv <- as.data.frame(conv)
allmutations$is.matrisome <- ifelse(allmutations$gene %in% mat$`Gene Symbol`, "matrisome", "rest.of.genome")
cl <- fread("TCGAclindata.txt", header=T,sep="\t")
cl <- cl[,c(1,3)]
cl <- unique(merge(cl, allmutations, by="sample"))
allmutations <- cl

res <- list()
res2 <- list()
stp <- 0
pb <- txtProgressBar(min = 0, max = length(unique(allmutations_PTMs$gene)), style = 3)
for(i in unique(allmutations_PTMs$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z1 <- subset(allmutations_PTMs, allmutations_PTMs$gene == i)
  z2 <- subset(allmutations, allmutations$gene == i)
  z2 <- subset(z2, z2$effect =="Silent")
  r1 <- nrow(z1)/nrow(z2)
  z3 <- subset(allmutations, allmutations$gene == i)
  z3 <- subset(z3, z3$effect !="Silent")
  r2 <- (nrow(z3)-nrow(z1))/nrow(z2)
  
  df <- data.frame(gene=i,dN_dS_tot=r2,dN_dS_PTM=r1,PTM=z1$PTM)
  
  res[[i]] <- df
}
res <- bind_rows(res)
res$is.matrisome <- ifelse(res$gene %in% mat$`Gene Symbol`, "matrisome", "rest.of.genome")
res$r <- res$dN_dS_PTM/res$dN_dS_tot

res2 <- list()
for(i in unique(res$PTM)[1:7]){
  x <- subset(res,res$PTM == i & res$is.matrisome == "matrisome")
  y <- subset(res,res$PTM == i & res$is.matrisome != "matrisome")
  tt <- t.test(x$r,y$r)
  df <- data.frame(PTM=i,mean.of.r.matrisome=tt$estimate[1],mean.of.r.rest=tt$estimate[2],pval=tt$p.value,direction=ifelse(tt$estimate[1]>tt$estimate[2],"enriched in matrisome","depleted in matrisome"))
  res2[[i]] <- df
}
res2 <- bind_rows(res2)
m <- reshape2::melt(res2, id.var="PTM")
m <- m[1:14,]

ggplot(m,aes(PTM, as.numeric(value), colour=variable), show.legend=FALSE) +
  geom_boxplot() +
  theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

res2$diff <- res2$mean.of.r.matrisome/mean.of.r.rest
m <- reshape2::melt(res2, id.var="PTM")
m <- m[29:nrow(m),]
ggplot(m,aes(PTM,value,fill=PTM), show.legend=FALSE) +
  geom_bar(stat="identity") +
  theme_bw() + theme() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + scale_fill_d3()

fwrite(res2,"table pvals dN-dS.txt",sep="\t",row.names = F, quote = F)

```

PTMmut in the tumor matrisome at the protein domain level. Comparing PTMmut and non-PTMmut we expect to find enrichments/depletions at some domains that might explain why some frequent types of PTMs are not in the top-10 list. To this aim, we proceed as follows:
1) map PTMmut and non-PTMmut into protein domains
2) subset to PFAM domains only
3) compare the ratio of PTMmut to non-PTMmut, by domain, in matrisome and non matrisome genes
4) evaluate the 53 domains in common between matrisome and non matrisome for their ratios
5) evaluate the ratios by PTM types in these domains.
O-glycosylation and actylation are, in fact, less "tolerated"!

```{r, eval=F}

library(ensembldb)
library(EnsDb.Hsapiens.v86)
edb <- EnsDb.Hsapiens.v86
library(magrittr)
pnames <- edb %>% filter(~ symbol == unique(allmutations_PTMs$gene) & tx_biotype =="protein_coding") %>% proteins
z <- select(edb, keys = pnames$protein_id, keytype = "PROTEINID",
            columns = c("PROTEINID", "GENENAME", "PROTDOMSTART", "PROTDOMEND", "PROTEINDOMAINID", "PROTEINDOMAINSOURCE"))
pos <- allmutations_PTMs[,c(1,2,4,14,17)]
pos <- pos[pos$gene %in% z$GENENAME,]
pos <- pos[order(pos$`cancer type abbreviation`),]
library(sqldf)
library(bit64)
res <- list()
res2 <- list()
for(i in unique(pos$'cancer type abbreviation')){
  print(paste0("PTM scanning domains for: ",i))
  k <- unique(subset(pos, pos$'cancer type abbreviation' == i))
  pb <- txtProgressBar(min = 0, max = length(unique(k$gene)), style = 3)
  stp <- 0
  for(q in unique(k$gene)){
    stp <- stp+1
    setTxtProgressBar(pb, stp)
    k2 <- subset(k, k$gene == q)
    nz <- subset(z, z$GENENAME == q)
    db <- sqldf("select * from k2
                left join nz
                on k2.Amino_Acid_Change between nz.PROTDOMSTART and nz.PROTDOMEND")
    db <- na.omit(db)
    res[[q]] <- db
  } 
  res2[[i]] <- bind_rows(res)
}
res2 <- bind_rows(res2)
res2 <- distinct(res2)
dom_ptm <- res2

allmutations_PTMs$mut <- paste0(allmutations_PTMs$gene,"_",allmutations_PTMs$Amino_Acid_Change,"_",allmutations_PTMs$sample)
allmutations$mut <- paste0(allmutations$gene,"_",allmutations$Amino_Acid_Change,"_",allmutations$sample)
pos <- subset(allmutations, !(allmutations$mut %in% allmutations_PTMs$mut))
pos <- pos[,c(8,1,2,10,14)]
pos <- pos[pos$gene %in% z$GENENAME,]
pos <- na.omit(pos)
pos <- pos[order(pos$`cancer type abbreviation`),]
res <- list()
res2 <- list()
for(i in unique(pos$'cancer type abbreviation')){
  print(paste0("NON-PTM scanning domains for: ",i))
  k <- unique(subset(pos, pos$'cancer type abbreviation' == i))
  pb <- txtProgressBar(min = 0, max = length(unique(k$gene)), style = 3)
  stp <- 0
  for(q in unique(k$gene)){
    stp <- stp+1
    setTxtProgressBar(pb, stp)
    k2 <- subset(k, k$gene == q)
    nz <- subset(z, z$GENENAME == q)
    db <- sqldf("select * from k2
                left join nz
                on k2.Amino_Acid_Change between nz.PROTDOMSTART and nz.PROTDOMEND")
    db <- na.omit(db)
    res[[q]] <- db
  } 
  res2[[i]] <- bind_rows(res)
}
res2 <- bind_rows(res2)
res2 <- distinct(res2)
dom_nonptm <- res2

#for a comparison, let's focus on PFAM domains only#
d1 <- subset(dom_ptm, dom_ptm$PROTEINDOMAINSOURCE == "pfam" & dom_ptm$is.matrisome == "matrisome")
d2 <- subset(dom_ptm, dom_ptm$PROTEINDOMAINSOURCE == "pfam" & dom_ptm$is.matrisome != "matrisome")
d3 <- subset(dom_nonptm, dom_nonptm$PROTEINDOMAINSOURCE == "pfam")

res <- list()
stp <- 0
print("checking mutation frequencies in matrisome domains")
pb <- txtProgressBar(min = 0, max = length(unique(d1$PROTEINDOMAINID)), style = 3)
for(i in unique(d1$PROTEINDOMAINID)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(d3, d3$PROTEINDOMAINID == i)
  z <- as.data.table(z)[,.N, by = gene]
  z <- mean(z$N)
  z2 <- subset(d1, d1$PROTEINDOMAINID == i)
  z2 <- as.data.table(z2)[,.N, by = gene]
  z2 <- mean(z2$N)
  res[[i]] <- data.frame(domain=i, freq=(z2/z))
}
res <- bind_rows(res)

res2 <- list()
stp <- 0
print("checking mutation frequencies in non-matrisome domains")
pb <- txtProgressBar(min = 0, max = length(unique(d2$PROTEINDOMAINID)), style = 3)
for(i in unique(d2$PROTEINDOMAINID)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(d3, d3$PROTEINDOMAINID == i)
  z <- as.data.table(z)[,.N, by = gene]
  z <- mean(z$N)
  z2 <- subset(d2, d2$PROTEINDOMAINID == i)
  z2 <- as.data.table(z2)[,.N, by = gene]
  z2 <- mean(z2$N)
  res2[[i]] <- data.frame(domain=i, freq=(z2/z))
}
res2 <- bind_rows(res2)
toex <- merge(res,res2,by="domain",all=T)
toex[is.na(toex)] <- 0
colnames(toex) <- c("PFAM domain","PTMmut/allmut frequency in matrisome", "PTMmut/allmut frequency in rest of genome")
toex$enrich <- ifelse(toex$`PTMmut/allmut frequency in matrisome`> toex$`PTMmut/allmut frequency in rest of genome`,"matrisome","rest of genome")
fwrite(toex,"relative freq of mutations at the domain level.txt",sep="\t",row.names = F, quote = F)

mt <- subset(toex,toex$`PTMmut/allmut frequency in matrisome`> toex$`PTMmut/allmut frequency in rest of genome`)
length(unique(mt$`PFAM domain`)) #128 enriched domains#

m1 <- subset(d1, d1$PROTEINDOMAINID %in% mt$`PFAM domain`)
m2 <- subset(d2, d2$PROTEINDOMAINID %in% mt$`PFAM domain`)
m1 <- distinct(m1[,c("gene","PTM","PROTEINDOMAINID")])
m2 <- distinct(m2[,c("gene","PTM","PROTEINDOMAINID")])
m3 <- distinct(d3[,c("gene","PROTEINDOMAINID")])

res1 <- list()
for(i in unique(m1$PROTEINDOMAINID)){
  z <- subset(m1, m1$PROTEINDOMAINID == i)
  z <- as.data.frame(table(z$PTM))
  z2 <- subset(m3, m3$PROTEINDOMAINID == i)
  z2 <- nrow(z2)
  z$tot <- z2
  z$rel.freq <- z$Freq/z$tot
  res1[[i]] <- z
}
res1 <- bind_rows(res1)
res1 <- aggregate(res1$rel.freq,by=list(res1$Var1),mean)

res2 <- list()
for(i in unique(m2$PROTEINDOMAINID)){
  z <- subset(m2, m2$PROTEINDOMAINID == i)
  z <- as.data.frame(table(z$PTM))
  z2 <- subset(m3, m3$PROTEINDOMAINID == i)
  z2 <- nrow(z2)
  z$tot <- z2
  z$rel.freq <- z$Freq/z$tot
  res2[[i]] <- z
}
res2 <- bind_rows(res2)
res2 <- aggregate(res2$rel.freq,by=list(res2$Var1),mean)

res <- merge(res1,res2,by="Group.1")
res[5,3] <- 1 #proxy
m <- reshape2::melt(res)

ggplot(data=m, aes(x=Group.1, y=value, fill=variable)) +
  geom_bar(stat="identity", position = "dodge") +
  theme_bw() +
  scale_fill_aaas() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

```

Are there correlations between PTMmut location and gene function?

```{r, eval=F}

tab <- as.data.frame(fread("relative freq of mutations at the domain level.txt",sep="\t",header = T, dec=","))
library(clusterProfiler)
library(org.Hs.eg.db)
library(cowplot)

tab.mat <- subset(tab, tab[,2]>=2*tab[,3])
sel <- subset(dom_ptm, dom_ptm$PROTEINDOMAINID %in% tab.mat$`PFAM domain`)
gene <- table(sel$gene)
gene <- gene[order(-gene)]

ego <- enrichKEGG(gene)
p1 <- dotplot(ego, showCategory=10, title="matrisome enriched")

tab.mat <- subset(tab, tab[,3]>=2*tab[,2])
sel <- subset(dom_ptm, dom_ptm$PROTEINDOMAINID %in% tab.mat$`PFAM domain`)
gene2 <- table(sel$gene)
gene2 <- gene2[order(-gene2)]

ego <- enrichKEGG(gene2)
p2 <- dotplot(ego, showCategory=10, title="rest of genome enriched")

plot_grid(p1, p2, ncol=2)

ego <- enrichGO(gene          = gene,
                OrgDb         = org.Hs.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05,
        readable      = TRUE)
p1 <- dotplot(ego)

ego <- enrichGO(gene          = gene2,
                OrgDb         = org.Hs.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05,
        readable      = TRUE)
p2 <- dotplot(ego)
plot_grid(p1, p2, ncol=2)

l <- list(mat=gene,rest=gene2)
ck <- compareCluster(geneCluster = l, 
                fun = "enrichGO",
                OrgDb         = org.Hs.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)
dotplot(ck)


```

And are there matrisome PTMmut associated with protein functions?

```{r, eval=F}

reg <- as.data.frame(fread("regions.txt",header=T,sep="\t"))
reg <- reg[,c(1:16)]
reg <- subset(reg, reg$is.matrisome=="matrisome")
reg$`region I` <- ifelse(reg$`region I` == "iside","inside",reg$`region I`)

#of 921 mutations annotated with region information, 286 (286/921, 31%) fall in a functional region#

fmut <- subset(reg, reg$`region I` == "inside")
tfmut <- table(fmut$gene)
tfmut <- tfmut[order(-tfmut)]

```

Finally, do the proteins targeted by PTMmut interact? To answer this question, we will use PPI data from BioGrid v. 4.2.192.

```{r, eval=F}

ppi <- as.data.frame(fread("BIOGRID-ORGANISM-Homo_sapiens-4.2.192.tab3.txt",header=T,sep="\t"))
ppi <- ppi[,c(8,9)]

tab.mat <- subset(tab, tab[,2]>=tab[,3])
tab.mat <- subset(tab, tab$enrich == "matrisome")
sel <- subset(dom_ptm, dom_ptm$PROTEINDOMAINID %in% tab.mat$`PFAM domain`)
sel <- unique(sel$gene)
exp <- data.frame(gene=sel)
fwrite(exp,"gene with PTMmut in enriched domains.txt",sep="\t",row.names = F, quote = F)
ppi.mat <- subset(ppi, ppi$`Official Symbol Interactor A` %in% sel & ppi$`Official Symbol Interactor B` %in% sel)
ppi.mat$type <- ifelse(ppi.mat[,1]==ppi.mat[,2],"homo","hetero")
fwrite(ppi.mat,"PPI interactions in enriched domains.txt",sep="\t",row.names = F, quote = F)

#total would be:
#ppi.mat <- subset(ppi, ppi$`Official Symbol Interactor A` %in% sel | ppi$`Official Symbol Interactor B` %in% sel)

library(igraph)
net <- simplify(graph_from_data_frame(ppi.mat), remove.multiple = F, remove.loops = T)

V(net)$size <- 8
V(net)$frame.color <- "white"
#V(net)$color <- "orange"
E(net)$arrow.mode <- 0
deg <- degree(net, mode = "all")

g <- degree(net)
g <- g[order(-g)]
g <- names(g[1:10])

V(net)$label <- ifelse(V(net)$name %in% g, V(net)$name,"") 
V(net)$label.color <- "black"
V(net)$color <- ifelse(V(net)$label %in% names(tfmut), "red","orange")
l <- layout_with_fr(net)
plot(net, layout=l, vertex.size=deg*0.5) #this is for reference, labels will be added in post-production#
sel <- V(net)$label[V(net)$label %in% names(tfmut)]

V(net)$label <- ""
l <- layout_with_fr(net)
plot(net, layout=l, vertex.size=deg*0.5) 

```

Backtracking the TPMmut into their regions and domains

```{r, eval=F}

z <- fread("regions.txt",sep="\t",header=T)
z <- subset(z, z$is.matrisome != "rest.of.genome")
z <- subset(z, z$`region I` %in% c("inside","iside"))
z <- z[,c(1:18)]
sel <- unique(c(ppi.mat$`Official Symbol Interactor A`,ppi.mat$`Official Symbol Interactor B`))
z2 <- subset(z, z$gene %in% sel)
z2 <- as.data.frame(z2)

res <- list()
for(i in unique(z2$gene)){
  k <- subset(ppi.mat, ppi.mat$`Official Symbol Interactor A` == i | ppi.mat$`Official Symbol Interactor B` == i)
  k <- unique(c(k[,1],k[,2]))
  k <- paste(k, collapse = ",")
  df <- data.frame(gene=i, all.interacting.genes=k)
  res[[i]] <- df
}
res <- bind_rows(res)

m <- merge(z2,res,by="gene")
m$V11 <- NULL
m$V12 <- NULL
m$V17 <- NULL
m$V18 <- NULL

length(unique(m$gene))
fwrite(m, "FINAL MAPPING TAB.txt", sep="\t", row.names = F, quote = F)

net <- simplify(graph_from_data_frame(ppi.mat), remove.multiple = F, remove.loops = T)

V(net)$size <- 8
V(net)$frame.color <- "white"
E(net)$arrow.mode <- 0
V(net)$color <- "orange"
V(net)$color <- ifelse(names(V(net)) %in% m$gene, "red","orange")
V(net)$label <- ""
#n <- c("FN1","COL1A1","MMP2","ADAM10")
#V(net)$label <- ifelse(names(V(net)) %in% n,n,"") #to see labels of most-interesting hubs

deg <- degree(net, mode = "all")

l <- layout_with_fr(net)
plot(net, layout=l, vertex.size=deg*0.5) 

```

EXTRA IDEAS (from further thinking of, and discussing, the findings): 

What is the % of different amino acids impacted by PTMmut?

```{r, eval=F}

tab <- fread("TABLE 1.txt",header=T,sep="\t")
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$Amino_Acid_Change <- gsub("p.","",allmutations$Amino_Acid_Change)
allmutations$Amino_Acid_Change <- substring(allmutations$Amino_Acid_Change, 1, 1)
m <- merge(tab, allmutations, by.x=c("sample","gene","chr","start","end"), by.y=c("sample","gene","chr","start","end"))
tab <- table(m$PTM, m$Amino_Acid_Change.y)
tab <- tab[,-1]
tab <- round((tab/rowSums(tab))*100,0)
tab <- as.data.frame.matrix(tab)
tab$PTM <- rownames(tab)
tab <- tab[,c(22,1:21)]
fwrite(tab,"percentage of mutated aminoacids_PTM.txt", sep="\t",row.names = F, quote = F)

```

Are there patients with concurring/coexisting mutations in potentially interacting proteins?

```{r, eval=F}

ppi <- as.data.frame(fread("BIOGRID-ORGANISM-Homo_sapiens-4.2.192.tab3.txt",header=T,sep="\t"))
ppi <- ppi[,c(8,9)]
tab <- as.data.frame(fread("TABLE 1.txt",header=T,sep="\t"))
tab <- subset(tab, tab$is.matrisome == "matrisome")

res <- list()
pb <- txtProgressBar(min = 0, max = length(unique(tab$sample)), style = 3)
stp <- 0
for(i in unique(tab$sample)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(tab, tab$sample %in% i)
  g <- unique(z$gene)
  s1 <- subset(ppi, ppi$`Official Symbol Interactor A` %in% g & ppi$`Official Symbol Interactor B` %in%  g)
  if(nrow(s1)==0){next}else{
    s1 <- subset(s1, s1$`Official Symbol Interactor A` != s1$`Official Symbol Interactor B`)
    if(nrow(s1)==0){next}else{
      s1$patient <- i
      res[[i]] <- s1
    }
  }
}
res <- bind_rows(res)
res <- unique(res)
length(unique(res$patient))
fwrite(res, "TABLE 2A.txt", sep="\t", row.names = F, quote = F)
res1 <- res

tab <- as.data.frame(fread("TABLE 1.txt",header=T,sep="\t"))
tab <- subset(tab, tab$is.matrisome != "matrisome")
res <- list()
pb <- txtProgressBar(min = 0, max = length(unique(tab$sample)), style = 3)
stp <- 0
for(i in unique(tab$sample)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(tab, tab$sample %in% i)
  g <- unique(z$gene)
  s1 <- subset(ppi, ppi$`Official Symbol Interactor A` %in% g & ppi$`Official Symbol Interactor B` %in%  g)
  if(nrow(s1)==0){next}else{
    s1 <- subset(s1, s1$`Official Symbol Interactor A` != s1$`Official Symbol Interactor B`)
    if(nrow(s1)==0){next}else{
      s1$patient <- i
      res[[i]] <- s1
    }
  }
}
res <- bind_rows(res)
res <- unique(res)
length(unique(res$patient))

tab <- as.data.frame(fread("TABLE 1.txt",header=T,sep="\t"))
tab <- subset(tab, tab$is.matrisome == "matrisome")
nrow(tab)
tab <- as.data.frame(fread("TABLE 1.txt",header=T,sep="\t"))
tab <- subset(tab, tab$is.matrisome != "matrisome")
nrow(tab)

m <- matrix(c(nrow(res1),nrow(res),1907,43962),ncol=2,nrow=2) 
chisq.test(m) #p-value < 2.2e-16

```

Are there mutations in motifs mediating cell/ECM adhesions (GPP, RGD, LDV, collagen motifs - GFPGER, GLPGER, the latter two taken from doi: 10.1074/jbc.M806865200)? First, let's check PTMmut. Results show that this kind of event is extremely infrequent, so much so as to ask whether other kind of mutations are similarly excluded.

```{r, eval=F}

tab <- fread("TABLE 1.txt",header=T,sep="\t")
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$Amino_Acid_Change <- gsub("p.","",allmutations$Amino_Acid_Change)
allmutations$Amino_Acid_Change <- substring(allmutations$Amino_Acid_Change, 1, 1)
m <- merge(tab, allmutations, by.x=c("sample","gene","chr","start","end"), by.y=c("sample","gene","chr","start","end"))
m <- subset(m, m$is.matrisome == "matrisome")
sm <- m

library(EnsDb.Hsapiens.v86)  
library(ensembldb)
library(Biostrings)
library(bit64)

### GPP filter ###

#first, filter all proteins that have no GPP at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GPP",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GPP",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gpp <- res
gpp$motif <- "GPP"

### GVD filter ###

#first, filter all proteins that have no GVD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GVD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GVD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gvd <- res
gvd$motif <- "GVD"
     
### RGD filter ###

#first, filter all proteins that have no RGD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("RGD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("RGD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
rgd <- res
#rgd$motif <- "RGD" #no RGD!

### LDV filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("LDV",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("LDV",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
ldv <- res
#ldv$motif <- "LDV" #no LDV

### GFPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GFPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GFPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gfpger <- res
#gfpger$motif <- "GFPGER" #no GFPGER

### GLPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GLPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GLPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
glpger <- res
#glpger$motif <- "GFPGER" #no GLPGER

### Bind results together ###

res <- bind_rows(gpp, gvd)
res$id <- paste0(res$gene,"_",res$position)
sm$id <- paste0(sm$gene,"_",sm$Amino_Acid_Change.x)
sm <- subset(sm, select = c("gene","PTM","id"))
res <- merge(sm,res,by="id")
res$id <- NULL
res$gene.y <- NULL
names(res)[1] <- "gene"
names(res)[6] <- "is.GVD.plus.minus.3"
fwrite(res, "MOTIF MAPPING OF PTMmut.txt",sep="\t",row.names = F, quote = F)

```

Check for non-PTMmut.

```{r, eval=F}
tab <- as.data.frame(fread("all mutations.txt",header=T,sep="\t"))
tab2 <- as.data.frame(fread("all PTM mutations.txt",header=T,sep="\t"))
tab$id <- paste0(tab$sample,"_",tab$gene,"_",tab$Amino_Acid_Change,"_",tab$`cancer type abbreviation`)
tab2$id <- paste0(tab2$sample,"_",tab2$gene,"_",tab2$Amino_Acid_Change,"_",tab2$`cancer type abbreviation`)
tab <- subset(tab, !(tab$id %in% tab2$id))
tab <- subset(tab, tab$is.matrisome == "matrisome")
tab <- na.omit(tab)
ntab <- fread("TABLE 1.txt",header=T,sep="\t")
ntab <- subset(ntab, ntab$is.matrisome == "matrisome")
ntab <- subset(ntab, select=c("gene","Uniprot_ID"))
m <- distinct(merge(tab,ntab,by="gene"))
m <- m %>% mutate_if(bit64::is.integer64, as.integer)

sm <- m

### GPP filter ###

#first, filter all proteins that have no GPP at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GPP",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GPP",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gpp <- res
gpp$motif <- "GPP"

### GVD filter ###

#first, filter all proteins that have no GVD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GVD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GVD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gvd <- res
#gvd$motif <- "GVD" #no GVD!

### RGD filter ###

#first, filter all proteins that have no RGD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("RGD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("RGD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
rgd <- res
#rgd$motif <- "RGD" #no RGD

### LDV filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("LDV",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("LDV",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
ldv <- res
ldv$motif <- "LDV"

### GFPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GFPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GFPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gfpger <- res
#gfpger$motif <- "GFPGER" #no GFPGER!

### GLPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GLPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa > 4]

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GLPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
glpger <- res
#glpger$motif <- "GLPGER" #no GLPGER!



tab <- fread("TABLE 1.txt",header=T,sep="\t")
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$Amino_Acid_Change <- gsub("p.","",allmutations$Amino_Acid_Change)
allmutations$Amino_Acid_Change <- substring(allmutations$Amino_Acid_Change, 1, 1)
m <- merge(tab, allmutations, by.x=c("sample","gene","chr","start","end"), by.y=c("sample","gene","chr","start","end"))
m <- subset(m, m$is.matrisome == "matrisome")
sm <- m

library(EnsDb.Hsapiens.v86)  
library(ensembldb)
library(Biostrings)
library(bit64)

### GPP filter ###

#first, filter all proteins that have no GPP at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GPP",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GPP",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gpp <- res
gpp$motif <- "GPP"

### GVD filter ###

#first, filter all proteins that have no GVD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GVD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GVD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gvd <- res
gvd$motif <- "GVD"
     
### RGD filter ###

#first, filter all proteins that have no RGD at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("RGD",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("RGD",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
rgd <- res
#rgd$motif <- "RGD" #no RGD!

### LDV filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("LDV",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("LDV",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
ldv <- res
#ldv$motif <- "LDV" #no LDV

### GFPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GFPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GFPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
gfpger <- res
#gfpger$motif <- "GFPGER" #no LDV

### GLPGER filter ###

#first, filter all proteins that have no LDV at all#
m <- sm
l <- list()
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  g$r <- grepl("GLPGER",g$protein_sequence, fixed = TRUE)
  g <- as.data.frame(g)
  g$protein_sequence <- NULL
  l[[i]] <- g
}
l <- bind_rows(l)
l <- subset(l, l$r != "FALSE")

#next, backfilter m to include only the genes with a valid entry in l#
m <- subset(m, m$gene %in% l$gene_name & m$Uniprot_ID %in% l$uniprot_id)

#and then, the actual filter#
pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)

    for(k in aa){
     min.aa <- k-3
     max.aa <- k+3
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        r <- grepl("GLPGER",v, fixed = TRUE)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(r))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res <- subset(res, res$is.GVD == "TRUE")
glpger <- res
#glpger$motif <- "GFPGER" #no LDV

### Bind results together ###

res <- bind_rows(gpp, ldv)
res$id <- paste0(res$gene,"_",res$position)
sm$id <- paste0(sm$gene,"_",sm$Amino_Acid_Change)
sm <- subset(sm, select = c("gene","id"))
res <- merge(sm,res,by="id")
res$id <- NULL
res$gene.y <- NULL
names(res)[1] <- "gene"
names(res)[5] <- "is.GVD.plus.minus.3"
fwrite(res, "MOTIF MAPPING OF non-PTMmut.txt",sep="\t",row.names = F, quote = F)
```

Actually, they are both unlikely but PTMmut seem better tolerated in toto! Then, are there sequence features that can tell us something about the mapping mutations?

```{r, eval=F}

tab <- fread("TABLE 1.txt",header=T,sep="\t")
allmutations <- fread("allmutations.txt", header=T, sep="\t") # import mutation data #
allmutations <- as.data.frame(allmutations)
allmutations$Amino_Acid_Change <- gsub("p.","",allmutations$Amino_Acid_Change)
allmutations$Amino_Acid_Change <- substring(allmutations$Amino_Acid_Change, 1, 1)
m <- merge(tab, allmutations, by.x=c("sample","gene","chr","start","end"), by.y=c("sample","gene","chr","start","end"))
m <- subset(m, m$is.matrisome == "matrisome")

pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change.x)
    aa <- aa[aa>11]

    for(k in aa){
     min.aa <- k-10
     max.aa <- k+10
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(v))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res$id <- paste0(res$gene,"_",res$position)
m$id <- paste0(sm$gene,"_",sm$Amino_Acid_Change.x)
m <- subset(m, select = c("gene","PTM","id"))
res <- merge(m,res,by="id")
res$id <- NULL
res$gene.y <- NULL
names(res)[1] <- "gene"
names(res)[6] <- "sequence.context.10aa"
ptm <- res

tab <- as.data.frame(fread("all mutations.txt",header=T,sep="\t"))
tab2 <- as.data.frame(fread("all PTM mutations.txt",header=T,sep="\t"))
tab$id <- paste0(tab$sample,"_",tab$gene,"_",tab$Amino_Acid_Change,"_",tab$`cancer type abbreviation`)
tab2$id <- paste0(tab2$sample,"_",tab2$gene,"_",tab2$Amino_Acid_Change,"_",tab2$`cancer type abbreviation`)
tab <- subset(tab, !(tab$id %in% tab2$id))
tab <- subset(tab, tab$is.matrisome == "matrisome")
tab <- na.omit(tab)
ntab <- fread("TABLE 1.txt",header=T,sep="\t")
ntab <- subset(ntab, ntab$is.matrisome == "matrisome")
ntab <- subset(ntab, select=c("gene","Uniprot_ID"))
m <- distinct(merge(tab,ntab,by="gene"))
m <- m %>% mutate_if(bit64::is.integer64, as.integer)
m <- subset(m, m$is.matrisome == "matrisome")

pb <- txtProgressBar(min = 0, max = length(unique(m$gene)), style = 3)
stp <- 0
res <- list()
res2 <- list()
for(i in unique(m$gene)){
  stp <- stp+1
  setTxtProgressBar(pb, stp)
  z <- subset(m, m$gene == i)
  g <- proteins(EnsDb.Hsapiens.v86, filter= GeneNameFilter(i), columns=c("uniprot_id","protein_sequence"),return.type = "data.frame")
  g <- subset(g, g$uniprot_id %in% unique(z$Uniprot_ID))
  if(nrow(g)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
    g$n.aa <- nchar(g$protein_sequence)
    aa <- unique(z$Amino_Acid_Change)
    aa <- aa[aa>11]

    for(k in aa){
     min.aa <- k-10
     max.aa <- k+10
     s <- subset(g, g$n.aa >= min.aa & g$n.aa >= max.aa)
     if(nrow(s)==0){df <- data.frame(gene=i,position=0,available.prot.seq=0,of.which.seq.found=0,is.GVD="FALSE")
  res[[i]] <- df
  df <- NULL}else{
      
      for(w in 1:nrow(s)){
        v <- s[w,2]
        v <- subseq(v, start=min.aa, end=max.aa)
        res2[[stp]] <- data.frame(gene=i,position=k,available.prot.seq=nrow(g),of.which.seq.found=w,is.GVD=as.character(v))
      }}
    }
  }
}
res <- unique(bind_rows(res2))
res$id <- paste0(res$gene,"_",res$position)
m$id <- paste0(m$gene,"_",m$Amino_Acid_Change)
m <- subset(m, select = c("gene","id"))
res <- merge(m,res,by="id")
res$id <- NULL
res$gene.y <- NULL
names(res)[1] <- "gene"
names(res)[5] <- "sequence.context.10aa"
noptm <- res

l <- list()
l2 <- list()
for(i in unique(ptm$PTM)){
  z <- subset(ptm, ptm$PTM == i)
  g <- z$gene
  l[[i]] <- z$sequence.context.10aa
  z <- subset(noptm, noptm$gene %in% g)
  l2[[i]] <- z$sequence.context.10aa
}
names(l) <- unique(ptm$PTM)
names(l2) <- unique(ptm$PTM) #these are, of course, fictional!

ggseqlogo(l, ncol = 4)
ggseqlogo(l2, ncol = 4)

```
