SWATH-MS TurboID Data Analysis

This is the data analysis for the TurboID experiment I carried out, using SWATH-MS to quantify proteins interacting with the ASXL1 PHD domain. We must first load the packages required for the analysis.

library(gridExtra)
library(readxl)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
library(ggplot2)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(ggrepel)
library(gghighlight)
options(ggrepel.max.overlaps = Inf)

Set the working directory, then load and prepare the dataset.

# Relative workbook names in this analysis resolve against this folder.
setwd("~/Desktop/SWATH_R_Analysis")
# Raw_SWATHMSData holds the workbook as imported; SWATHMSData is the working
# copy that the later steps modify.
Raw_SWATHMSData <- read_excel(path = "A1P13 vs TID all t-test.xlsx")
SWATHMSData <- Raw_SWATHMSData
# Relabel the 18th column as "logFC".
names(SWATHMSData)[18] <- "logFC"

## Volcano Plot of SWATH-MS Data

A volcano plot highlighting the significant proteins (Log2FC greater than 0.5 and adjusted p-value of 0.05 or less) was generated as follows:

# FDR-adjust the raw p-values and derive the log2 fold change.
SWATHMSData$Adj_Pval <- p.adjust(SWATHMSData$`p-value`, method = "fdr")
SWATHMSData$Log2FC <- log2(SWATHMSData$`Fold Change`)

# Attach the UniProt annotation; rows are paired on the "Peak Name" column.
UniProt <- read_excel("UniProtIDCompiled.xlsx")
SWATHMerge <- merge(SWATHMSData, UniProt, by = "Peak Name")
# NOTE(review): positional rename - assumes the gene-name column is the 23rd
# column of the merged table; confirm if either workbook changes layout.
colnames(SWATHMerge)[23] <- "GeneName"

# Significant proteins: Log2FC > 0.5 and adjusted p-value <= 0.05.
# which() keeps only the rows where the test is TRUE; indexing with the bare
# logical vector would turn every NA comparison into an all-NA row and put an
# NA entry into Sig_Names.
Prot_Sig <- SWATHMerge[
  which(SWATHMerge$Log2FC > 0.5 & SWATHMerge$Adj_Pval <= 0.05),
]
Sig_Names <- Prot_Sig$GeneName

# Volcano plot: points are drawn in red, proteins outside Sig_Names are
# passed to gghighlight with colour "lightblue", and the label text is the
# gene name for proteins in Sig_Names and empty otherwise.
ggplot() +
  geom_point(
    data = SWATHMerge,
    aes(x = Log2FC, y = -log10(Adj_Pval)),
    size = 1.5, colour = "red"
  ) +
  gghighlight(
    GeneName %in% Sig_Names,
    unhighlighted_params = aes(colour = "lightblue"),
    use_group_by = FALSE
  ) +
  geom_text_repel(
    aes(
      x = Log2FC, y = -log10(Adj_Pval),
      label = ifelse(GeneName %in% Sig_Names, GeneName, "")
    )
  )

## Crapome comparison to Significant Protein List

I want to compare the proteins on my ‘hit list’ to their respective Crapome scores. The Crapome is a tool used to see whether proteins typically turn up in unrelated MS experiments - if the Crapome scores and my enrichments align well, the identified proteins may simply be random background.

# Attach the GO summary to the significant proteins (paired on "Peak Name").
GOSummary <- read_excel("GO_Summary.xlsx")
Prot_SigMerge <- merge(Prot_Sig, GOSummary, by = "Peak Name")

# Crapome scores; the first column is renamed so it can act as the merge key.
# (No View() call here: it opens an interactive viewer, which does not belong
# in a report that is rendered non-interactively.)
Crapome <- read_excel("Crapome score.xlsx")
colnames(Crapome)[1] <- "GeneName"

# Pair each significant protein's Log2FC with its Crapome entry.
CrapomeFC <- Prot_SigMerge %>% select("GeneName", "Log2FC")
Prot_SigCrap <- merge(CrapomeFC, Crapome, by = "GeneName")
colnames(Prot_SigCrap)[3] <- "CrapomeScore"

# Store the coerced values back in the column (a bare as.numeric() call only
# prints them and leaves the column unchanged). The outer parentheses keep
# the values echoed in the report.
(Prot_SigCrap$Log2FC <- as.numeric(as.character(Prot_SigCrap$Log2FC)))
##  [1] 0.8030306 1.7020889 2.1946314 0.5456976 0.8139482 1.3045248 4.1406811
##  [8] 0.5793607 1.2950295 0.6797097 0.5851823 1.6184848 1.7640971 1.3185536
## [15] 1.0171161 0.6043471 0.5321922 0.5577577 0.7351181 0.9609650 0.5403184
## [22] 0.5465662 0.6188866 1.0422610 0.6142189 0.6491248 1.3067552 0.8875426
## [29] 0.7087546 0.5136300 0.6088305 1.3832653 1.3878896 1.3940634 0.8760706
## [36] 0.5014575 0.7272265 0.6379759 0.7188872 2.2748037 0.6301164 0.5238407
## [43] 0.7028585 1.1495263 0.5411471 0.6627587 1.4332280 0.6507403 0.5302902
## [50] 0.5434205 1.6139713 2.3427699 0.5545139 0.9504601 0.5047540 0.6357323
## [57] 1.5448411 1.0919703
# Store the coerced scores back in the column (a bare as.numeric() call only
# prints them); the outer parentheses keep the values echoed in the report.
(Prot_SigCrap$CrapomeScore <-
  as.numeric(as.character(Prot_SigCrap$CrapomeScore)))
##  [1] 0.206703911 0.227653631 0.255586592 0.276536313 0.025139665 0.002793296
##  [7] 0.001396648 0.009776536 0.009776536 0.444134078 0.324022346 0.006983240
## [13] 0.048882682 0.078212291 0.284916201 0.113128492 0.709497207 0.301675978
## [19] 0.125698324 0.110335196 0.032122905 0.241620112 0.085195531 0.086592179
## [25] 0.013966480 0.529329609 0.044692737 0.002793296 0.206703911 0.020949721
## [31] 0.120111732 0.008379888 0.766759777 0.378491620 0.163407821 0.624301676
## [37] 0.600558659 0.544692737 0.493016760 0.667597765 0.205307263 0.353351955
## [43] 0.234636872 0.061452514 0.335195531 0.061452514 0.354748603 0.224860335
## [49] 0.650837989 0.303072626 0.001396648 0.638268156 0.094972067 0.159217877
## [55] 0.118715084 0.353351955 0.030726257 0.030726257
# Scatter of TurboID Log2FC against Crapome score, with gene-name labels.
Prot_SigCrapNames <- Prot_SigCrap[["GeneName"]]
ggplot(Prot_SigCrap, aes(x = Log2FC, y = CrapomeScore)) +
  geom_point(colour = "red", size = 1.5) +
  # x and y are inherited from the plot-level mapping.
  geom_text_repel(
    aes(label = ifelse(GeneName %in% Prot_SigCrapNames, GeneName, ""))
  )

There appears to be no correlation between the Crapome score and the protein log2 fold change, which is promising.

## Comparison to MCF7 and HCC1419 TRIB1 RIME analysis

Peter suggested that I compare my protein hits to Hamish’s for his different cell lines. Hamish sent me a list of significantly enriched proteins for each of his cell lines, filtered for nuclear proteins and for proteins with a log2FC greater than 2 and a p-value of less than 0.05. I generated a graph of the MCF7 hits compared to all of my proteins, as follows:

# MCF7 hit list; its first column is renamed so it can act as the merge key.
MCF7Overlap <- read_excel("MCF7_Nuclear_For_Cam.xlsx")
colnames(MCF7Overlap)[1] <- "UniProtID"

# Overlap with every quantified protein, and with the significant subset.
MCFOverlapAll <- merge(SWATHMerge, MCF7Overlap, by = "UniProtID")
MCFOverlapSig <- merge(Prot_SigMerge, MCF7Overlap, by = "UniProtID")
MCFOverlapAllName <- MCFOverlapAll$GeneName
# NOTE(review): positional rename - assumes column 24 of the merged table is
# the gene-name column. The parallel HCC1419 step names column 24 "Role"
# instead; confirm which is intended.
colnames(MCFOverlapSig)[24] <- "GeneName"
MCFOverlapSigName <- MCFOverlapSig$GeneName

# Store the coerced values back in the column (a bare as.numeric() call only
# prints them); the outer parentheses keep the values echoed in the report.
(MCFOverlapAll$Log2FC <- as.numeric(as.character(MCFOverlapAll$Log2FC)))
##   [1]  0.174637292 -0.474591568  0.404725166  0.005331378  0.113036650
##   [6] -0.714734510 -0.004393944 -0.305974715 -0.526246431  0.324694237
##  [11] -0.051744511  0.412514530 -0.183238657  0.045945450  1.333517521
##  [16]  0.008818077  0.763128402  0.315871761 -0.714937129  0.180183371
##  [21]  0.008407275 -0.090797058  0.837824029 -0.149975848 -0.631515210
##  [26] -0.023647229  0.141837272 -0.526764028 -0.249495387  0.053014355
##  [31]  0.061617563 -0.114590944  0.005436742  0.179659431  0.716025597
##  [36] -0.576742209  0.262886666  0.193971728  0.154348648  0.020764269
##  [41] -0.675956248 -0.035820458 -0.034596818 -0.628057361  0.076221144
##  [46]  0.062617946  0.071212987  0.071212987 -0.351780382 -0.112473825
##  [51] -0.269728741 -0.184449332  0.252840582 -0.307386663  0.068881592
##  [56] -0.191967064 -0.210087566  0.134904630 -0.754045782  1.702088880
##  [61]  0.114673496  0.609591141 -0.145206256 -0.093022408  0.176833421
##  [66]  0.039888177  0.608830480  0.155583011  0.050389101  0.097320517
##  [71]  0.198729065 -0.368435878 -0.275789602 -0.073293937  0.047982755
##  [76]  0.046170824  0.067275997  0.242327829 -0.295653884 -0.212499775
##  [81]  0.876644370  1.386261102 -0.064420278 -0.415315871  0.577163190
##  [86] -0.057280885  0.360667462 -0.388721365 -0.716034979  0.408204429
##  [91]  0.014547636  0.081804535  1.258748571  0.330732861 -0.339031322
##  [96]  0.025965874 -0.007083889  0.203896263  0.366953605 -0.040613614
## [101]  0.016196779  0.178695796 -0.620552085  0.073974070  0.128041536
## [106]  0.080293964 -0.930433230  0.876070645  0.041695526  0.410765735
## [111] -0.131237133  0.194577167  0.372209042  0.067706838  0.137196596
## [116]  0.371926687 -0.255943161  1.613971341 -0.106145696  0.044747812
## [121] -0.218675723  0.120742375  0.038518927  0.098741378 -1.333580813
## [126] -0.093362391  0.056903494 -0.132398546  0.110281260  0.037493101
## [131] -0.370541358  0.154377684 -0.031262911  0.317532794 -0.355095885
## [136]  0.486700503 -0.283656792  0.079333704  0.125773536  0.045560882
## [141]  0.318378747 -0.534371107 -1.656492212 -0.084862502 -0.038304706
## [146]  0.021333530 -0.057388864  0.384627315  0.389036548  0.121958835
## [151] -0.238254664 -0.097892927 -0.501222502 -0.973900740 -0.483595611
## [156] -0.306472170 -0.237507914  0.613201025  0.147481684
# Store the coerced MCF7 fold changes back in the column (a bare as.numeric()
# call only prints them); the outer parentheses keep the values echoed.
(MCFOverlapAll$log2FCMCF7 <-
  as.numeric(as.character(MCFOverlapAll$log2FCMCF7)))
##   [1] 2.23 2.01 2.46 2.04 2.06 2.33 2.03 2.05 2.58 2.43 2.23 2.45 2.45 2.18 2.70
##  [16] 2.01 3.54 2.95 2.55 2.20 2.29 2.89 2.01 2.17 2.07 2.75 2.05 2.04 2.39 2.73
##  [31] 2.73 3.15 2.44 2.87 2.03 2.73 2.17 2.10 2.13 2.12 2.50 2.24 2.51 2.43 2.24
##  [46] 2.30 2.54 2.54 2.23 4.85 2.30 2.89 2.12 2.27 2.08 2.04 2.48 3.13 2.16 2.67
##  [61] 2.09 2.36 2.91 2.63 2.99 3.01 2.77 2.55 2.66 2.80 2.12 2.01 2.10 2.10 2.07
##  [76] 2.07 2.11 2.94 2.07 2.32 2.16 2.06 2.77 2.02 2.42 2.04 2.02 2.01 2.28 2.01
##  [91] 2.49 2.07 2.37 2.51 3.09 2.19 2.19 2.30 2.10 2.77 2.73 2.90 2.88 2.02 2.34
## [106] 2.32 2.27 2.46 2.07 2.15 3.08 3.08 2.42 2.06 2.51 2.64 2.15 2.46 2.18 2.30
## [121] 2.85 2.26 2.87 2.13 2.67 2.58 2.53 2.22 2.65 2.22 4.79 2.33 2.71 2.10 2.09
## [136] 2.05 2.01 2.09 2.31 4.81 2.04 2.04 2.30 2.17 2.01 2.82 2.56 2.53 2.53 2.11
## [151] 2.07 4.43 2.82 2.01 2.87 3.17 2.07 2.03 2.82
# TurboID Log2FC against MCF7 log2FC for every overlapping protein.
MCFAllPlot <- ggplot(MCFOverlapAll, aes(x = Log2FC, y = log2FCMCF7))
MCFAllPlot +
  geom_point(colour = "red", size = 1.5) +
  # x and y are inherited from the plot-level mapping.
  geom_text_repel(
    aes(label = ifelse(GeneName %in% MCFOverlapAllName, GeneName, ""))
  )

I also made a separate graph comparing my significantly enriched proteins to Hamish’s list.

# Same scatter, highlighting only the proteins named in MCFOverlapSigName;
# the remaining points are passed to gghighlight with colour "lightblue".
ggplot() +
  geom_point(
    data = MCFOverlapAll,
    mapping = aes(x = Log2FC, y = log2FCMCF7),
    colour = "red",
    size = 1.5
  ) +
  gghighlight(
    GeneName %in% MCFOverlapSigName,
    unhighlighted_params = aes(colour = "lightblue"),
    use_group_by = FALSE
  ) +
  geom_text_repel(
    mapping = aes(
      x = Log2FC,
      y = log2FCMCF7,
      label = ifelse(GeneName %in% MCFOverlapSigName, GeneName, "")
    )
  )

Finally, this figure was adapted to show only the proteins with a Log2FC of more than 0 in the ASXL1 TurboID analysis, i.e. enriched proteins.

# Keep only proteins with a positive TurboID Log2FC (rows with NA are dropped).
MCF7Sig <- MCFOverlapAll[which(MCFOverlapAll$Log2FC > 0), , drop = FALSE]
ggplot() +
  geom_point(
    data = MCF7Sig,
    mapping = aes(x = Log2FC, y = log2FCMCF7),
    colour = "red",
    size = 1.5
  ) +
  gghighlight(
    GeneName %in% MCFOverlapSigName,
    unhighlighted_params = aes(colour = "lightblue"),
    use_group_by = FALSE
  ) +
  geom_text_repel(
    mapping = aes(
      x = Log2FC,
      y = log2FCMCF7,
      label = ifelse(GeneName %in% MCFOverlapSigName, GeneName, "")
    )
  )

# HCC1419 hit list; its first column is renamed so it can act as the merge key.
HCC1419Overlap <- read_excel("HCC1419.xlsx")
colnames(HCC1419Overlap)[1] <- "UniProtID"

# Overlap with every quantified protein, and with the significant subset.
HCCOverlapAll <- merge(SWATHMerge, HCC1419Overlap, by = "UniProtID")
HCCOverlapSig <- merge(Prot_SigMerge, HCC1419Overlap, by = "UniProtID")
HCCOverlapAllName <- HCCOverlapAll$GeneName
# NOTE(review): positional rename - column 24 is labelled "Role" here, while
# the parallel MCF7 step labels column 24 "GeneName"; confirm which is
# intended.
colnames(HCCOverlapSig)[24] <- "Role"
HCCOverlapSigName <- HCCOverlapSig$GeneName

# Store the coerced values back in the column (a bare as.numeric() call only
# prints them); the outer parentheses keep the values echoed in the report.
(HCCOverlapAll$Log2FC <- as.numeric(as.character(HCCOverlapAll$Log2FC)))
##   [1]  0.404725166 -0.221964092  0.016734882  0.051078633  0.037787073
##   [6]  0.113036650 -0.226152845 -0.209290827 -0.305974715 -0.526246431
##  [11] -0.217779189  0.658775688 -0.183238657 -0.208291947  1.333517521
##  [16] -0.071475200  0.395124621  0.133341646  0.336747128 -0.211811552
##  [21]  0.317480309  0.315871761 -0.059744761  0.019933077  0.054933823
##  [26] -0.742956945  0.251008436 -0.142688557 -0.294500935  0.217019592
##  [31]  0.009539927 -0.035722444 -0.067099901  0.008407275 -0.002068983
##  [36] -0.036226908 -0.600543776  0.019467392  0.396642639  0.136757905
##  [41]  0.191577558 -0.019531139 -0.132451044  0.340475603 -0.022411049
##  [46]  0.177343152 -0.631515210 -0.023647229 -0.071986259 -0.526764028
##  [51] -0.249495387 -0.298675952  0.061617563  0.053014355  0.085164012
##  [56]  0.546566230 -0.114590944 -0.024632024 -0.135202241 -0.167255303
##  [61] -0.086358340  0.011077903 -0.386660988  0.154348648  0.015421603
##  [66] -0.089351885 -0.679168161  0.128147042 -0.212221158 -0.675956248
##  [71] -0.034596818 -0.628057361 -0.177332894 -0.094313953 -0.434105942
##  [76]  2.274803666  0.076221144 -0.007274094 -0.391531178  0.242590146
##  [81]  0.085589855 -0.351780382 -0.112473825 -0.269728741 -0.041137916
##  [86] -0.041137916 -0.184449332  0.252840582 -0.120677967 -0.191967064
##  [91] -0.210087566 -0.754045782 -0.205366966 -0.535762460  0.609591141
##  [96]  0.148176521 -0.145206256  0.039888177  0.530290162  0.050389101
## [101]  0.097320517  0.198729065  0.034230289 -0.368435878 -0.275789602
## [106] -0.373406791  0.121702198  0.046170824  0.047982755  0.067275997
## [111] -0.324626763  0.242327829 -0.295653884 -0.210183998 -0.251444057
## [116]  0.033572540 -0.064420278  0.152776929  0.094942247 -0.415315871
## [121]  0.408204429  0.093084239  1.258748571  0.330732861  0.541147077
## [126]  0.464766513  0.088180100 -0.620552085 -0.417701596  0.128041536
## [131] -0.086346469  0.127859758  0.460751264  0.702858544  0.410765735
## [136] -0.434266624  0.371926687  1.613971341  0.190906045  0.044512021
## [141]  0.098741378 -0.191848998 -0.093362391  0.056903494 -0.132398546
## [146] -0.219458487  0.110281260  0.114898444  0.079333704  0.125773536
## [151]  0.045560882  0.318378747  1.916431384  0.430020700 -0.196841780
## [156]  0.389036548  0.384627315 -0.104512454  0.329407547 -0.097892927
## [161] -0.501222502  0.147481684
# Store the coerced HCC1419 fold changes back in the column (a bare
# as.numeric() call only prints them); the parentheses keep the values echoed.
(HCCOverlapAll$log2FC <- as.numeric(as.character(HCCOverlapAll$log2FC)))
##   [1] 2.80 2.51 2.54 2.59 2.17 2.42 2.72 2.72 2.09 2.73 3.06 2.22 2.59 2.83 2.37
##  [16] 2.21 2.01 2.11 2.15 2.61 2.22 3.10 2.49 3.31 2.09 2.10 2.21 2.19 2.59 2.13
##  [31] 3.16 2.28 2.35 2.43 2.48 2.41 2.56 2.98 2.19 2.30 2.06 2.41 2.17 2.75 2.42
##  [46] 2.70 3.22 3.10 2.16 2.64 3.01 2.46 2.90 2.90 2.02 2.31 3.32 2.94 2.76 2.76
##  [61] 2.03 2.30 2.04 2.85 3.13 2.56 2.40 2.11 2.03 3.26 2.20 2.88 2.36 2.12 2.49
##  [76] 2.09 3.26 2.30 2.71 2.86 2.16 2.31 5.00 2.03 2.51 2.51 3.40 2.40 2.53 2.36
##  [91] 2.94 2.45 2.51 2.39 3.34 2.79 2.08 3.14 3.06 2.77 3.08 2.07 3.29 2.80 2.70
## [106] 2.02 2.64 2.47 2.47 2.04 2.37 2.20 2.62 2.07 2.63 2.02 2.35 2.25 2.54 2.83
## [121] 2.04 2.31 3.64 2.62 2.67 2.59 3.09 2.31 2.38 2.84 2.51 2.77 2.07 2.98 2.94
## [136] 2.15 2.46 5.00 2.52 2.62 2.48 2.33 4.84 3.38 2.00 2.08 2.85 2.43 2.28 2.49
## [151] 4.55 3.21 2.69 2.05 2.20 4.57 4.57 2.66 2.02 4.05 3.16 2.22
# Keep only proteins with a positive TurboID Log2FC (rows with NA are dropped).
HCCSig <- HCCOverlapAll[which(HCCOverlapAll$Log2FC > 0), , drop = FALSE]
# x is the TurboID Log2FC column; y is the separate log2FC column.
ggplot() +
  geom_point(
    data = HCCSig,
    mapping = aes(x = Log2FC, y = log2FC),
    colour = "red",
    size = 1.5
  ) +
  gghighlight(
    GeneName %in% HCCOverlapSigName,
    unhighlighted_params = aes(colour = "lightblue"),
    use_group_by = FALSE
  ) +
  geom_text_repel(
    mapping = aes(
      x = Log2FC,
      y = log2FC,
      label = ifelse(GeneName %in% HCCOverlapSigName, GeneName, "")
    )
  )