Introduction

This report is an exercise in simulating a toy version of a protein assay inspired by the aptamer-based Nautilus device. The proteome used is the collection of human blood-secreted proteins from the Protein Atlas. All aptamers are tripeptides, and 1% of possible tripeptides here actually have an aptamer to be used for detection.

Data

Proteins

Human proteins from the Protein Atlas known to be secreted to blood:

hpa = read.csv("data/proteinatlas.csv")
hpa = hpa %>% dplyr::select(Gene, Ensembl, Gene.description, Uniprot, Secretome.location)
hpa = hpa %>% mutate(SecretedBlood = Secretome.location == "Secreted to blood")
secretome = hpa %>% dplyr::filter(SecretedBlood) %>% dplyr::select(-Secretome.location,-SecretedBlood)

# build secretome library and aptamer library
edb <- EnsDb.Hsapiens.v86
prts1 <- proteins(edb, filter = GeneNameFilter(secretome$Gene), columns = c("gene_id","protein_sequence")) %>% data.frame
prts = prts1 %>% group_by(gene_name) %>% dplyr::slice(1)
meanConc = read.csv("data/meanConc.txt",sep="\t",header=F) # from 
colnames(meanConc) = c("gene_name","description","X40.g.L.1","x")
meanConc = meanConc %>% dplyr::select(-x)
meanConc <- meanConc %>% separate(X40.g.L.1, c('MeanConc', 'MeanUnit'), sep = " ") %>% mutate(MeanConc = as.numeric(MeanConc))
multHash = c(
  "g/L" = 1e9,
  "mg/L" = 1e6,
  "µg/L" = 1e3,
  "ng/L" = 1
)
meanConc = meanConc %>% mutate(MeanVal = MeanConc * multHash[MeanUnit])
prts = inner_join(prts,meanConc)
datatable(prts %>% dplyr::select(-protein_sequence, -MeanVal))

Aptamers

Here we create the aptamers. 1% of possible aptamers are available for use. Binding affinities of aptamers are assumed to be experimentally measured with a standard deviation of log10 0.6. A differential affinity of specific to non-specific aptamer binding is 1 * 104.

# set  parameters
onTargetAffinity = 1e-5
offTargetAffinity = 1e-1
conc = 1e-3
pctAptamerCoverage = 0.1
affinityStdDev = 0.6

get_kmers <- function(string, k=3) {
  if (k > nchar(string)) { stop("k cannot be greater than the length of the string") } # Check if k is greater than the length of the string
  kmers <- character(0) # Initialize an empty vector to store the kmers
  for (i in 1:(nchar(string) - k + 1)) { # Loop through the string to extract kmers
    kmer <- substr(string, i, i + k - 1)
    kmers <- c(kmers, kmer)
  }
  kmers
}

trimers = sapply(prts$protein_sequence, get_kmers) %>% unlist %>% unique
aptamers = sample(trimers, size = floor(length(trimers)/2))
onTargetAffinities = rnorm(length(aptamers), mean = log10(onTargetAffinity), sd = affinityStdDev)
names(onTargetAffinities) = aptamers
offTargetAffinities = rnorm(length(aptamers), mean = log10(offTargetAffinity), sd = affinityStdDev)
names(offTargetAffinities) = aptamers

3949 aptamers are made.

Plot the distribution of observed aptamer affinities for all tripeptides. The large peak is low affinity non-specific binding, and the small peak around -6 is the high affinity on-target binding.

affinityMatrix = do.call(cbind,lapply(prts$protein_sequence, function(aseq) {
  aseq_trimer_table = table(get_kmers(aseq))
  aseq_trimer_apt = aseq_trimer_table[names(aseq_trimer_table) %in% aptamers]
  aseq_trimer_apt[setdiff(aptamers,names(aseq_trimer_apt))] = 0
  all_aptamers_counts = aseq_trimer_apt[aptamers]
  affinities = offTargetAffinities + onTargetAffinities * all_aptamers_counts
  affinities
}))
colnames(affinityMatrix) = prts$gene_name
probMatrix = 1 / (1 + exp(1) ^ (affinityMatrix - log10(conc)))
plot(density(affinityMatrix),xlim=c(-9,1))

Probability fit

Let’s fit a two-group gaussian mixed model to the binding probabilities.

dataToFit = sample(unlist(probMatrix),1e3)
fit = Mclust(dataToFit, G=2, model="V")
summary(fit)
---------------------------------------------------- 
Gaussian finite mixture model fitted by EM algorithm 
---------------------------------------------------- 

Mclust V (univariate, unequal variance) model with 2 components: 

 log-likelihood    n df      BIC      ICL
       1062.198 1000  5 2089.858 2052.325

Clustering table:
  1   2 
945  55 
plot(fit, what="density", xlab = "Probability")

Make test sample

Now let’s create one test sample of 100 proteins on 100 tiles (100% occupancy!), with proteins selected at random according to typical normal healthy blood serum protein relative abundances. Mostly albumin!

numOfSpots = 1e2
testProteins = sample(prts$gene_name, size = numOfSpots, prob = prts$MeanVal / max(prts$MeanVal), replace = T)
aptamerHits = do.call(cbind, lapply(testProteins, function(testProtein) {
  sapply(aptamers, function(aptamer) {
    p = probMatrix[aptamer,testProtein]
    sample(c(1,0), 1, prob = c(p,1-p))
  })
}))
table(testProteins)
testProteins
     A2M     AHSG      ALB    APOA1     APOB       C3    C4BPB       CP 
       2        1       51        5        5        2        1        2 
   GPLD1 SERPINA1 SERPINA3       TF      VTN 
       1        2        1       26        1 

Assess test sample

Finally, let’s run our virtual assay and empirically identify the proteins by the discrete values of aptamers that bound or did not bind to each protein, by computing a joint probability of each observed aptamer binding event for each possible protein identity. The raw score is just the pearson correlation of the observed aptamer binding states and the aptamer binding probabilities of the top match. The marginal p-value is the log10 difference between the probability of the top hit and the probability of the second place hit (so, the higher the better).

cors = cor(aptamerHits, probMatrix)
testResults = bind_rows(lapply(c(1:100),function(test_i){
  aCors = cors[test_i,]
  maxCor = max(aCors)
  secondCor = sort(aCors,decreasing = T)[2]
  fit = fitdistr(aCors, "normal")
  para = fit$estimate
  p1 = pnorm(maxCor, mean = para["mean"], sd = para["sd"], lower.tail=F)
  p2 = pnorm(secondCor, mean = para["mean"], sd = para["sd"], lower.tail=F)
  names(p2) = NULL
  assignment = prts$gene_name[aCors==maxCor]
  c("True_protein" = testProteins[test_i], "Inferred_protein" = assignment, "Score" = maxCor, "Marginal_Pval" = -log10(p1) + log10(p2))
}))

datatable(testResults)

And a confusion table of the results.

confusion = confusionMatrix(data = factor(testResults$Inferred_protein), reference = factor(testResults$True_protein))
confusion
Confusion Matrix and Statistics

          Reference
Prediction A2M AHSG ALB APOA1 APOB C3 C4BPB CP GPLD1 SERPINA1 SERPINA3 TF VTN
  A2M        2    0   0     0    0  0     0  0     0        0        0  0   0
  AHSG       0    1   0     0    0  0     0  0     0        0        0  0   0
  ALB        0    0  51     0    0  0     0  0     0        0        0  0   0
  APOA1      0    0   0     5    0  0     0  0     0        0        0  0   0
  APOB       0    0   0     0    5  0     0  0     0        0        0  0   0
  C3         0    0   0     0    0  2     0  0     0        0        0  0   0
  C4BPB      0    0   0     0    0  0     1  0     0        0        0  0   0
  CP         0    0   0     0    0  0     0  2     0        0        0  0   0
  GPLD1      0    0   0     0    0  0     0  0     1        0        0  0   0
  SERPINA1   0    0   0     0    0  0     0  0     0        2        0  0   0
  SERPINA3   0    0   0     0    0  0     0  0     0        0        1  0   0
  TF         0    0   0     0    0  0     0  0     0        0        0 26   0
  VTN        0    0   0     0    0  0     0  0     0        0        0  0   1

Overall Statistics
                                     
               Accuracy : 1          
                 95% CI : (0.9638, 1)
    No Information Rate : 0.51       
    P-Value [Acc > NIR] : < 2.2e-16  
                                     
                  Kappa : 1          
                                     
 Mcnemar's Test P-Value : NA         

Statistics by Class:

                     Class: A2M Class: AHSG Class: ALB Class: APOA1 Class: APOB
Sensitivity                1.00        1.00       1.00         1.00        1.00
Specificity                1.00        1.00       1.00         1.00        1.00
Pos Pred Value             1.00        1.00       1.00         1.00        1.00
Neg Pred Value             1.00        1.00       1.00         1.00        1.00
Prevalence                 0.02        0.01       0.51         0.05        0.05
Detection Rate             0.02        0.01       0.51         0.05        0.05
Detection Prevalence       0.02        0.01       0.51         0.05        0.05
Balanced Accuracy          1.00        1.00       1.00         1.00        1.00
                     Class: C3 Class: C4BPB Class: CP Class: GPLD1
Sensitivity               1.00         1.00      1.00         1.00
Specificity               1.00         1.00      1.00         1.00
Pos Pred Value            1.00         1.00      1.00         1.00
Neg Pred Value            1.00         1.00      1.00         1.00
Prevalence                0.02         0.01      0.02         0.01
Detection Rate            0.02         0.01      0.02         0.01
Detection Prevalence      0.02         0.01      0.02         0.01
Balanced Accuracy         1.00         1.00      1.00         1.00
                     Class: SERPINA1 Class: SERPINA3 Class: TF Class: VTN
Sensitivity                     1.00            1.00      1.00       1.00
Specificity                     1.00            1.00      1.00       1.00
Pos Pred Value                  1.00            1.00      1.00       1.00
Neg Pred Value                  1.00            1.00      1.00       1.00
Prevalence                      0.02            0.01      0.26       0.01
Detection Rate                  0.02            0.01      0.26       0.01
Detection Prevalence            0.02            0.01      0.26       0.01
Balanced Accuracy               1.00            1.00      1.00       1.00