Review of Dipeptide Results

For ChiStar function, here is the comparison.

Current ChiStr

ChiStr <- function(x) {
        chiStr.v <- unlist(lapply(inverseMatrices$getNames(), function(name){
                c(aa, ss) %=% strsplit(name, "-")[[1]]
                chiStr <- t(as.matrix(x) - as.matrix(c(CAMuTable[CAMuTable$Residue==aa, ss], CBMuTable[CBMuTable$Residue==aa, ss]))) %*% inverseMatrices$getInvMatrix(name) %*% (as.matrix(x) - as.matrix(c(CAMuTable[CAMuTable$Residue==aa, ss], CBMuTable[CBMuTable$Residue==aa, ss])))
                x <- rev(x)
                chiStr_rev <- t(as.matrix(x) - as.matrix(c(CAMuTable[CAMuTable$Residue==aa, ss], CBMuTable[CBMuTable$Residue==aa, ss]))) %*% inverseMatrices$getInvMatrix(name) %*% (as.matrix(x) - as.matrix(c(CAMuTable[CAMuTable$Residue==aa, ss], CBMuTable[CBMuTable$Residue==aa, ss])))
                return(min(chiStr, chiStr_rev))
        }))
        #print(chiStr.v)

        chiStr.table <- data.table::data.table(cbind(AaCodesSingle.v,matrix(chiStr.v, ncol = 3, nrow = 20, byrow = T)))
        colnames(chiStr.table) <- c("AA", "H", "B", "C")
        return(chiStr.table)
}

ChiStr(c(34,43))

##     AA                H                B                C
##  1:  A 351.119798868781  54.766072423982  148.43310926143
##  2:  B 117.214958927936 57.8161657142611 48.0512068268409
##  3:  C 97.0189957666744 50.0156381915919  68.814701886732
##  4:  D 94.4521194047846  72.921983998985 97.4095895674267
##  5:  E  46.844456300035 58.6073278954048 37.9363417268897
##  6:  F 58.4558558968378 60.4697455659707  37.389660806511
##  7:  H 182.195721658528 71.9681822029339  58.387652691814
##  8:  I 199.305831995368 58.2039222602786 58.1441826683741
##  9:  K 101.882097738982 47.8249481536775 45.0888360402009
## 10:  L 153.865581687268 138.407036714523 115.245248071327
## 11:  M 248.045239011618 133.317741572127 92.4912906993465
## 12:  N 126.942249531144 96.6656784269204 60.3332577189862
## 13:  P 89.6949486611014 87.2264423753447 70.0361463734268
## 14:  Q 94.0287778448671 107.281297788162 65.0216317466562
## 15:  R 435.680998501612 364.711615589622 266.008252842324
## 16:  S  750.41730402596 657.807815282462 731.712161221233
## 17:  T 1250.73596627015 1014.52532339198 1146.03569566919
## 18:  V 112.064866825081 87.3314118976387 55.9795081352774
## 19:  W 94.4919474240699 51.4252751829369 74.7242536717096
## 20:  Y 224.091638391939 172.758815889319  89.340921189346

Previous ChiStr, which used in the manuscript

ChiStr.f <- function(dataCaCb.v, secondaryStructure, invCovMatList) {
        names <- c()
        chiStr.v <- c()
        ss <- secondaryStructure
        dataCaCb.m <- as.matrix(dataCaCb.v)
        
        for (aa in AaCodesSingle.v) {
                name <- paste(aa, secondaryStructure, sep = "-")
                names <- c(names, name)
                caMu <- CAMuTable[CAMuTable$Residue==aa, ss]
                cbMu <- CBMuTable[CBMuTable$Residue==aa, ss]
                mean.m <- as.matrix(c(caMu, cbMu))
                invCov <- invCovMatList[[name]]
                
                dataCaCb.m <- as.matrix(dataCaCb.v)
                chiStr <- t(dataCaCb.m - mean.m) %*% invCov %*% (dataCaCb.m - mean.m)
                chiStr.v <- c(chiStr.v, chiStr)
        }
        
        names(chiStr.v) <- names
        return(chiStr.v)
}

ChiStr(c(34,43))

##     AA                H                B                C
##  1:  A 351.119798868781  54.766072423982  148.43310926143
##  2:  B 117.214958927936 57.8161657142611 48.0512068268409
##  3:  C 97.0189957666744 50.0156381915919  68.814701886732
##  4:  D 94.4521194047846  72.921983998985 97.4095895674267
##  5:  E  46.844456300035 58.6073278954048 37.9363417268897
##  6:  F 58.4558558968378 60.4697455659707  37.389660806511
##  7:  H 182.195721658528 71.9681822029339  58.387652691814
##  8:  I 199.305831995368 58.2039222602786 58.1441826683741
##  9:  K 101.882097738982 47.8249481536775 45.0888360402009
## 10:  L 153.865581687268 138.407036714523 115.245248071327
## 11:  M 248.045239011618 133.317741572127 92.4912906993465
## 12:  N 126.942249531144 96.6656784269204 60.3332577189862
## 13:  P 89.6949486611014 87.2264423753447 70.0361463734268
## 14:  Q 94.0287778448671 107.281297788162 65.0216317466562
## 15:  R 435.680998501612 364.711615589622 266.008252842324
## 16:  S  750.41730402596 657.807815282462 731.712161221233
## 17:  T 1250.73596627015 1014.52532339198 1146.03569566919
## 18:  V 112.064866825081 87.3314118976387 55.9795081352774
## 19:  W 94.4919474240699 51.4252751829369 74.7242536717096
## 20:  Y 224.091638391939 172.758815889319  89.340921189346

For Density function, here is the comparison.

Current Version

AaDensity <- function(chiStr, df=2){
        return(dchisq(chiStr, df=df))
}
AaDensity(351.119798868781)

## [1] 2.846259128494142e-77

Previous Version

AaProb.f = function(chiStr){
        prob = dchisq(chiStr, df=2)
        return(prob)
}
AaProb.f(351.119798868781)

## [1] 2.846259128494142e-77

For GridSearch Part, this was pulled out as a single module, which was not in the previous verion that we submitted in manuscript.

I don’t think this has issues, since for testing only dataset, the behave is typical, and there’s no calculation in this module.

GridSearch = function(input_data, from, to, aaFreq, stepNum = 50){
        N <- seq(from, to, length.out = stepNum)
        #print(sequence)
        sumAbsDiff.v <- lapply(N, function(i) {
                SumAbsDiff(i, input_data, aaFreq)
        })

        min_index <- which.min(sumAbsDiff.v)
        min_value <- sumAbsDiff.v[min_index]

        result <- list(sumAbsDiff.v, min_index,  min_value, N)
        return(result)
}

Put everything together:

Current Version

Setup parameters:

actual_freq <- dipeptide_data[[1]]$PriorFreq
temp_actual_freq <- actual_freq[c(1:57),3]/sum(actual_freq[c(1:57),3])
input_data <- dipeptide_data[[1]]$Dipeptide
sequence <- dipeptide_data[[1]]$AA
secStr <- dipeptide_data[[1]]$SS

from=-5
to=5
stepNum=50
onlyAA = F

From previous results

tmpGlyDat <- as.data.frame(carbonDat_GU[[1]])
tmpDat <- as.data.frame(carbonDat_rmGU[[1]])
ca <- as.matrix(tmpGlyDat[,4])
seq <- tmpDat[,2]
ss <- tmpDat[,3]
cacb <- as.matrix(tmpDat[,4:5])
tmpPrior <- apply(AA_SS_label, 1, function(x){
                        aa <- x[1]
                        ss <- x[2]
                        
                        pp <- prior[[1]]$Freq[which(prior[[1]]$AA==aa & prior[[1]]$SS==ss)]
                        ssPrior <- ifelse(length(pp)==0, 0, pp)
                        return(ssPrior)
                        
                })
tmpPrior <- cbind(AA_SS_label, tmpPrior)
Diff_in_actualFreq <- cbind(tmpPrior[,3], temp_actual_freq)[c(1:57),]

## Warning in cbind(tmpPrior[, 3], temp_actual_freq): number of rows of result
## is not a multiple of vector length (arg 2)

colnames(Diff_in_actualFreq) <- c("Previous", "Current")
Diff_in_actualFreq

##                  Previous             Current
##  [1,] 0.02083333333333333 0.01923076923076923
##  [2,] 0.06250000000000000 0.05769230769230770
##  [3,] 0.00000000000000000 0.00000000000000000
##  [4,] 0.08333333333333333 0.07692307692307693
##  [5,] 0.00000000000000000 0.00000000000000000
##  [6,] 0.00000000000000000 0.00000000000000000
##  [7,] 0.00000000000000000 0.00000000000000000
##  [8,] 0.04166666666666666 0.05769230769230770
##  [9,] 0.10416666666666667 0.09615384615384616
## [10,] 0.06250000000000000 0.05769230769230770
## [11,] 0.00000000000000000 0.00000000000000000
## [12,] 0.02083333333333333 0.01923076923076923
## [13,] 0.04166666666666666 0.03846153846153846
## [14,] 0.06250000000000000 0.05769230769230770
## [15,] 0.08333333333333333 0.07692307692307693
## [16,] 0.02083333333333333 0.01923076923076923
## [17,] 0.02083333333333333 0.01923076923076923
## [18,] 0.04166666666666666 0.03846153846153846
## [19,] 0.00000000000000000 0.00000000000000000
## [20,] 0.02083333333333333 0.01923076923076923
## [21,] 0.04166666666666666 0.05769230769230770
## [22,] 0.06250000000000000 0.05769230769230770
## [23,] 0.04166666666666666 0.03846153846153846
## [24,] 0.00000000000000000 0.00000000000000000
## [25,] 0.00000000000000000 0.00000000000000000
## [26,] 0.02083333333333333 0.01923076923076923
## [27,] 0.00000000000000000 0.00000000000000000
## [28,] 0.02083333333333333 0.03846153846153846
## [29,] 0.00000000000000000 0.00000000000000000
## [30,] 0.02083333333333333 0.01923076923076923
## [31,] 0.04166666666666666 0.05769230769230770
## [32,] 0.00000000000000000 0.00000000000000000
## [33,] 0.00000000000000000 0.00000000000000000
## [34,] 0.04166666666666666 0.03846153846153846
## [35,] 0.02083333333333333 0.01923076923076923
## [36,] 0.00000000000000000 0.00000000000000000
## [37,] 0.00000000000000000 0.00000000000000000
## [38,] 0.00000000000000000 0.00000000000000000
## [39,] 0.00000000000000000 0.00000000000000000
## [40,] 0.00000000000000000 0.00000000000000000
## [41,] 0.00000000000000000 0.00000000000000000
## [42,] 0.00000000000000000 0.00000000000000000
## [43,] 0.00000000000000000 0.00000000000000000
## [44,] 0.00000000000000000 0.00000000000000000
## [45,] 0.00000000000000000 0.00000000000000000
## [46,] 0.00000000000000000 0.00000000000000000
## [47,] 0.00000000000000000 0.00000000000000000
## [48,] 0.00000000000000000 0.00000000000000000
## [49,] 0.00000000000000000 0.00000000000000000
## [50,] 0.00000000000000000 0.00000000000000000
## [51,] 0.00000000000000000 0.00000000000000000
## [52,] 0.00000000000000000 0.00000000000000000
## [53,] 0.00000000000000000 0.00000000000000000
## [54,] 0.00000000000000000 0.00000000000000000
## [55,] 0.00000000000000000 0.00000000000000000
## [56,] 0.00000000000000000 0.00000000000000000
## [57,] 0.00000000000000000 0.00000000000000000

ProcDipeptideData <- function(input_data) {
        processed_data <- lapply(c(1:nrow(input_data)), function(i){
                nC2 <- combinat::combn(input_data[i,], 2)
                peakList <- lapply( seq_len(ncol(nC2)/2), function(pair_index) {
                        # Returns "3 x 4"
                        intra <- nC2[,pair_index]
                        #sequential <- input_data[i,][which(!input_data[i,] %in% intra)]
                        sequential <- setdiff(input_data[i,], intra)
                        return(c(intra, sequential))
                })
                # peakList <- do.call(rbind, peakList)
                return(peakList)
        })
        return(processed_data)
}
# test: processed_data <- ProcDipeptideData(input_data)
####################################
ConverFreq <- function(actual_freq, onlyAA = F){
        "
        These forlow 4 lines for testing the data from our generated data from RefDB.
        "
        right_index <- match(cname[-c(13,14,15)], paste0(actual_freq[,1], "-", actual_freq[,2])[-c(58, 59, 60)])
        freq_name <- paste0(actual_freq[,1], "-", actual_freq[,2])[-c(58, 59, 60)][right_index]
        freq_val <- actual_freq[,3][-c(58, 59, 60)]/sum(actual_freq[,3][-c(58, 59, 60)]) # remove glycine, re-nomolize
        if (onlyAA) {
                # sum every 3 of the vector
                freq_val <- unname(tapply(freq_val, (seq_along(freq_val)-1) %/% 3, sum))
                freq_name = c("A","R","N","D","C","Q","E", "H","I","L","K","M","F","P","S","T","Y","W","V")
        }
        joint_freq <-  matrix(freq_val, nrow=length(freq_val)) %*% t(matrix(freq_val, nrow=length(freq_val)))
        rownames(joint_freq) <- freq_name
        colnames(joint_freq) <- freq_name
        return(joint_freq)
}
# test: actual_frea_test <- ConverFreq(actual_freq, F)
####################################
DensityVectorFunc <- function(dat_cacb){
        CaCbCS.v <- unlist(c(dat_cacb[1], dat_cacb[2]))
        chiStar.v <- as.numeric(as.vector(t(ChiStr(CaCbCS.v)[,2:4])))
        density.v <- AaDensity(chiStar.v)
        names(density.v) <- cname
        C_index <- which(cname == c("C-H","C-B","C-C"))
        B_index <- which(cname == c("B-H","B-B","B-C"))
        density.v[C_index] <- density.v[C_index] + density.v[B_index]
        density.v <- density.v[-B_index]
        return(density.v/sum(density.v))
}
# test: sum(DensityVectorFunc(c(56,65)))

####################################
Max3D_Zdim <- function(processed_data, onlyAA = F){
        results <-  lapply(processed_data, function(peak_lists){

                results_peakListsOf3 <- lapply(peak_lists, function(peak_list) {
                        intra = peak_list[c(1:2)]
                        sequential = peak_list[c(3:4)]

                        intra_den_v <- DensityVectorFunc(intra)
                        #print(intra_den_v)
                        sequential_den_v <- DensityVectorFunc(sequential)
                        if (onlyAA) {
                                # sum every 3 of the vector
                                intra_den_v <- unname(tapply(intra_den_v, (seq_along(intra_den_v)-1) %/% 3, sum))
                                sequential_den_v <- unname(tapply(sequential_den_v, (seq_along(sequential_den_v)-1) %/% 3, sum))
                                AAname = c("A","R","N","D","C","Q","E", "H","I","L","K","M","F","P","S","T","Y","W","V")
                        }
                        joint_density <-  matrix(intra_den_v, nrow=length(intra_den_v)) %*% t(matrix(sequential_den_v, nrow=length(sequential_den_v)))
                        return(joint_density)
                })
                dim_x = 57
                dim_y = 57
                if (onlyAA) {
                        dim_x = 19
                        dim_y = 19
                }
                array3d <- array(as.numeric(unlist(results_peakListsOf3)), dim=c(dim_x, dim_y, 3))
                array3dMax <- apply( array3d , c(1,2) , max )
                return(array3dMax)
        })
        return(results)
}

# test: max3D <- Max3D_Zdim(processed_data)
####################################
sum3D_Zdim <- function(dat, onlyAA = F) {
        dim_z = length(dt)
        dim_x = 57
        dim_y = 57
        if (onlyAA) {
                dim_x = 19
                dim_y = 19
        }
        array3d <- array(as.numeric(unlist(dat)), dim=c(dim_x, dim_y, dim_z))
        array3dSum <- apply( array3d , c(1,2) , sum )
        return(array3dSum)
}
# test: sum3D <- sum3D_Zdim(max3D)
####################################
ListAdd <- function(l, val) {
        result <- lapply(l, function(i){
                return(lapply(i, function(j){
                        return(j + val)
                }))
        })
        return(result)

}
####################################
Dipeptide_RefCor <- function(input_data, actual_freq, from=-5, to=5, stepNum=50, onlyAA = F) {
        N <- seq(from, to, length.out = stepNum)
        # Set data in to 3 combination
        joint_freq <- ConverFreq(actual_freq, onlyAA)
        processed_data <- ProcDipeptideData(input_data) # return a list sequence_len * combination_num(3)
        gridSearchResult <- unlist(lapply(N, function(step){
                dat <- ListAdd(processed_data, step)
                max3D <- Max3D_Zdim(dat, onlyAA)
                sum3D <- sum3D_Zdim(max3D, onlyAA)

                #absSumDiff <- sum(abs(sum3D - joint_freq))
                absSumDiff <- sum((sum3D - joint_freq)^2)
                return(absSumDiff)
        }))
        mininal <- N[which.min(gridSearchResult)]
        result <- list(N, gridSearchResult, mininal)
        return(result)

}

testFunc <- function(input_data, actual_freq, from, to, stepNum, onlyAA) {
        R1 <- Dipeptide_RefCor(input_data, actual_freq, from, to, stepNum, onlyAA)
        f <- R1[[3]] - 1
        t <- R1[[3]] + 1
        R2 <- Dipeptide_RefCor(input_data, actual_freq, f, t, stepNum, onlyAA)
        return(list(R1, R2))
}

dat_set <- dipeptide_data[[1]]
in_data <- dat_set$Dipeptide
freq <- dat_set$PriorFreq
results_currentV_currentFreq <-testFunc(input_data = in_data, actual_freq = freq, from=-5, to=5, stepNum=50, onlyAA = F)
results_currentV_currentFreq[[2]][[3]]

## [1] 0.448979591836735

results_currentV_previousFreq <-testFunc(input_data = in_data, actual_freq = tmpPrior, from=-5, to=5, stepNum=50, onlyAA = F)
results_currentV_previousFreq[[2]][[3]]

## [1] 0.448979591836735

Previous Version

SumAbsDiff.f <- function(step, dat_cacb, dat_ca, aaFreq, invCovMatList, ss_prior) {
        sumAaProb.v <- rep(0, 57)
        testDataLen <- nrow(dat_cacb)
        
        
        for (i in 1:testDataLen) {
                aaProb.v <- c()
                CaCbCS.v <- c(dat_cacb[i, 1] + step, dat_cacb[i, 2] + step)
                CaCbCS.v.flip <- rev(CaCbCS.v)

                chiStrH.v <- apply(rbind(ChiStr.f(CaCbCS.v, secondaryStructure = "H", invCovMatList), 
                                         ChiStr.f(CaCbCS.v.flip, secondaryStructure = "H", invCovMatList)), 2, min)
                chiStrB.v <- apply(rbind(ChiStr.f(CaCbCS.v, secondaryStructure = "B", invCovMatList), 
                                         ChiStr.f(CaCbCS.v.flip, secondaryStructure = "B", invCovMatList)), 2, min)
                chiStrC.v <- apply(rbind(ChiStr.f(CaCbCS.v, secondaryStructure = "C", invCovMatList), 
                                         ChiStr.f(CaCbCS.v.flip, secondaryStructure = "C", invCovMatList)), 2, min)
                
                chiStr.v <- c(chiStrB.v, chiStrC.v, chiStrH.v)  # 19 AA SS Order: B, C, H
                
                Prob <- AaProb.f(chiStr.v)
                Prob[3] <- Prob[2] + Prob[3]
                Prob[23] <- Prob[22] + Prob[23]
                Prob[43] <- Prob[42] + Prob[43]
                Prob <- Prob[-c(2,22,42)]
                
                PostProb <- Prob#*ss_prior[1:57]
                sumAaProb.v <- sumAaProb.v + PostProb
        }
        

        sumAaProb.v[1:57] <- sumAaProb.v[1:57]*AA57OLWeights
        sumAaProb.v <- sumAaProb.v/sum(sumAaProb.v)
        
        
        
        actualFreq.v <- aaFreq$tmpPrior[1:57]
        actualFreq.v[1:57] <- actualFreq.v[1:57] %*% as.matrix(AA57OLMatrix)
        actualFreq.v[1:57] <- actualFreq.v[1:57]*AA57OLWeights
        actualFreq.v <- actualFreq.v/sum(actualFreq.v)
        print(length(actualFreq.v))
        
        #print(length(sumAaProb.v*control))
        sumAbsDiff <- sum(abs(sumAaProb.v*control - actualFreq.v*control))
        
        return(sumAbsDiff)
}