ChiStr <- function(x) {
chiStr.v <- unlist(lapply(inverseMatrices$getNames(), function(name){
c(aa, ss) %=% strsplit(name, "-")[[1]]
chiStr <- t(as.matrix(x) - as.matrix(c(CAMuTable[CAMuTable$Residue==aa, ss], CBMuTable[CBMuTable$Residue==aa, ss]))) %*% inverseMatrices$getInvMatrix(name) %*% (as.matrix(x) - as.matrix(c(CAMuTable[CAMuTable$Residue==aa, ss], CBMuTable[CBMuTable$Residue==aa, ss])))
x <- rev(x)
chiStr_rev <- t(as.matrix(x) - as.matrix(c(CAMuTable[CAMuTable$Residue==aa, ss], CBMuTable[CBMuTable$Residue==aa, ss]))) %*% inverseMatrices$getInvMatrix(name) %*% (as.matrix(x) - as.matrix(c(CAMuTable[CAMuTable$Residue==aa, ss], CBMuTable[CBMuTable$Residue==aa, ss])))
return(min(chiStr, chiStr_rev))
}))
#print(chiStr.v)
chiStr.table <- data.table::data.table(cbind(AaCodesSingle.v,matrix(chiStr.v, ncol = 3, nrow = 20, byrow = T)))
colnames(chiStr.table) <- c("AA", "H", "B", "C")
return(chiStr.table)
}
ChiStr(c(34,43))
## AA H B C
## 1: A 351.119798868781 54.766072423982 148.43310926143
## 2: B 117.214958927936 57.8161657142611 48.0512068268409
## 3: C 97.0189957666744 50.0156381915919 68.814701886732
## 4: D 94.4521194047846 72.921983998985 97.4095895674267
## 5: E 46.844456300035 58.6073278954048 37.9363417268897
## 6: F 58.4558558968378 60.4697455659707 37.389660806511
## 7: H 182.195721658528 71.9681822029339 58.387652691814
## 8: I 199.305831995368 58.2039222602786 58.1441826683741
## 9: K 101.882097738982 47.8249481536775 45.0888360402009
## 10: L 153.865581687268 138.407036714523 115.245248071327
## 11: M 248.045239011618 133.317741572127 92.4912906993465
## 12: N 126.942249531144 96.6656784269204 60.3332577189862
## 13: P 89.6949486611014 87.2264423753447 70.0361463734268
## 14: Q 94.0287778448671 107.281297788162 65.0216317466562
## 15: R 435.680998501612 364.711615589622 266.008252842324
## 16: S 750.41730402596 657.807815282462 731.712161221233
## 17: T 1250.73596627015 1014.52532339198 1146.03569566919
## 18: V 112.064866825081 87.3314118976387 55.9795081352774
## 19: W 94.4919474240699 51.4252751829369 74.7242536717096
## 20: Y 224.091638391939 172.758815889319 89.340921189346
ChiStr.f <- function(dataCaCb.v, secondaryStructure, invCovMatList) {
names <- c()
chiStr.v <- c()
ss <- secondaryStructure
dataCaCb.m <- as.matrix(dataCaCb.v)
for (aa in AaCodesSingle.v) {
name <- paste(aa, secondaryStructure, sep = "-")
names <- c(names, name)
caMu <- CAMuTable[CAMuTable$Residue==aa, ss]
cbMu <- CBMuTable[CBMuTable$Residue==aa, ss]
mean.m <- as.matrix(c(caMu, cbMu))
invCov <- invCovMatList[[name]]
dataCaCb.m <- as.matrix(dataCaCb.v)
chiStr <- t(dataCaCb.m - mean.m) %*% invCov %*% (dataCaCb.m - mean.m)
chiStr.v <- c(chiStr.v, chiStr)
}
names(chiStr.v) <- names
return(chiStr.v)
}
ChiStr(c(34,43))
## AA H B C
## 1: A 351.119798868781 54.766072423982 148.43310926143
## 2: B 117.214958927936 57.8161657142611 48.0512068268409
## 3: C 97.0189957666744 50.0156381915919 68.814701886732
## 4: D 94.4521194047846 72.921983998985 97.4095895674267
## 5: E 46.844456300035 58.6073278954048 37.9363417268897
## 6: F 58.4558558968378 60.4697455659707 37.389660806511
## 7: H 182.195721658528 71.9681822029339 58.387652691814
## 8: I 199.305831995368 58.2039222602786 58.1441826683741
## 9: K 101.882097738982 47.8249481536775 45.0888360402009
## 10: L 153.865581687268 138.407036714523 115.245248071327
## 11: M 248.045239011618 133.317741572127 92.4912906993465
## 12: N 126.942249531144 96.6656784269204 60.3332577189862
## 13: P 89.6949486611014 87.2264423753447 70.0361463734268
## 14: Q 94.0287778448671 107.281297788162 65.0216317466562
## 15: R 435.680998501612 364.711615589622 266.008252842324
## 16: S 750.41730402596 657.807815282462 731.712161221233
## 17: T 1250.73596627015 1014.52532339198 1146.03569566919
## 18: V 112.064866825081 87.3314118976387 55.9795081352774
## 19: W 94.4919474240699 51.4252751829369 74.7242536717096
## 20: Y 224.091638391939 172.758815889319 89.340921189346
AaDensity <- function(chiStr, df=2){
return(dchisq(chiStr, df=df))
}
AaDensity(351.119798868781)
## [1] 2.846259128494142e-77
AaProb.f = function(chiStr){
prob = dchisq(chiStr, df=2)
return(prob)
}
AaProb.f(351.119798868781)
## [1] 2.846259128494142e-77
I don’t think this has issues, since for testing only dataset, the behave is typical, and there’s no calculation in this module.
GridSearch = function(input_data, from, to, aaFreq, stepNum = 50){
N <- seq(from, to, length.out = stepNum)
#print(sequence)
sumAbsDiff.v <- lapply(N, function(i) {
SumAbsDiff(i, input_data, aaFreq)
})
min_index <- which.min(sumAbsDiff.v)
min_value <- sumAbsDiff.v[min_index]
result <- list(sumAbsDiff.v, min_index, min_value, N)
return(result)
}
actual_freq <- dipeptide_data[[1]]$PriorFreq
temp_actual_freq <- actual_freq[c(1:57),3]/sum(actual_freq[c(1:57),3])
input_data <- dipeptide_data[[1]]$Dipeptide
sequence <- dipeptide_data[[1]]$AA
secStr <- dipeptide_data[[1]]$SS
from=-5
to=5
stepNum=50
onlyAA = F
tmpGlyDat <- as.data.frame(carbonDat_GU[[1]])
tmpDat <- as.data.frame(carbonDat_rmGU[[1]])
ca <- as.matrix(tmpGlyDat[,4])
seq <- tmpDat[,2]
ss <- tmpDat[,3]
cacb <- as.matrix(tmpDat[,4:5])
tmpPrior <- apply(AA_SS_label, 1, function(x){
aa <- x[1]
ss <- x[2]
pp <- prior[[1]]$Freq[which(prior[[1]]$AA==aa & prior[[1]]$SS==ss)]
ssPrior <- ifelse(length(pp)==0, 0, pp)
return(ssPrior)
})
tmpPrior <- cbind(AA_SS_label, tmpPrior)
Diff_in_actualFreq <- cbind(tmpPrior[,3], temp_actual_freq)[c(1:57),]
## Warning in cbind(tmpPrior[, 3], temp_actual_freq): number of rows of result
## is not a multiple of vector length (arg 2)
colnames(Diff_in_actualFreq) <- c("Previous", "Current")
Diff_in_actualFreq
## Previous Current
## [1,] 0.02083333333333333 0.01923076923076923
## [2,] 0.06250000000000000 0.05769230769230770
## [3,] 0.00000000000000000 0.00000000000000000
## [4,] 0.08333333333333333 0.07692307692307693
## [5,] 0.00000000000000000 0.00000000000000000
## [6,] 0.00000000000000000 0.00000000000000000
## [7,] 0.00000000000000000 0.00000000000000000
## [8,] 0.04166666666666666 0.05769230769230770
## [9,] 0.10416666666666667 0.09615384615384616
## [10,] 0.06250000000000000 0.05769230769230770
## [11,] 0.00000000000000000 0.00000000000000000
## [12,] 0.02083333333333333 0.01923076923076923
## [13,] 0.04166666666666666 0.03846153846153846
## [14,] 0.06250000000000000 0.05769230769230770
## [15,] 0.08333333333333333 0.07692307692307693
## [16,] 0.02083333333333333 0.01923076923076923
## [17,] 0.02083333333333333 0.01923076923076923
## [18,] 0.04166666666666666 0.03846153846153846
## [19,] 0.00000000000000000 0.00000000000000000
## [20,] 0.02083333333333333 0.01923076923076923
## [21,] 0.04166666666666666 0.05769230769230770
## [22,] 0.06250000000000000 0.05769230769230770
## [23,] 0.04166666666666666 0.03846153846153846
## [24,] 0.00000000000000000 0.00000000000000000
## [25,] 0.00000000000000000 0.00000000000000000
## [26,] 0.02083333333333333 0.01923076923076923
## [27,] 0.00000000000000000 0.00000000000000000
## [28,] 0.02083333333333333 0.03846153846153846
## [29,] 0.00000000000000000 0.00000000000000000
## [30,] 0.02083333333333333 0.01923076923076923
## [31,] 0.04166666666666666 0.05769230769230770
## [32,] 0.00000000000000000 0.00000000000000000
## [33,] 0.00000000000000000 0.00000000000000000
## [34,] 0.04166666666666666 0.03846153846153846
## [35,] 0.02083333333333333 0.01923076923076923
## [36,] 0.00000000000000000 0.00000000000000000
## [37,] 0.00000000000000000 0.00000000000000000
## [38,] 0.00000000000000000 0.00000000000000000
## [39,] 0.00000000000000000 0.00000000000000000
## [40,] 0.00000000000000000 0.00000000000000000
## [41,] 0.00000000000000000 0.00000000000000000
## [42,] 0.00000000000000000 0.00000000000000000
## [43,] 0.00000000000000000 0.00000000000000000
## [44,] 0.00000000000000000 0.00000000000000000
## [45,] 0.00000000000000000 0.00000000000000000
## [46,] 0.00000000000000000 0.00000000000000000
## [47,] 0.00000000000000000 0.00000000000000000
## [48,] 0.00000000000000000 0.00000000000000000
## [49,] 0.00000000000000000 0.00000000000000000
## [50,] 0.00000000000000000 0.00000000000000000
## [51,] 0.00000000000000000 0.00000000000000000
## [52,] 0.00000000000000000 0.00000000000000000
## [53,] 0.00000000000000000 0.00000000000000000
## [54,] 0.00000000000000000 0.00000000000000000
## [55,] 0.00000000000000000 0.00000000000000000
## [56,] 0.00000000000000000 0.00000000000000000
## [57,] 0.00000000000000000 0.00000000000000000
ProcDipeptideData <- function(input_data) {
processed_data <- lapply(c(1:nrow(input_data)), function(i){
nC2 <- combinat::combn(input_data[i,], 2)
peakList <- lapply( seq_len(ncol(nC2)/2), function(pair_index) {
# Returns "3 x 4"
intra <- nC2[,pair_index]
#sequential <- input_data[i,][which(!input_data[i,] %in% intra)]
sequential <- setdiff(input_data[i,], intra)
return(c(intra, sequential))
})
# peakList <- do.call(rbind, peakList)
return(peakList)
})
return(processed_data)
}
# test: processed_data <- ProcDipeptideData(input_data)
####################################
ConverFreq <- function(actual_freq, onlyAA = F){
"
These forlow 4 lines for testing the data from our generated data from RefDB.
"
right_index <- match(cname[-c(13,14,15)], paste0(actual_freq[,1], "-", actual_freq[,2])[-c(58, 59, 60)])
freq_name <- paste0(actual_freq[,1], "-", actual_freq[,2])[-c(58, 59, 60)][right_index]
freq_val <- actual_freq[,3][-c(58, 59, 60)]/sum(actual_freq[,3][-c(58, 59, 60)]) # remove glycine, re-nomolize
if (onlyAA) {
# sum every 3 of the vector
freq_val <- unname(tapply(freq_val, (seq_along(freq_val)-1) %/% 3, sum))
freq_name = c("A","R","N","D","C","Q","E", "H","I","L","K","M","F","P","S","T","Y","W","V")
}
joint_freq <- matrix(freq_val, nrow=length(freq_val)) %*% t(matrix(freq_val, nrow=length(freq_val)))
rownames(joint_freq) <- freq_name
colnames(joint_freq) <- freq_name
return(joint_freq)
}
# test: actual_frea_test <- ConverFreq(actual_freq, F)
####################################
DensityVectorFunc <- function(dat_cacb){
CaCbCS.v <- unlist(c(dat_cacb[1], dat_cacb[2]))
chiStar.v <- as.numeric(as.vector(t(ChiStr(CaCbCS.v)[,2:4])))
density.v <- AaDensity(chiStar.v)
names(density.v) <- cname
C_index <- which(cname == c("C-H","C-B","C-C"))
B_index <- which(cname == c("B-H","B-B","B-C"))
density.v[C_index] <- density.v[C_index] + density.v[B_index]
density.v <- density.v[-B_index]
return(density.v/sum(density.v))
}
# test: sum(DensityVectorFunc(c(56,65)))
####################################
Max3D_Zdim <- function(processed_data, onlyAA = F){
results <- lapply(processed_data, function(peak_lists){
results_peakListsOf3 <- lapply(peak_lists, function(peak_list) {
intra = peak_list[c(1:2)]
sequential = peak_list[c(3:4)]
intra_den_v <- DensityVectorFunc(intra)
#print(intra_den_v)
sequential_den_v <- DensityVectorFunc(sequential)
if (onlyAA) {
# sum every 3 of the vector
intra_den_v <- unname(tapply(intra_den_v, (seq_along(intra_den_v)-1) %/% 3, sum))
sequential_den_v <- unname(tapply(sequential_den_v, (seq_along(sequential_den_v)-1) %/% 3, sum))
AAname = c("A","R","N","D","C","Q","E", "H","I","L","K","M","F","P","S","T","Y","W","V")
}
joint_density <- matrix(intra_den_v, nrow=length(intra_den_v)) %*% t(matrix(sequential_den_v, nrow=length(sequential_den_v)))
return(joint_density)
})
dim_x = 57
dim_y = 57
if (onlyAA) {
dim_x = 19
dim_y = 19
}
array3d <- array(as.numeric(unlist(results_peakListsOf3)), dim=c(dim_x, dim_y, 3))
array3dMax <- apply( array3d , c(1,2) , max )
return(array3dMax)
})
return(results)
}
# test: max3D <- Max3D_Zdim(processed_data)
####################################
sum3D_Zdim <- function(dat, onlyAA = F) {
dim_z = length(dt)
dim_x = 57
dim_y = 57
if (onlyAA) {
dim_x = 19
dim_y = 19
}
array3d <- array(as.numeric(unlist(dat)), dim=c(dim_x, dim_y, dim_z))
array3dSum <- apply( array3d , c(1,2) , sum )
return(array3dSum)
}
# test: sum3D <- sum3D_Zdim(max3D)
####################################
ListAdd <- function(l, val) {
result <- lapply(l, function(i){
return(lapply(i, function(j){
return(j + val)
}))
})
return(result)
}
####################################
Dipeptide_RefCor <- function(input_data, actual_freq, from=-5, to=5, stepNum=50, onlyAA = F) {
N <- seq(from, to, length.out = stepNum)
# Set data in to 3 combination
joint_freq <- ConverFreq(actual_freq, onlyAA)
processed_data <- ProcDipeptideData(input_data) # return a list sequence_len * combination_num(3)
gridSearchResult <- unlist(lapply(N, function(step){
dat <- ListAdd(processed_data, step)
max3D <- Max3D_Zdim(dat, onlyAA)
sum3D <- sum3D_Zdim(max3D, onlyAA)
#absSumDiff <- sum(abs(sum3D - joint_freq))
absSumDiff <- sum((sum3D - joint_freq)^2)
return(absSumDiff)
}))
mininal <- N[which.min(gridSearchResult)]
result <- list(N, gridSearchResult, mininal)
return(result)
}
testFunc <- function(input_data, actual_freq, from, to, stepNum, onlyAA) {
R1 <- Dipeptide_RefCor(input_data, actual_freq, from, to, stepNum, onlyAA)
f <- R1[[3]] - 1
t <- R1[[3]] + 1
R2 <- Dipeptide_RefCor(input_data, actual_freq, f, t, stepNum, onlyAA)
return(list(R1, R2))
}
dat_set <- dipeptide_data[[1]]
in_data <- dat_set$Dipeptide
freq <- dat_set$PriorFreq
results_currentV_currentFreq <-testFunc(input_data = in_data, actual_freq = freq, from=-5, to=5, stepNum=50, onlyAA = F)
results_currentV_currentFreq[[2]][[3]]
## [1] 0.448979591836735
results_currentV_previousFreq <-testFunc(input_data = in_data, actual_freq = tmpPrior, from=-5, to=5, stepNum=50, onlyAA = F)
results_currentV_previousFreq[[2]][[3]]
## [1] 0.448979591836735
SumAbsDiff.f <- function(step, dat_cacb, dat_ca, aaFreq, invCovMatList, ss_prior) {
sumAaProb.v <- rep(0, 57)
testDataLen <- nrow(dat_cacb)
for (i in 1:testDataLen) {
aaProb.v <- c()
CaCbCS.v <- c(dat_cacb[i, 1] + step, dat_cacb[i, 2] + step)
CaCbCS.v.flip <- rev(CaCbCS.v)
chiStrH.v <- apply(rbind(ChiStr.f(CaCbCS.v, secondaryStructure = "H", invCovMatList),
ChiStr.f(CaCbCS.v.flip, secondaryStructure = "H", invCovMatList)), 2, min)
chiStrB.v <- apply(rbind(ChiStr.f(CaCbCS.v, secondaryStructure = "B", invCovMatList),
ChiStr.f(CaCbCS.v.flip, secondaryStructure = "B", invCovMatList)), 2, min)
chiStrC.v <- apply(rbind(ChiStr.f(CaCbCS.v, secondaryStructure = "C", invCovMatList),
ChiStr.f(CaCbCS.v.flip, secondaryStructure = "C", invCovMatList)), 2, min)
chiStr.v <- c(chiStrB.v, chiStrC.v, chiStrH.v) # 19 AA SS Order: B, C, H
Prob <- AaProb.f(chiStr.v)
Prob[3] <- Prob[2] + Prob[3]
Prob[23] <- Prob[22] + Prob[23]
Prob[43] <- Prob[42] + Prob[43]
Prob <- Prob[-c(2,22,42)]
PostProb <- Prob#*ss_prior[1:57]
sumAaProb.v <- sumAaProb.v + PostProb
}
sumAaProb.v[1:57] <- sumAaProb.v[1:57]*AA57OLWeights
sumAaProb.v <- sumAaProb.v/sum(sumAaProb.v)
actualFreq.v <- aaFreq$tmpPrior[1:57]
actualFreq.v[1:57] <- actualFreq.v[1:57] %*% as.matrix(AA57OLMatrix)
actualFreq.v[1:57] <- actualFreq.v[1:57]*AA57OLWeights
actualFreq.v <- actualFreq.v/sum(actualFreq.v)
print(length(actualFreq.v))
#print(length(sumAaProb.v*control))
sumAbsDiff <- sum(abs(sumAaProb.v*control - actualFreq.v*control))
return(sumAbsDiff)
}