library("Biostrings")
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## Loading required package: XVector
GenerateDNAseq <- function(l)
{
seq <- sample(DNA_ALPHABET[1:4], size=l, replace=TRUE)
seq <- paste(seq, collapse="")
return(DNAString(seq))
}
SubstituteSymbol <- function(x,k, len_alphabet)
{
symbol <- alphabet[sample(1:len_alphabet, 1)]
substr(x,k,k) <- symbol
return(x)
}
InsertWithLeftShift <- function(x,k, len_alphabet)
{
substr(x, 1, k-1) <- substr(x, 2, k)
res <- SubstituteSymbol(x,k, len_alphabet)
return(res)
}
InsertWithRightShift <- function(x,k, len_alphabet)
{
substr(x, k+1,nchar(x)) <- substr(x, k, nchar(x) - 1)
res <- SubstituteSymbol(x,k, len_alphabet)
return(res)
}
DelAndLeftShift <- function(x,k, len_alphabet)
{
substr(x, 2, k) <- substr(x, 1, k-1)
res <- SubstituteSymbol(x,1, len_alphabet)
return(res)
}
DelAndRightShift <- function(x,k, len_alphabet)
{
substr(x, k,nchar(x)-1) <- substr(x, k+1, nchar(x))
res <- SubstituteSymbol(x,nchar(x), len_alphabet)
return(res)
}
Neighbor <- function(x, len_alphabet)
{
k = sample(1:len_alphabet, 1)
roll <- sample(1:5, 1, prob = c(0.5, 1/8,1/8,1/8,1/8))
if(roll == 1) return(SubstituteSymbol(x,k, len_alphabet))
if(roll == 2) return(InsertWithLeftShift(x,k, len_alphabet))
if(roll == 3) return(InsertWithRightShift(x,k, len_alphabet))
if(roll == 4) return(DelAndLeftShift(x,k, len_alphabet))
if(roll == 5) return(DelAndRightShift(x,k, len_alphabet))
}
isNOTflat <- function(H, Smax, Smin)
{
res <- sum(H > 0.6*sum(H)/(Smax - Smin + 1))
if(res == Smax) return(FALSE)
else return(TRUE)
}
MetropolisUpdate <- function(y,s,w, query, len_alphabet = 4, mat,gapOpening = 12, gapExtension = 1)
{
x <- query
y <- toString(y)
y_new <- Neighbor(y, len_alphabet)
y <- DNAString(y)
y_new <- DNAString(y_new)
resAli <- pairwiseAlignment(y_new, x, type = "local", substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension) # посмотреть параметры
s_new <- resAli@score
alpha <- exp(w[s_new] - w[s])
if(runif(1)< alpha)
return(list(y_new, s_new))
else
return(list(y,s))
}
WangLandau <- function(w, phi, phiFinal,Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening= 12, gapExtension = 1)
{
x <- query
y <- subject
resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
s <- resAli@score
kMax = 15
k <- 0
while(k <= kMax)
{
H <- rep(0, Smax)
while(isNOTflat(H, Smax, Smin))
{
res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat,gapOpening, gapExtension)
y <- res[[1]]
s <- res[[2]]
H[s] <- H[s] + 1
print(H)
w[s] <- w[s] - phi
}
phi <- phi/2
#phi <- sqrt(phi)
k = k + 1
}
H <- rep(0, Smax)
for(j in 1:Nsim)
{
res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat, gapOpening, gapExtension)
y <- res[[1]]
s <- res[[2]]
H[s] <- H[s] + 1
print(j)
}
return(list(H, w))
}
######## Параметры ########
l = 5 # длина строки
#query <- GenerateDNAseq(l) # что ищем, фиксированный
query <- DNAString("ACGTG")
subject <- GenerateDNAseq(l)
alphabet <- alphabet(query, baseOnly=TRUE)
len_alphabet <- length(alphabet)
Smin <- 1 # мин значение score
Smax <- length(query) # макс значение score
Nsim <- 10^6 # количество итераций в генерации выборки
weight<- rep(1, Smax)
weight_log <- rep(0, Smax) # веса для WangLandau
phi <- exp(0.1)
#phi_log <- -0.1
phi_log <- 1
phiFinal <- exp(0.0002)
phiFinal_log <- 0.0002
mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
gapOpening = 12
gapExtension = 1
# resWL <- WangLandau(w = weight_log, phi_log, phiFinal_log, Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening, gapExtension)
#
# hist_res <- resWL[[1]]
# hist_res
# weight_res <- resWL[[2]]
# weight_res
####### WL Score #######
setwd("E:/Study/Magistracy/10 term/НИР/Prog")
s_res <- read.table("ScoreWL")
s <- read.table("ScoreSample")
weight.wl <- c(9.296220e-11, 6.467032e-11, 3.392456e-10, 3.435640e-09, 2.993556e-08)
hist.wl <- table(s_res)
barplot(hist.wl/sum(hist.wl), main = "Scores without weights")

scores_weight <- hist.wl*(1/weight.wl)
const <- sum(scores_weight)
prob.wl <- (1/const)*scores_weight
barplot(prob.wl, main = "Scores with weights", col=rgb(0,0,1,1/4))
# #h1 <- hist(scores_weight)
###### Score Sample ######
# s <- NULL
# x <- query
# y <- GenerateDNAseq(l)
# H_s <- rep(0, Smax)
# for (i in 1:Nsim)
# {
# y <- Neighbor(toString(y), len_alphabet)
# res <- pairwiseAlignment(y,x, substitutionMatrix = mat, type = "local", gapOpening=12, gapExtension=1)
# print(paste(i, "sample"))
# H_s[res@score] = H_s[res@score] + 1
#
# }
hist.sample <- table(s)
barplot(hist.sample/sum(hist.sample),add = TRUE, col=rgb(0,1,0,1/4))
prob.sample <- hist.sample/sum(hist.sample)
conf <- function(z, alpha = 0.05)
{
confUp <- z + qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
conftDown <- z - qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
c(z,conftDown, confUp)
}
#library("Hmisc")
conf.prob.sample <- lapply(as.vector(prob.sample), conf)
conf.prob.wl <- lapply(as.vector(prob.wl), conf)
# l.prop.score <- lapply(as.vector(s_tbl), function(x) {binconf(x, Nsim)} )
# l.prop.wl <- lapply(as.vector(scores_weight), function(x) {binconf(x, Nsim)} )
df1 <- as.data.frame(t(as.data.frame(conf.prob.sample)), row.names = 1)
df2 <- as.data.frame(t(as.data.frame(conf.prob.wl)), row.names = 1)
df <- rbind(df1, df2)
#df <- df[with(df,order(V1)),]
df <- cbind(x =c(rep("Sample",5),rep("WL", 5)), df)
colnames(df) <- c("x","PropScore", "UpperCI", "LowerCI")
require(ggplot2)
## Loading required package: ggplot2

plot.CI <- function(a,b, num_score)
{
ggplot(df[c(a,b),], aes(x = x,y = PropScore)) +
geom_point(size = 4) +
geom_errorbar(aes(ymax = UpperCI, ymin = LowerCI))+
ggtitle(paste("Score = ", num_score))
}
num_score = 1
for (i in 1:(nrow(df) - l))
{
print(plot.CI(i, i+l, num_score))
num_score <- num_score + 1
print(df[c(i,(i+l)),])
}

## x PropScore UpperCI LowerCI
## 1 Sample 0.3150690 0.3141585 0.3159795
## 6 WL 0.3145039 0.3135939 0.3154140

## x PropScore UpperCI LowerCI
## 2 Sample 0.5592280 0.5582549 0.5602011
## 7 WL 0.5591086 0.5581355 0.5600817

## x PropScore UpperCI LowerCI
## 3 Sample 0.1106030 0.1099883 0.1112177
## 8 WL 0.1113157 0.1106992 0.1119322

## x PropScore UpperCI LowerCI
## 4 Sample 0.0140500 0.01381932 0.01428068
## 9 WL 0.0140769 0.01384600 0.01430780

## x PropScore UpperCI LowerCI
## 5 Sample 0.001050000 0.0009865233 0.001113477
## 10 WL 0.000994912 0.0009331211 0.001056703
#setwd("/media/ekaterina/Data/Study/Magistracy/10 term/")
# setwd("E:/??????/????? ????. ???????????? (?????)/????? 4 2.05.17/Prog")
# write(s_res, file = "ScoreWL", ncolumns = 1)
# write(s, file = "ScoreSample", ncolumns = 1)