library("Biostrings")
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## Loading required package: XVector
GenerateDNAseq <- function(l)
{
seq <- sample(DNA_ALPHABET[1:4], size=l, replace=TRUE)
seq <- paste(seq, collapse="")
return(DNAString(seq))
}
dict <- function(n, Smax, Smin)
{
x <- vector(mode = "list", length = Smax - Smin + 1)
x <- rep(n, Smax-Smin + 1)
names(x) <- seq(Smin, Smax)
return(x)
}
SubstituteSymbol <- function(x,k, len_alphabet)
{
symbol <- alphabet[sample(1:len_alphabet, 1)]
substr(x,k,k) <- symbol
return(x)
}
InsertWithLeftShift <- function(x,k, len_alphabet)
{
substr(x, 1, k-1) <- substr(x, 2, k)
res <- SubstituteSymbol(x,k, len_alphabet)
return(res)
}
InsertWithRightShift <- function(x,k, len_alphabet)
{
substr(x, k+1,nchar(x)) <- substr(x, k, nchar(x) - 1)
res <- SubstituteSymbol(x,k, len_alphabet)
return(res)
}
DelAndLeftShift <- function(x,k, len_alphabet)
{
substr(x, 2, k) <- substr(x, 1, k-1)
res <- SubstituteSymbol(x,1, len_alphabet)
return(res)
}
DelAndRightShift <- function(x,k, len_alphabet)
{
substr(x, k,nchar(x)-1) <- substr(x, k+1, nchar(x))
res <- SubstituteSymbol(x,nchar(x), len_alphabet)
return(res)
}
Neighbor <- function(x, len_alphabet)
{
k = sample(1:len_alphabet, 1)
roll <- sample(1:5, 1, prob = c(0.5, 1/8,1/8,1/8,1/8))
if(roll == 1) return(SubstituteSymbol(x,k, len_alphabet))
if(roll == 2) return(InsertWithLeftShift(x,k, len_alphabet))
if(roll == 3) return(InsertWithRightShift(x,k, len_alphabet))
if(roll == 4) return(DelAndLeftShift(x,k, len_alphabet))
if(roll == 5) return(DelAndRightShift(x,k, len_alphabet))
}
isNOTflat <- function(H, Smax, Smin)
{
res <- sum(H > 0.6*sum(H)/(Smax - Smin + 1))
if(res == Smax) return(FALSE)
else return(TRUE)
}
MetropolisUpdate <- function(y,s,w, query, len_alphabet = 4, mat,gapOpening = 12, gapExtension = 1)
{
x <- query
y <- toString(y)
y_new <- Neighbor(y, len_alphabet)
y <- DNAString(y)
y_new <- DNAString(y_new)
resAli <- pairwiseAlignment(y_new, x, type = "local", substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension) # посмотреть параметры
s_new <- resAli@score
if (s_new<=0)
{
s_new <- 0
print("########")
print(x)
print(y_new)
}
alpha <- w[[as.character(s_new)]]/ w[[as.character(s)]]
#alpha <- exp(w[[as.character(s_new)]] - w[[as.character(s)]])
if(runif(1)< alpha)
return(list(y_new, s_new))
else
return(list(y,s))
}
WangLandau <- function(w, phi, phiFinal,Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening= 12, gapExtension = 1)
{
x <- query
y <- subject
resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
s <- resAli@score
if(s<0) s <- 0
while(phi > phiFinal)
{
H <- dict(0, Smax, Smin)
while(isNOTflat(H, Smax, Smin))
{
res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat,gapOpening, gapExtension)
y <- res[[1]]
s <- res[[2]]
if (s<0) s <- 0
H[[as.character(s)]] <- H[[as.character(s)]] + 1
w[[as.character(s)]] <- w[[as.character(s)]]/phi
print(H)
#w[[as.character(s)]] <- w[[as.character(s)]] - phi
}
#phi <- phi/2
phi <- sqrt(phi)
}
H <- dict(0, Smax, Smin)
s_res <- rep(0, Nsim)
step <- 1
for(j in 1:Nsim)
{
res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat, gapOpening, gapExtension)
y <- res[[1]]
s <- res[[2]]
s_res[j] <- s
H[[as.character(s)]] <- H[[as.character(s)]] + 1
print(step)
step = step +1
}
return(list(H, w, s_res))
}
######## Параметры ########
l = 5 # длина строки
#query <- GenerateDNAseq(l) # что ищем, фиксированный
query <- DNAString("ACGTG")
subject <- GenerateDNAseq(l)
alphabet <- alphabet(query, baseOnly=TRUE)
len_alphabet <- length(alphabet)
Smin <- 1 # мин значение score
Smax <- length(query) # макс значение score
Nsim <- 10^6 # количество итераций в генерации выборки
weight<- dict(1, Smax, Smin)
weight_log <- dict(log(1), Smax, Smin) # веса для WangLandau
phi <- exp(0.1)
phi_log <- log(phi)
phiFinal <- exp(0.0002)
phiFinal_log <- log(phiFinal)
mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
gapOpening = 12
gapExtension = 1
# res <- WangLandau(w = weight, phi, phiFinal, Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening, gapExtension)
#
# hist_res <- res[[1]]
# hist_res
# weight_res <- res[[2]]
# weight_res
# s_res <- res[[3]]
setwd("/media/ekaterina/Data/Study/Magistracy/10 term/НИР/Prog/")
s_res <- read.table("ScoreWL")
s <- read.table("ScoreSample")
weight_res <- c(9.296220e-11, 6.467032e-11, 3.392456e-10, 3.435640e-09, 2.993556e-08)
## для длины 5
counts5 <- table(s_res)
barplot(counts5/sum(counts5), main = "Scores without weights")

scores_weight <- counts5*(1/weight_res)
barplot(scores_weight/sum(scores_weight), main = "Scores with weights", col=rgb(0,0,1,1/4))
scores_weight/sum(scores_weight)
## s_res
## 1 2 3 4 5
## 0.314503919 0.559108570 0.111315703 0.014076896 0.000994912
s_tbl <- table(s)
barplot(s_tbl/sum(s_tbl),add = TRUE, col=rgb(1,0,0,1/4))

s_tbl/sum(s_tbl)
## s
## 1 2 3 4 5
## 0.315069 0.559228 0.110603 0.014050 0.001050
#h1 <- hist(scores_weight)
###### Score Sample ######
# s <- NULL
# x <- query
# y <- GenerateDNAseq(l)
# for (i in 491026:Nsim)
# {
# y <- Neighbor(toString(y), len_alphabet)
# res <- pairwiseAlignment(y,x, substitutionMatrix = mat, type = "local", gapOpening=12, gapExtension=1)
# print(i)
# s[i] <- res@score
#
# }
# s_tbl <- table(s)
# barplot(s_tbl/sum(s_tbl),col=rgb(0,1,0,1/4))
prop.wl <- scores_weight/sum(scores_weight)
prop.score <- s_tbl/sum(s_tbl)
conf <- function(z)
{
confUp <- z + 1.96*sqrt(z*(1-z)/Nsim)
conftDown <- z - 1.96*sqrt(z*(1-z)/Nsim)
c(z,confUp, conftDown)
}
l.prop.score <- lapply(as.vector(prop.score), conf)
l.prop.wl <- lapply(as.vector(prop.wl), conf)
df1 <- as.data.frame(t(as.data.frame(l.prop.score)), row.names = 1)
df2 <- as.data.frame(t(as.data.frame(l.prop.wl)), row.names = 1)
df <- rbind(df1, df2)
df <- df[with(df,order(V1)),]
df <- cbind(x =rep(c(1,2),5), df)
colnames(df) <- c("x","PropScore", "UpperCI", "LowerCI")
require(ggplot2)
## Loading required package: ggplot2
plot.CI <- function(a,b, num_score)
{
ggplot(df[a:b,], aes(x = x,y = PropScore)) +
geom_point(size = 4) +
geom_errorbar(aes(ymax = UpperCI, ymin = LowerCI))+
ggtitle(paste("Score = ", num_score))
}
num_score = 5
for (i in seq(1,(nrow(df)-1), by = 2))
{
print(plot.CI(i, i+1, num_score))
num_score <- num_score - 1
print(df[i:(i+1),])
}

## x PropScore UpperCI LowerCI
## 10 1 0.000994912 0.001056704 0.0009331200
## 5 2 0.001050000 0.001113478 0.0009865221

## x PropScore UpperCI LowerCI
## 4 1 0.0140500 0.01428069 0.01381931
## 9 2 0.0140769 0.01430780 0.01384599

## x PropScore UpperCI LowerCI
## 3 1 0.1106030 0.1112177 0.1099883
## 8 2 0.1113157 0.1119322 0.1106992

## x PropScore UpperCI LowerCI
## 6 1 0.3145039 0.3154140 0.3135939
## 1 2 0.3150690 0.3159795 0.3141585

## x PropScore UpperCI LowerCI
## 7 1 0.5591086 0.5600817 0.5581354
## 2 2 0.5592280 0.5602011 0.5582549