library("Biostrings")
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## Loading required package: XVector
GenerateDNAseq <- function(l)
{
seq <- sample(DNA_ALPHABET[1:4], size=l, replace=TRUE)
seq <- paste(seq, collapse="")
return(DNAString(seq))
}
SubstituteSymbol <- function(x,k, len_alphabet)
{
symbol <- alphabet[sample(1:len_alphabet, 1)]
substr(x,k,k) <- symbol
return(x)
}
InsertWithLeftShift <- function(x,k, len_alphabet)
{
substr(x, 1, k-1) <- substr(x, 2, k)
res <- SubstituteSymbol(x,k, len_alphabet)
return(res)
}
InsertWithRightShift <- function(x,k, len_alphabet)
{
substr(x, k+1,nchar(x)) <- substr(x, k, nchar(x) - 1)
res <- SubstituteSymbol(x,k, len_alphabet)
return(res)
}
DelAndLeftShift <- function(x,k, len_alphabet)
{
substr(x, 2, k) <- substr(x, 1, k-1)
res <- SubstituteSymbol(x,1, len_alphabet)
return(res)
}
DelAndRightShift <- function(x,k, len_alphabet)
{
substr(x, k,nchar(x)-1) <- substr(x, k+1, nchar(x))
res <- SubstituteSymbol(x,nchar(x), len_alphabet)
return(res)
}
Neighbor <- function(x, len_alphabet)
{
x <- toString(x)
k = sample(1:len_alphabet, 1)
roll <- sample(1:5, 1, prob = c(0.5, 1/8,1/8,1/8,1/8))
if(roll == 1) return(SubstituteSymbol(x,k, len_alphabet))
if(roll == 2) return(InsertWithLeftShift(x,k, len_alphabet))
if(roll == 3) return(InsertWithRightShift(x,k, len_alphabet))
if(roll == 4) return(DelAndLeftShift(x,k, len_alphabet))
if(roll == 5) return(DelAndRightShift(x,k, len_alphabet))
}
isNOTflat <- function(H, Smax, Smin)
{
res <- sum(H > 0.6*sum(H)/(Smax - Smin + 1))
if(res == Smax) return(FALSE)
else return(TRUE)
}
MetropolisUpdate <- function(y,s,w, query, len_alphabet = 4, mat,gapOpening = 12, gapExtension = 1)
{
x <- query
y_new <- DNAString(Neighbor(y, len_alphabet))
resAli <- pairwiseAlignment(y_new, x, type = "local", substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension) # посмотреть параметры
s_new <- resAli@score
alpha <- w[s_new]-w[s]
if(log(runif(1))< alpha)
return(list(y_new, s_new))
else
return(list(y,s))
}
WangLandau <- function(w, phi, phiFinal,Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening= 12, gapExtension = 1)
{
x <- query
y <- subject
resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
s <- resAli@score
while(phi > phiFinal)
{
H <- rep(0, Smax)
while(isNOTflat(H, Smax, Smin))
{
res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat,gapOpening, gapExtension)
y <- res[[1]]
s <- res[[2]]
H[s] <- H[s] + 1
print(H)
w[s] <- w[s] - phi
}
phi <- phi/2
#phi <- sqrt(phi)
}
s <- rep(0, Nsim)
H <- rep(0, Smax)
for(j in 1:Nsim)
{
res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat, gapOpening, gapExtension)
y <- res[[1]]
s[j] <- res[[2]]
H[s] <- H[s] + 1
print(j)
}
return(list(H, w, s))
}
######## Параметры ########
l = 5 # длина строки
#query <- GenerateDNAseq(l) # что ищем, фиксированный
query <- DNAString("ACGTG")
subject <- GenerateDNAseq(l)
alphabet <- alphabet(query, baseOnly=TRUE)
len_alphabet <- length(alphabet)
Smin <- 1 # мин значение score
Smax <- length(query) # макс значение score
Nsim <- 10^6 # количество итераций в генерации выборки
weight<- rep(1, Smax)
weight_log <- rep(0, Smax) # веса для WangLandau
phi <- exp(0.1)
#phi_log <- -0.1
phi_log <- 1
phiFinal <- exp(0.0002)
phiFinal_log <- 0.0002
mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
gapOpening = 12
gapExtension = 1
#resWL <- WangLandau(w = weight_log, phi_log, phiFinal_log, Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening, gapExtension)
#
# hist_res <- resWL[[1]]
# hist_res
# weight_res <- resWL[[2]]
# weight_res
########## SAmpling ############
###### Eq Weight #######
# s.mh1 <- rep(NA, Nsim)
# x <- query
# y <- subject
# resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
# s.mh1[1] <- resAli@score
# w= c(1,1,1,1,1)
#
# for (i in 2:Nsim)
# {
# res.mh <- MetropolisUpdate(y, s.mh1[i-1], w, query, len_alphabet, mat, gapOpening, gapExtension)
# y <- res.mh[[1]]
# s.mh1[i] <- res.mh[[2]]
# print(i)
# }
#
# write(s.mh1, file = "ScoreMH1_2", ncolumns = 1)
#
# ##### Weight 12345 ########
# s.mh2 <- rep(0, Nsim)
# x <- query
# y <- subject
# resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
# s.mh2[1] <- resAli@score
# w= c(1,2,3,4,5)
#
# for (i in 2:Nsim)
# {
# res.mh <- MetropolisUpdate(y, s.mh2[i-1], w, query, len_alphabet, mat, gapOpening, gapExtension)
# y <- res.mh[[1]]
# s.mh2[i] <- res.mh[[2]]
# print(paste(i, 2))
# }
#
# ##### Score MK ######
# x <- query
# y <- GenerateDNAseq(l)
# resSample = rep(0, Nsim)
# for (i in 1:Nsim)
# {
# y <- GenerateDNAseq(l)
# res <- pairwiseAlignment(y,x, substitutionMatrix = mat, type = "local", gapOpening=12, gapExtension=1)
# resSample[i] <- res@score
# print(paste(i, "sample"))
# }
setwd("E:/Study/Magistracy/10 term/NIR/Prog")
#setwd("/media/ekaterina/Data/Study/Magistracy/10 term/NIR/Prog/")
resWL <- read.table("ScoreWL")
weight.wl <- c(9.296220e-11, 6.467032e-11, 3.392456e-10, 3.435640e-09, 2.993556e-08)
resSample <- read.table("ScoreMK")
resWL2 <- read.table("ScoreWL2")
weight.wl2 <- exp(tail(resWL2$V1,5))
resWL2 <- resWL2$V1[1:10^6]
res.mh1 <- read.table("ScoreMH1")
weight.mh1 <- c(1,1,1,1,1)
res.mh2 <- read.table("ScoreMH12345")
weight.mh2 <- exp(c(1,2,3,4,5))
GetConst <- function(x, w, addPlot = TRUE)
{
hist <- table(x)
scores_weight <- hist*(1/w)
const <- sum(scores_weight)
prob <- (1/const)*scores_weight
barplot(prob, main = "Scores with weights", col=rgb(0,0,1,1/4), add = addPlot)
return(const)
}
const.wl <- GetConst(resWL, weight.wl, FALSE)
const.mh1 <- GetConst(res.mh1, weight.mh1)
const.mh2 <- GetConst(res.mh2, weight.mh2)
hist.sample <- table(resSample)
barplot(hist.sample/sum(hist.sample),add = TRUE, col=rgb(0,1,0,1/4))
prob.sample <- hist.sample/sum(hist.sample)
conf.mk <- function(z, alpha = 0.05)
{
confUp <- z + qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
conftDown <- z - qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
c(z,conftDown, confUp)
}
se.bm <- function(x, g, i) {
n <- length(x)
b <- floor(sqrt(n))
a <- floor(n / b)
y <- sapply(1:a, function(k) sum(g(x[((k - 1) * b + 1):(k * b)])))
#y <- sapply(1:a, function(k) sum(g(x[((k - 1) * b + 1):(k * b)]))* (1/weight.wl[i])*(1/const))
gx <- g(x)
mu.hat <- sum(gx)
#mu.hat <- sum(gx) * (1/weight.wl[i])*(1/const)
var.hat <- b * sum((y - mu.hat)^2) / (a - 1)
se <- sqrt(var.hat / n)
list(mu = mu.hat, se.mean = se, var = var.hat)
}
conf.mc <- function(x, w, const)
{
df.conf <- matrix(data = 0,nrow = 5, ncol = 3)
for (i in 1:l)
{
tmp <- se.bm(t(x), g =function(x) (1/const)*ifelse(x == i, 1, 0)/w[i], i)
#tmp <- se.bm(t(x), g = function(x) ifelse(x == i, 1, 0),i )
se <- tmp$se.mean
m <-tmp$mu
df.conf[i,] <- c(m, m - qnorm(0.975)*se, m + qnorm(0.975)*se)
}
return(df.conf)
}
conf.prob.sample <- lapply(as.vector(prob.sample), conf.mk)
df.sample <- cbind(x= c(rep("Sample",5)), as.data.frame(t(as.data.frame(conf.prob.sample)), row.names = 1))
df.wl <-cbind(x=c(rep("WL",5)), as.data.frame(conf.mc(resWL, weight.wl, const.wl)))
df.mh1 <- cbind(x =c(rep("MH1",5)), as.data.frame(conf.mc(res.mh1, weight.mh1, const.mh1)))
df.mh2 <- cbind(x= c(rep("MH12345",5)), as.data.frame(conf.mc(res.mh2, weight.mh2, const.mh2)))
df <- rbind(df.sample, df.wl, df.mh1, df.mh2)
lengthCI <- df[,4] - df[,3]
df <- cbind(df, lengthCI)
colnames(df) <- c("x","PropScore", "LowerCI", "UpperCI", "lengthCI")
require(ggplot2)
## Loading required package: ggplot2

plot.CI <- function(p1, p2, p3, p4, num_score)
{
ggplot(df[c(p1, p2, p3, p4),], aes(x = x,y = PropScore)) +
geom_point(size = 4) +
geom_errorbar(aes(ymax = UpperCI, ymin = LowerCI))+
ggtitle(paste("Score = ", num_score))
}
num_score = 1
for (i in 1:5)
{
print(plot.CI(i, i+l, i + 2*l, i + 3*l,num_score))
num_score <- num_score + 1
print(df[c(i, i+l, i + 2*l, i + 3*l),])
}

## x PropScore LowerCI UpperCI lengthCI
## 1 Sample 0.3165470 0.3156354 0.3174586 0.001823272
## 6 WL 0.3145039 0.2950209 0.3339870 0.038966098
## 11 MH1 0.3179380 0.2982422 0.3376338 0.039391568
## 16 MH12345 0.3184131 0.2986879 0.3381384 0.039450437

## x PropScore LowerCI UpperCI lengthCI
## 2 Sample 0.5592160 0.5582429 0.5601891 0.00194617
## 7 WL 0.5591086 0.5244726 0.5937445 0.06927188
## 12 MH1 0.5576460 0.5231007 0.5921913 0.06909067
## 17 MH12345 0.5580726 0.5235008 0.5926444 0.06914353

## x PropScore LowerCI UpperCI lengthCI
## 3 Sample 0.1095060 0.1088940 0.1101180 0.001224088
## 8 WL 0.1113157 0.1044199 0.1182115 0.013791683
## 13 MH1 0.1098290 0.1030253 0.1166327 0.013607485
## 18 MH12345 0.1089319 0.1021837 0.1156800 0.013496334

## x PropScore LowerCI UpperCI lengthCI
## 4 Sample 0.01370100 0.01347316 0.01392884 0.0004556785
## 9 WL 0.01407690 0.01320485 0.01494894 0.0017440854
## 14 MH1 0.01366500 0.01281847 0.01451153 0.0016930528
## 19 MH12345 0.01367918 0.01283178 0.01452659 0.0016948099

## x PropScore LowerCI UpperCI lengthCI
## 5 Sample 0.0010300000 0.0009671301 0.0010928699 0.0001257398
## 10 WL 0.0009949120 0.0009332786 0.0010565453 0.0001232666
## 15 MH1 0.0009220000 0.0008648834 0.0009791166 0.0001142331
## 20 MH12345 0.0009031898 0.0008472385 0.0009591410 0.0001119026
# g = function(x,i) ifelse(x == i, 1, 0)
# m <- NULL
# sum.t <- c(0,0,0,0,0)
# for (j in 1: 10000)
# {
# s <- sample(1:5, 10000, replace = TRUE, prob = 1/weight.wl)
# for (i in 1:l)
# {
# m[i] <- mean(g(s, i))
# sum.t[i] <- sum.t[i] + ifelse(m[i] >= df.conf.wl[i,2] & m[i] <= df.conf.wl[i,3], 1,0)
# }
# print(j)
# }