Rare event sampling

library("Biostrings")

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums

## Loading required package: IRanges

## Loading required package: XVector

GenerateDNAseq <- function(l)
{
      seq <- sample(DNA_ALPHABET[1:4], size=l, replace=TRUE)
      seq <-  paste(seq, collapse="")
      return(DNAString(seq))
}


SubstituteSymbol <- function(x,k, len_alphabet)
{
      symbol <- alphabet[sample(1:len_alphabet, 1)]
      substr(x,k,k) <- symbol
      return(x)
}

InsertWithLeftShift <- function(x,k, len_alphabet)
{
      substr(x, 1, k-1) <- substr(x, 2, k)
      res <- SubstituteSymbol(x,k, len_alphabet)
      return(res)
}

InsertWithRightShift <- function(x,k, len_alphabet)
{
      substr(x, k+1,nchar(x)) <- substr(x, k, nchar(x) - 1)
      res <- SubstituteSymbol(x,k, len_alphabet)
      return(res)
}

DelAndLeftShift <- function(x,k, len_alphabet)
{
      substr(x, 2, k) <- substr(x, 1, k-1)
      res <- SubstituteSymbol(x,1, len_alphabet)
      return(res)
}

DelAndRightShift <- function(x,k, len_alphabet)
{
      substr(x, k,nchar(x)-1) <- substr(x, k+1, nchar(x))
      res <- SubstituteSymbol(x,nchar(x), len_alphabet)
      return(res)
}

Neighbor <- function(x, len_alphabet)
{
      x <- toString(x)
      k = sample(1:len_alphabet, 1)
      roll <- sample(1:5, 1, prob = c(0.5, 1/8,1/8,1/8,1/8))
      if(roll == 1) return(SubstituteSymbol(x,k, len_alphabet))
      if(roll == 2) return(InsertWithLeftShift(x,k, len_alphabet))
      if(roll == 3) return(InsertWithRightShift(x,k, len_alphabet))
      if(roll == 4) return(DelAndLeftShift(x,k, len_alphabet))
      if(roll == 5) return(DelAndRightShift(x,k, len_alphabet))
}

isNOTflat <- function(H, Smax, Smin)
{
      res <- sum(H > 0.6*sum(H)/(Smax - Smin + 1))
      if(res == Smax) return(FALSE)
      else return(TRUE)
}


MetropolisUpdate <- function(y,s,w, query, len_alphabet = 4, mat,gapOpening = 12, gapExtension = 1)
{
      x <- query
      y_new <- DNAString(Neighbor(y, len_alphabet))
      
      resAli <- pairwiseAlignment(y_new, x, type = "local", substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension) # РїРѕСЃРјРѕС‚СЂРµС‚СЊ РїР°СЂР°РјРµС‚СЂС‹
      s_new <- resAli@score
      alpha <- w[s_new]-w[s]
      if(log(runif(1))< alpha)
            return(list(y_new, s_new))
      else
            return(list(y,s))
}

WangLandau <- function(w, phi, phiFinal,Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening= 12, gapExtension = 1)
{
      x <- query
      y <- subject
      resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
      s <- resAli@score
      while(phi > phiFinal)
      {
            H <- rep(0, Smax)
            while(isNOTflat(H, Smax, Smin))
            {
                  res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat,gapOpening, gapExtension)
                  y <- res[[1]]
                  s <- res[[2]]
                  H[s] <- H[s] + 1
                  print(H)
                  w[s] <- w[s] - phi
            }
            phi <- phi/2
            #phi <- sqrt(phi)
      }
      s <- rep(0, Nsim)
      H <- rep(0, Smax)
      for(j in 1:Nsim)
      {
            res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat, gapOpening, gapExtension)
            y <- res[[1]]
            s[j] <- res[[2]]
            H[s] <- H[s] + 1
            print(j)
      }
      return(list(H, w, s))
}


######## РџР°СЂР°РјРµС‚СЂС‹ ########
l = 5 # РґР»РёРЅР° СЃС‚СЂРѕРєРё
#query <- GenerateDNAseq(l) # С‡С‚Рѕ РёС‰РµРј, С„РёРєСЃРёСЂРѕРІР°РЅРЅС‹Р№
query <- DNAString("ACGTG")
subject <- GenerateDNAseq(l)
alphabet <- alphabet(query, baseOnly=TRUE)
len_alphabet <- length(alphabet)
Smin <- 1  # РјРёРЅ Р·РЅР°С‡РµРЅРёРµ score
Smax <- length(query)   # РјР°РєСЃ Р·РЅР°С‡РµРЅРёРµ score
Nsim <- 10^6 # РєРѕР»РёС‡РµСЃС‚РІРѕ РёС‚РµСЂР°С†РёР№ РІ РіРµРЅРµСЂР°С†РёРё РІС‹Р±РѕСЂРєРё
weight<- rep(1, Smax)
weight_log <- rep(0, Smax)  # РІРµСЃР° РґР»СЏ WangLandau
phi <- exp(0.1) 
#phi_log <- -0.1
phi_log <- 1
phiFinal <- exp(0.0002)
phiFinal_log <- 0.0002
mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
gapOpening = 12
gapExtension = 1

#resWL <- WangLandau(w = weight_log, phi_log, phiFinal_log, Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening, gapExtension)
# 
# hist_res <- resWL[[1]]
# hist_res
# weight_res <- resWL[[2]]
# weight_res


########## SAmpling ############

###### Eq Weight #######
# s.mh1 <- rep(NA, Nsim)
# x <- query
# y <- subject
# resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
# s.mh1[1] <- resAli@score
# w= c(1,1,1,1,1)
# 
# for (i in 2:Nsim)
# {
#       res.mh <- MetropolisUpdate(y, s.mh1[i-1], w, query, len_alphabet, mat, gapOpening, gapExtension)
#       y <- res.mh[[1]]
#       s.mh1[i] <- res.mh[[2]]
#       print(i)
# }
# 
# write(s.mh1, file = "ScoreMH1_2", ncolumns = 1)
# 
# ##### Weight 12345 ########
# s.mh2 <- rep(0, Nsim)
# x <- query
# y <- subject
# resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
# s.mh2[1] <- resAli@score
# w= c(1,2,3,4,5)
# 
# for (i in 2:Nsim)
# {
#       res.mh <- MetropolisUpdate(y, s.mh2[i-1], w, query, len_alphabet, mat, gapOpening, gapExtension)
#       y <- res.mh[[1]]
#       s.mh2[i] <- res.mh[[2]]
#       print(paste(i, 2))
# }
# 
# ##### Score MK ######
# x <- query
# y <- GenerateDNAseq(l)
# resSample = rep(0, Nsim)
# for (i in 1:Nsim)
# {
#       y <- GenerateDNAseq(l)
#       res <-  pairwiseAlignment(y,x, substitutionMatrix = mat, type = "local", gapOpening=12, gapExtension=1)
#       resSample[i] <- res@score
#       print(paste(i, "sample"))
# }


setwd("E:/Study/Magistracy/10 term/NIR/Prog")
#setwd("/media/ekaterina/Data/Study/Magistracy/10 term/NIR/Prog/")
resWL <- read.table("ScoreWL")
weight.wl <- c(9.296220e-11, 6.467032e-11, 3.392456e-10, 3.435640e-09, 2.993556e-08)

resSample <- read.table("ScoreMK")

resWL2 <- read.table("ScoreWL2")
weight.wl2 <- exp(tail(resWL2$V1,5))
resWL2 <- resWL2$V1[1:10^6]

res.mh1 <- read.table("ScoreMH1")
weight.mh1 <- c(1,1,1,1,1)

res.mh2 <- read.table("ScoreMH12345")
weight.mh2 <- exp(c(1,2,3,4,5))


GetConst <- function(x, w, addPlot = TRUE)
{
      hist <- table(x)
      scores_weight <- hist*(1/w)
      const <- sum(scores_weight)
      prob <- (1/const)*scores_weight
      barplot(prob, main = "Scores with weights", col=rgb(0,0,1,1/4), add = addPlot)
      return(const)
}

const.wl <- GetConst(resWL, weight.wl, FALSE)
const.mh1 <- GetConst(res.mh1, weight.mh1)
const.mh2 <- GetConst(res.mh2, weight.mh2)


hist.sample <- table(resSample)
barplot(hist.sample/sum(hist.sample),add = TRUE, col=rgb(0,1,0,1/4))
prob.sample <- hist.sample/sum(hist.sample)




conf.mk <- function(z, alpha = 0.05)
{ 
      confUp <- z + qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
      conftDown <- z - qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
      c(z,conftDown, confUp)
}

se.bm <- function(x, g, i) {
      n <- length(x)
      b <- floor(sqrt(n))
      a <- floor(n / b)
      
      y <- sapply(1:a, function(k) sum(g(x[((k - 1) * b + 1):(k * b)])))
      #y <- sapply(1:a, function(k) sum(g(x[((k - 1) * b + 1):(k * b)]))* (1/weight.wl[i])*(1/const))
      gx <- g(x)
      mu.hat <- sum(gx)
      #mu.hat <- sum(gx) * (1/weight.wl[i])*(1/const)
      var.hat <- b * sum((y - mu.hat)^2) / (a - 1)
      se <- sqrt(var.hat / n)
      
      list(mu = mu.hat, se.mean = se, var = var.hat)
}

conf.mc <- function(x, w, const)
{
      df.conf <- matrix(data = 0,nrow = 5, ncol = 3)
      for (i in 1:l)
      {
            tmp <- se.bm(t(x), g =function(x) (1/const)*ifelse(x == i, 1, 0)/w[i], i)
            #tmp <- se.bm(t(x), g = function(x) ifelse(x == i, 1, 0),i )
            se <- tmp$se.mean
            m <-tmp$mu
            df.conf[i,] <- c(m, m - qnorm(0.975)*se, m + qnorm(0.975)*se)
      }
      return(df.conf)
}

conf.prob.sample <- lapply(as.vector(prob.sample), conf.mk)
df.sample <- cbind(x= c(rep("Sample",5)), as.data.frame(t(as.data.frame(conf.prob.sample)), row.names = 1))

df.wl <-cbind(x=c(rep("WL",5)), as.data.frame(conf.mc(resWL, weight.wl, const.wl)))

df.mh1 <- cbind(x =c(rep("MH1",5)), as.data.frame(conf.mc(res.mh1, weight.mh1, const.mh1)))

df.mh2 <- cbind(x= c(rep("MH12345",5)), as.data.frame(conf.mc(res.mh2, weight.mh2, const.mh2)))

df <- rbind(df.sample, df.wl, df.mh1, df.mh2)
lengthCI <- df[,4] - df[,3]
df <- cbind(df, lengthCI)
colnames(df) <- c("x","PropScore", "LowerCI", "UpperCI", "lengthCI")


require(ggplot2)

## Loading required package: ggplot2

plot.CI <- function(p1, p2, p3, p4, num_score)
{
      ggplot(df[c(p1, p2, p3, p4),], aes(x = x,y = PropScore)) +
            geom_point(size = 4) +
            geom_errorbar(aes(ymax = UpperCI, ymin = LowerCI))+
            ggtitle(paste("Score = ", num_score))
}

num_score = 1
for (i in 1:5)
{
      
      print(plot.CI(i, i+l, i + 2*l, i + 3*l,num_score))
      num_score <- num_score + 1
      print(df[c(i, i+l, i + 2*l, i + 3*l),])
      
}

##          x PropScore   LowerCI   UpperCI    lengthCI
## 1   Sample 0.3165470 0.3156354 0.3174586 0.001823272
## 6       WL 0.3145039 0.2950209 0.3339870 0.038966098
## 11     MH1 0.3179380 0.2982422 0.3376338 0.039391568
## 16 MH12345 0.3184131 0.2986879 0.3381384 0.039450437

##          x PropScore   LowerCI   UpperCI   lengthCI
## 2   Sample 0.5592160 0.5582429 0.5601891 0.00194617
## 7       WL 0.5591086 0.5244726 0.5937445 0.06927188
## 12     MH1 0.5576460 0.5231007 0.5921913 0.06909067
## 17 MH12345 0.5580726 0.5235008 0.5926444 0.06914353

##          x PropScore   LowerCI   UpperCI    lengthCI
## 3   Sample 0.1095060 0.1088940 0.1101180 0.001224088
## 8       WL 0.1113157 0.1044199 0.1182115 0.013791683
## 13     MH1 0.1098290 0.1030253 0.1166327 0.013607485
## 18 MH12345 0.1089319 0.1021837 0.1156800 0.013496334

##          x  PropScore    LowerCI    UpperCI     lengthCI
## 4   Sample 0.01370100 0.01347316 0.01392884 0.0004556785
## 9       WL 0.01407690 0.01320485 0.01494894 0.0017440854
## 14     MH1 0.01366500 0.01281847 0.01451153 0.0016930528
## 19 MH12345 0.01367918 0.01283178 0.01452659 0.0016948099

##          x    PropScore      LowerCI      UpperCI     lengthCI
## 5   Sample 0.0010300000 0.0009671301 0.0010928699 0.0001257398
## 10      WL 0.0009949120 0.0009332786 0.0010565453 0.0001232666
## 15     MH1 0.0009220000 0.0008648834 0.0009791166 0.0001142331
## 20 MH12345 0.0009031898 0.0008472385 0.0009591410 0.0001119026

# g = function(x,i) ifelse(x == i, 1, 0)
# m <- NULL
# sum.t <- c(0,0,0,0,0)
# for (j in 1: 10000)
# {
#       s <- sample(1:5, 10000, replace = TRUE, prob = 1/weight.wl)
#       for (i in 1:l)
#       {
#             m[i] <- mean(g(s, i))
#             sum.t[i] <- sum.t[i] + ifelse(m[i] >= df.conf.wl[i,2] & m[i] <= df.conf.wl[i,3], 1,0)
#       }
#       print(j)
# }

Rare event sampling

May 29, 2017