Untitled

library("Rcpp")
library("Biostrings")

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## Loading required package: XVector

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

setwd("/media/ekaterina/Data/Study/Magistracy/10term/NIR/Prog/")
sourceCpp("/media/ekaterina/Data/Study/Magistracy/10term/NIR/Prog/Rccp/ssw_R.cpp", cacheDir = "/media/ekaterina/Data/Study/Magistracy/10term/NIR/Prog/Rccp/")

GenerateDNAseq <- function(l)
{
      seq <- sample(DNA_ALPHABET[1:4], size=l, replace=TRUE)
      seq <-  paste(seq, collapse="")
      return(DNAString(seq))
}


SubstituteSymbol <- function(x,k, len_alphabet)
{
      symbol <- sample(c("A", "C", "G", "T"), size = 1)
      substr(x,k,k) <- symbol
      return(x)
}

InsertWithLeftShift <- function(x,k, len_alphabet)
{
      substr(x, 1, k-1) <- substr(x, 2, k)
      res <- SubstituteSymbol(x,k, len_alphabet)
      return(res)
}

InsertWithRightShift <- function(x,k, len_alphabet)
{
      substr(x, k+1,nchar(x)) <- substr(x, k, nchar(x) - 1)
      res <- SubstituteSymbol(x,k, len_alphabet)
      return(res)
}

DelAndLeftShift <- function(x,k, len_alphabet)
{
      substr(x, 2, k) <- substr(x, 1, k-1)
      res <- SubstituteSymbol(x,1, len_alphabet)
      return(res)
}

DelAndRightShift <- function(x,k, len_alphabet)
{
      substr(x, k,nchar(x)-1) <- substr(x, k+1, nchar(x))
      res <- SubstituteSymbol(x,nchar(x), len_alphabet)
      return(res)
}

Neighbor <- function(x, len_alphabet)
{
      x <- toString(x)
      k = sample(1:len_alphabet, 1)
      roll <- sample(1:5, 1, prob = c(0.5, 1/8,1/8,1/8,1/8))
      switch(roll,
             SubstituteSymbol(x,k, len_alphabet),
             InsertWithLeftShift(x,k, len_alphabet),
             InsertWithRightShift(x,k, len_alphabet),
             DelAndLeftShift(x,k, len_alphabet),
             DelAndRightShift(x,k, len_alphabet))
}

isflat <- function(H, Smax, Smin)
{
      H1 <- H[Smin:Smax]
      m <- mean(H1)
      all(H1>20) & all(H1 > 0.75*m) & all(H1 < 1.25*m)
}


MetropolisUpdate <- function(y,s,w, query, len_alphabet = 4)
{
      x <- query
      y_new <- Neighbor(y, len_alphabet)
      s_new <- pairwiseAlignmentSSW(y_new, x)
      #alpha <- w[s_new]-w[s]
      alpha <- w[s_new]/w[s]
      if(runif(1)< alpha)
            return(list(y_new, s_new))
      else
            return(list(y,s))
}

WangLandau <- function(w, phi, phiFinal,Smax, Smin, Nsim, query, subject, len_alphabet)
{
      x <- query
      y <- subject
      s <- pairwiseAlignmentSSW(y, x)
      while(phi > phiFinal)
      {
            H <- rep(0, Smax)
            while(!isflat(H, Smax, Smin))
            {
                  res <- MetropolisUpdate(y, s, w, query, len_alphabet)
                  y <- res[[1]]
                  s <- res[[2]]
                  H[s] <- H[s] + 1
                  #w[s] <- w[s] - phi
                  w[s] <- w[s]/phi
            }
            #w <- w-max(w)
            #phi <- phi/2
            phi <- sqrt(phi)
            print(H)
      }
      return(w)
}    





######## Параметры ########
l = 5 # длина строки
#query <- GenerateDNAseq(l) # что ищем, фиксированный
letters <- c("A", "T", "C", "G")
ssubject <- sample(x = letters, size = l, replace = TRUE)
subject <- paste(ssubject, collapse = '')
query <- "ACGTG"
#alphabet <- alphabet(query, baseOnly=TRUE)
len_alphabet <- 4
Smin <- 1  # мин значение score
Smax <- l  # макс значение score
Nsim <- 10^6 # количество итераций в генерации выборки
weight<- rep(1, Smax)
weight_log <- rep(0, Smax)  # веса для WangLandau
phi <- exp(0.1) 
#phi_log <- -0.1
phi_log <- 0.1
phiFinal <- exp(0.0002)
phiFinal_log <- 0.0002
mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
gapOpening = 12
gapExtension = 1






########## Sampling ############

########## Wang-Landau #########
#weight.wl <- WangLandau(w = weight, phi, phiFinal, Smax, Smin, Nsim, query, subject, len_alphabet)


# s <- rep(0, Nsim)
# #H <- rep(0, Smax)
# y <- subject
# s[1] <- pairwiseAlignmentSSW(y, query)
# for(j in 2:Nsim)
# {
# 
#       res <- MetropolisUpdate(y, s[j-1], weight.wl, query, len_alphabet)
#       y <- res[[1]]
#       s[j] <- res[[2]]
#       #H[s] <- H[s] + 1
#       print(j)
# }

#write(s, "WL_new", ncolumns = 1)
#write(weight.wl, "weight.wl")

# ##### Monte-Carlo ######
# x <- query
# resSample = rep(0, Nsim)
# for (i in 1:Nsim)
# {
#       ssubject <- sample(x = letters, size = l, replace = TRUE)
#       y <- paste(ssubject, collapse = '')
#       #res <-  pairwiseAlignment(y,x, substitutionMatrix = mat, type = "local", gapOpening=12, gapExtension=1)
#       resSample[i] <- pairwiseAlignmentSSW(y, x)
#       print(i)
# }

#write(resSample, "MC_new", ncolumns = 1)

#resWL <- read.table("WL_new")
resWL <- read.table("ScoreWL")
weight.wl <- c(9.296220e-11, 6.467032e-11, 3.392456e-10, 3.435640e-09, 2.993556e-08)

resSample <- read.table("ScoreSample")

 i = 1
 resWL <- resWL[1:(i+4) == (i+4),]
#resWL <- s
#weight.wl <- 1/as.vector(prob.sample)
#weight.wl <- exp(weight.wl)
hist <- table(resWL)
const.wl <- sum(hist/weight.wl)
prob.wl <- (1/const.wl)*(hist/weight.wl)
prob2 <- 1/weight.wl/sum(1/weight.wl)
barplot(prob.wl, main = "Scores with weights", col=rgb(0,0,1,1/4))


hist.sample <- table(resSample)
prob.sample <- hist.sample/sum(hist.sample)
barplot(prob.sample,add = TRUE, col=rgb(0,1,0,1/4))



######## Conf intervals ########

#MC
conf.mc <- function(z, alpha = 0.05)
{ 
      confUp <- z + qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
      conftDown <- z - qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
      c(z,conftDown, confUp)
}

se.bm <- function(x, g = function(x) x) {
      n <- length(x)
      b <- floor(sqrt(n))
      a <- floor(n / b)
      
      y <- sapply(1:a, function(k) mean(g(x[((k - 1) * b + 1):(k * b)]))) 
      gx <- g(x)
      mu.hat <- mean(gx)
      var.hat <- b * sum((y - mu.hat)^2) / (a - 1)
      se <- sqrt(var.hat / n)
      
      list(mu = mu.hat, se.mean = se, var = var.hat)
}

#WL
conf.w <- function(x, w, const)
{
      df.conf <- matrix(data = 0,nrow = 5, ncol = 3)
      for (i in 1:l)
      {
            tmp <- se.bm(t(x), g =function(x) (Nsim/const)*ifelse(x == i, 1, 0)/w[i])
            se <- tmp$se.mean
            m <-tmp$mu
            df.conf[i,] <- c(m, m - se, m + se)
      }
      return(df.conf)
}

conf.prob.sample <- lapply(as.vector(prob.sample), conf.mc)
ci.sample <- cbind(x= c(rep("Sample",5)), as.data.frame(t(as.data.frame(conf.prob.sample)), row.names = 1))

#ci.wl <-cbind(x=c(rep("WL",5)), as.data.frame(conf.w(resWL, weight.wl, const.wl)))
 conf.prob.wl <- lapply(as.vector(prob.wl), conf.mc)
 ci.wl2<- cbind(x= c(rep("WL",5)), as.data.frame(t(as.data.frame(conf.prob.wl)), row.names = 1))
 ci.wl <- ci.wl2

ci <- rbind(ci.sample, ci.wl)
lengthCI <- ci[,4] - ci[,3]
ci <- cbind(ci, lengthCI)
colnames(ci) <- c("x","PropScore", "LowerCI", "UpperCI", "lengthCI")

####### Plots #######
require(ggplot2)

## Loading required package: ggplot2

plot.CI <- function(p1, p2,num_score)
{
      ggplot(ci[c(p1, p2),], aes(x = x,y = PropScore)) +
            geom_point(size = 4) +
            geom_errorbar(aes(ymax = UpperCI, ymin = LowerCI))+
            ggtitle(paste("Score = ", num_score))
}

for (i in 1:l)
{
      
      print(plot.CI(i, i+l,i))
      print(ci[c(i, i+l),])
      
}

##        x PropScore   LowerCI   UpperCI    lengthCI
## 1 Sample 0.3150690 0.3141585 0.3159795 0.001820976
## 6     WL 0.3147905 0.3138802 0.3157007 0.001820541

##        x PropScore   LowerCI   UpperCI    lengthCI
## 2 Sample 0.5592280 0.5582549 0.5602011 0.001946164
## 7     WL 0.5584439 0.5574706 0.5594172 0.001946529

##        x PropScore   LowerCI   UpperCI    lengthCI
## 3 Sample 0.1106030 0.1099883 0.1112177 0.001229446
## 8     WL 0.1116901 0.1110727 0.1123074 0.001234717

##        x  PropScore    LowerCI    UpperCI     lengthCI
## 4 Sample 0.01405000 0.01381932 0.01428068 0.0004613640
## 9     WL 0.01407937 0.01384845 0.01431029 0.0004618391

##         x    PropScore      LowerCI     UpperCI     lengthCI
## 5  Sample 0.0010500000 0.0009865233 0.001113477 0.0001269535
## 10     WL 0.0009962239 0.0009343923 0.001058055 0.0001236631

# my.bm <- function(x)
# {
#       n <- length(x)
#       n_ <- n/2
#       x_ <- rep(0, n_)
#       for (i in 1:n_)
#             x_[i] <- 0.5*(x[2*i-1] + x[2*i])
#       c0 <- (1/n_)*sum(x_ - mean(x_))
#       sigma <- c0/(n_ - 1) 
#       return(sigma)
# }

Untitled

Небожатко Екатерина

September 15, 2017