Local alignments score

library("Biostrings")

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums

## Loading required package: IRanges

## Loading required package: XVector

GenerateDNAseq <- function(l)
{
      seq <- sample(DNA_ALPHABET[1:4], size=l, replace=TRUE)
      seq <-  paste(seq, collapse="")
      return(DNAString(seq))
}

dict <- function(n, Smax, Smin)
{
      x <- vector(mode = "list", length = Smax - Smin + 1)
      x <- rep(n, Smax-Smin + 1)
      names(x) <- seq(Smin, Smax)
      return(x)
}

SubstituteSymbol <- function(x,k, len_alphabet)
{
      symbol <- alphabet[sample(1:len_alphabet, 1)]
      substr(x,k,k) <- symbol
      return(x)
}

InsertWithLeftShift <- function(x,k, len_alphabet)
{
      substr(x, 1, k-1) <- substr(x, 2, k)
      res <- SubstituteSymbol(x,k, len_alphabet)
      return(res)
}

InsertWithRightShift <- function(x,k, len_alphabet)
{
      substr(x, k+1,nchar(x)) <- substr(x, k, nchar(x) - 1)
      res <- SubstituteSymbol(x,k, len_alphabet)
      return(res)
}

DelAndLeftShift <- function(x,k, len_alphabet)
{
      substr(x, 2, k) <- substr(x, 1, k-1)
      res <- SubstituteSymbol(x,1, len_alphabet)
      return(res)
}

DelAndRightShift <- function(x,k, len_alphabet)
{
      substr(x, k,nchar(x)-1) <- substr(x, k+1, nchar(x))
      res <- SubstituteSymbol(x,nchar(x), len_alphabet)
      return(res)
}

Neighbor <- function(x, len_alphabet)
{
      k = sample(1:len_alphabet, 1)
      roll <- sample(1:5, 1, prob = c(0.5, 1/8,1/8,1/8,1/8))
      if(roll == 1) return(SubstituteSymbol(x,k, len_alphabet))
      if(roll == 2) return(InsertWithLeftShift(x,k, len_alphabet))
      if(roll == 3) return(InsertWithRightShift(x,k, len_alphabet))
      if(roll == 4) return(DelAndLeftShift(x,k, len_alphabet))
      if(roll == 5) return(DelAndRightShift(x,k, len_alphabet))
}

isNOTflat <- function(H, Smax, Smin)
{
      res <- sum(H > 0.6*sum(H)/(Smax - Smin + 1))
      if(res == Smax) return(FALSE)
      else return(TRUE)
}


MetropolisUpdate <- function(y,s,w, query, len_alphabet = 4, mat,gapOpening = 12, gapExtension = 1)
{
      x <- query
      y <- toString(y)
      y_new <- Neighbor(y, len_alphabet)
      y <- DNAString(y)
      y_new <- DNAString(y_new)
      
      resAli <- pairwiseAlignment(y_new, x, type = "local", substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension) # посмотреть параметры
      s_new <- resAli@score
      if (s_new<=0) 
      {
            s_new <- 0
            print("########")
            print(x)
            print(y_new)
      }
            
      alpha <- w[[as.character(s_new)]]/ w[[as.character(s)]]
      #alpha <- exp(w[[as.character(s_new)]] - w[[as.character(s)]])
      if(runif(1)< alpha)
            return(list(y_new, s_new))
      else
            return(list(y,s))
}

WangLandau <- function(w, phi, phiFinal,Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening= 12, gapExtension = 1)
{
      x <- query
      y <- subject
      resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
      s <- resAli@score
      if(s<0) s <- 0
      while(phi > phiFinal)
      {
            H <- dict(0, Smax, Smin)
            while(isNOTflat(H, Smax, Smin))
            {
                  res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat,gapOpening, gapExtension)
                  y <- res[[1]]
                  s <- res[[2]]
                  if (s<0) s <- 0
                  H[[as.character(s)]] <- H[[as.character(s)]] + 1
                  w[[as.character(s)]] <- w[[as.character(s)]]/phi
                  print(H)
                  #w[[as.character(s)]] <- w[[as.character(s)]] - phi
            }
            #phi <- phi/2
            phi <- sqrt(phi)
      }
      H <- dict(0, Smax, Smin)
      s_res <- rep(0, Nsim)
      step <- 1
      for(j in 1:Nsim)
      {
            res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat, gapOpening, gapExtension)
            y <- res[[1]]
            s <- res[[2]]
            s_res[j] <- s
            H[[as.character(s)]] <- H[[as.character(s)]] + 1
            print(step)
            step = step +1
      }
      return(list(H, w, s_res))
}
      

######## Параметры ########
l = 5 # длина строки
#query <- GenerateDNAseq(l) # что ищем, фиксированный
query <- DNAString("ACGTG")
subject <- GenerateDNAseq(l)
alphabet <- alphabet(query, baseOnly=TRUE)
len_alphabet <- length(alphabet)
Smin <- 1  # мин значение score
Smax <- length(query)   # макс значение score
Nsim <- 10^6 # количество итераций в генерации выборки
weight<- dict(1, Smax, Smin)
weight_log <- dict(log(1), Smax, Smin) # веса для WangLandau
phi <- exp(0.1) 
phi_log <- log(phi)
phiFinal <- exp(0.0002)
phiFinal_log <- log(phiFinal)
mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
gapOpening = 12
gapExtension = 1

# res <- WangLandau(w = weight, phi, phiFinal, Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening, gapExtension)
# 
# hist_res <- res[[1]]
# hist_res
# weight_res <- res[[2]]
# weight_res 
# s_res <- res[[3]]

setwd("/media/ekaterina/Data/Study/Magistracy/10 term/НИР/Prog/")
s_res <- read.table("ScoreWL")
s <- read.table("ScoreSample")
weight_res <- c(9.296220e-11, 6.467032e-11, 3.392456e-10, 3.435640e-09, 2.993556e-08)


## для длины 5
counts5 <- table(s_res)
barplot(counts5/sum(counts5), main = "Scores without weights")

scores_weight <- counts5*(1/weight_res)
barplot(scores_weight/sum(scores_weight), main = "Scores with weights", col=rgb(0,0,1,1/4))
scores_weight/sum(scores_weight)

## s_res
##           1           2           3           4           5 
## 0.314503919 0.559108570 0.111315703 0.014076896 0.000994912

s_tbl <- table(s)
barplot(s_tbl/sum(s_tbl),add = TRUE, col=rgb(1,0,0,1/4))

s_tbl/sum(s_tbl)

## s
##        1        2        3        4        5 
## 0.315069 0.559228 0.110603 0.014050 0.001050

#h1 <- hist(scores_weight)


###### Score Sample ######
# s <- NULL
# x <- query
# y <- GenerateDNAseq(l)
# for (i in 491026:Nsim)
# {
#       y <- Neighbor(toString(y), len_alphabet)
#       res <-  pairwiseAlignment(y,x, substitutionMatrix = mat, type = "local", gapOpening=12, gapExtension=1)
#       print(i)
#       s[i] <- res@score
#       
# }

# s_tbl <- table(s)
# barplot(s_tbl/sum(s_tbl),col=rgb(0,1,0,1/4))

prop.wl <- scores_weight/sum(scores_weight) 
prop.score <- s_tbl/sum(s_tbl)


conf <- function(z)
{ 
      confUp <- z + 1.96*sqrt(z*(1-z)/Nsim)
      conftDown <- z - 1.96*sqrt(z*(1-z)/Nsim)
      c(z,confUp, conftDown)
}

l.prop.score <- lapply(as.vector(prop.score), conf)
l.prop.wl <- lapply(as.vector(prop.wl), conf)
df1 <- as.data.frame(t(as.data.frame(l.prop.score)), row.names = 1)
df2 <- as.data.frame(t(as.data.frame(l.prop.wl)), row.names = 1)
df <- rbind(df1, df2)
df <- df[with(df,order(V1)),]
df <- cbind(x =rep(c(1,2),5), df)
colnames(df) <- c("x","PropScore", "UpperCI", "LowerCI")

require(ggplot2)

## Loading required package: ggplot2

plot.CI <- function(a,b, num_score)
{
      ggplot(df[a:b,], aes(x = x,y = PropScore)) +
            geom_point(size = 4) +
            geom_errorbar(aes(ymax = UpperCI, ymin = LowerCI))+
            ggtitle(paste("Score = ", num_score))
}

num_score = 5
for (i in seq(1,(nrow(df)-1), by = 2))
{
      
      print(plot.CI(i, i+1, num_score))
      num_score <- num_score - 1
      print(df[i:(i+1),])
      
}

##    x   PropScore     UpperCI      LowerCI
## 10 1 0.000994912 0.001056704 0.0009331200
## 5  2 0.001050000 0.001113478 0.0009865221

##   x PropScore    UpperCI    LowerCI
## 4 1 0.0140500 0.01428069 0.01381931
## 9 2 0.0140769 0.01430780 0.01384599

##   x PropScore   UpperCI   LowerCI
## 3 1 0.1106030 0.1112177 0.1099883
## 8 2 0.1113157 0.1119322 0.1106992

##   x PropScore   UpperCI   LowerCI
## 6 1 0.3145039 0.3154140 0.3135939
## 1 2 0.3150690 0.3159795 0.3141585

##   x PropScore   UpperCI   LowerCI
## 7 1 0.5591086 0.5600817 0.5581354
## 2 2 0.5592280 0.5602011 0.5582549

Local alignments score

Небожатко Екатерина

March 20, 2017