Local alignments score

library("Biostrings")

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums

## Loading required package: IRanges

## Loading required package: XVector

GenerateDNAseq <- function(l)
{
      seq <- sample(DNA_ALPHABET[1:4], size=l, replace=TRUE)
      seq <-  paste(seq, collapse="")
      return(DNAString(seq))
}


SubstituteSymbol <- function(x,k, len_alphabet)
{
      symbol <- alphabet[sample(1:len_alphabet, 1)]
      substr(x,k,k) <- symbol
      return(x)
}

InsertWithLeftShift <- function(x,k, len_alphabet)
{
      substr(x, 1, k-1) <- substr(x, 2, k)
      res <- SubstituteSymbol(x,k, len_alphabet)
      return(res)
}

InsertWithRightShift <- function(x,k, len_alphabet)
{
      substr(x, k+1,nchar(x)) <- substr(x, k, nchar(x) - 1)
      res <- SubstituteSymbol(x,k, len_alphabet)
      return(res)
}

DelAndLeftShift <- function(x,k, len_alphabet)
{
      substr(x, 2, k) <- substr(x, 1, k-1)
      res <- SubstituteSymbol(x,1, len_alphabet)
      return(res)
}

DelAndRightShift <- function(x,k, len_alphabet)
{
      substr(x, k,nchar(x)-1) <- substr(x, k+1, nchar(x))
      res <- SubstituteSymbol(x,nchar(x), len_alphabet)
      return(res)
}

Neighbor <- function(x, len_alphabet)
{
      k = sample(1:len_alphabet, 1)
      roll <- sample(1:5, 1, prob = c(0.5, 1/8,1/8,1/8,1/8))
      if(roll == 1) return(SubstituteSymbol(x,k, len_alphabet))
      if(roll == 2) return(InsertWithLeftShift(x,k, len_alphabet))
      if(roll == 3) return(InsertWithRightShift(x,k, len_alphabet))
      if(roll == 4) return(DelAndLeftShift(x,k, len_alphabet))
      if(roll == 5) return(DelAndRightShift(x,k, len_alphabet))
}

isNOTflat <- function(H, Smax, Smin)
{
      res <- sum(H > 0.6*sum(H)/(Smax - Smin + 1))
      if(res == Smax) return(FALSE)
      else return(TRUE)
}


MetropolisUpdate <- function(y,s,w, query, len_alphabet = 4, mat,gapOpening = 12, gapExtension = 1)
{
      x <- query
      y <- toString(y)
      y_new <- Neighbor(y, len_alphabet)
      y <- DNAString(y)
      y_new <- DNAString(y_new)
      
      resAli <- pairwiseAlignment(y_new, x, type = "local", substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension) # посмотреть параметры
      s_new <- resAli@score
      alpha <- exp(w[s_new] - w[s])
      if(runif(1)< alpha)
            return(list(y_new, s_new))
      else
            return(list(y,s))
}

WangLandau <- function(w, phi, phiFinal,Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening= 12, gapExtension = 1)
{
      x <- query
      y <- subject
      resAli <- pairwiseAlignment(y,x, type = "local",substitutionMatrix=mat, gapOpening=gapOpening, gapExtension=gapExtension)
      s <- resAli@score
      kMax = 15
      k <- 0
      while(k <= kMax)
      {
            H <- rep(0, Smax)
            while(isNOTflat(H, Smax, Smin))
            {
                  res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat,gapOpening, gapExtension)
                  y <- res[[1]]
                  s <- res[[2]]
                  H[s] <- H[s] + 1
                  print(H)
                  w[s] <- w[s] - phi
            }
            phi <- phi/2
            #phi <- sqrt(phi)
            k = k + 1
      }
      H <- rep(0, Smax)
      for(j in 1:Nsim)
      {
            res <- MetropolisUpdate(y, s, w, query, len_alphabet, mat, gapOpening, gapExtension)
            y <- res[[1]]
            s <- res[[2]]
            H[s] <- H[s] + 1
            print(j)
      }
      return(list(H, w))
}
      

######## Параметры ########
l = 5 # длина строки
#query <- GenerateDNAseq(l) # что ищем, фиксированный
query <- DNAString("ACGTG")
subject <- GenerateDNAseq(l)
alphabet <- alphabet(query, baseOnly=TRUE)
len_alphabet <- length(alphabet)
Smin <- 1  # мин значение score
Smax <- length(query)   # макс значение score
Nsim <- 10^6 # количество итераций в генерации выборки
weight<- rep(1, Smax)
weight_log <- rep(0, Smax)  # веса для WangLandau
phi <- exp(0.1) 
#phi_log <- -0.1
phi_log <- 1
phiFinal <- exp(0.0002)
phiFinal_log <- 0.0002
mat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -3, baseOnly = TRUE)
gapOpening = 12
gapExtension = 1

# resWL <- WangLandau(w = weight_log, phi_log, phiFinal_log, Smax, Smin, Nsim, query, subject, len_alphabet, mat, gapOpening, gapExtension)
# 
# hist_res <- resWL[[1]]
# hist_res
# weight_res <- resWL[[2]]
# weight_res

####### WL Score #######

setwd("E:/Study/Magistracy/10 term/НИР/Prog")
s_res <- read.table("ScoreWL")
s <- read.table("ScoreSample")
weight.wl <- c(9.296220e-11, 6.467032e-11, 3.392456e-10, 3.435640e-09, 2.993556e-08)

hist.wl <- table(s_res)
barplot(hist.wl/sum(hist.wl), main = "Scores without weights")

scores_weight <- hist.wl*(1/weight.wl)
const <- sum(scores_weight)
prob.wl <- (1/const)*scores_weight
barplot(prob.wl, main = "Scores with weights", col=rgb(0,0,1,1/4))
# #h1 <- hist(scores_weight)

###### Score Sample ######
# s <- NULL
# x <- query
# y <- GenerateDNAseq(l)
# H_s <- rep(0, Smax)
# for (i in 1:Nsim)
# {
#       y <- Neighbor(toString(y), len_alphabet)
#       res <-  pairwiseAlignment(y,x, substitutionMatrix = mat, type = "local", gapOpening=12, gapExtension=1)
#       print(paste(i, "sample"))
#       H_s[res@score] = H_s[res@score] + 1
# 
# }

hist.sample <- table(s)
barplot(hist.sample/sum(hist.sample),add = TRUE, col=rgb(0,1,0,1/4))
prob.sample <- hist.sample/sum(hist.sample)


conf <- function(z, alpha = 0.05)
{ 
      confUp <- z + qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
      conftDown <- z - qnorm(1-alpha/2)*sqrt(z*(1-z)/Nsim)
      c(z,conftDown, confUp)
}

#library("Hmisc")
conf.prob.sample <- lapply(as.vector(prob.sample), conf)
conf.prob.wl <- lapply(as.vector(prob.wl), conf)
# l.prop.score <- lapply(as.vector(s_tbl), function(x) {binconf(x, Nsim)} )
# l.prop.wl <- lapply(as.vector(scores_weight), function(x) {binconf(x, Nsim)} )
df1 <- as.data.frame(t(as.data.frame(conf.prob.sample)), row.names = 1)
df2 <- as.data.frame(t(as.data.frame(conf.prob.wl)), row.names = 1)
df <- rbind(df1, df2)
#df <- df[with(df,order(V1)),]
df <- cbind(x =c(rep("Sample",5),rep("WL", 5)), df)
colnames(df) <- c("x","PropScore", "UpperCI", "LowerCI")

require(ggplot2)

## Loading required package: ggplot2

plot.CI <- function(a,b, num_score)
{
      ggplot(df[c(a,b),], aes(x = x,y = PropScore)) +
            geom_point(size = 4) +
            geom_errorbar(aes(ymax = UpperCI, ymin = LowerCI))+
            ggtitle(paste("Score = ", num_score))
}

num_score = 1
for (i in 1:(nrow(df) - l))
{
      
      print(plot.CI(i, i+l, num_score))
      num_score <- num_score + 1
      print(df[c(i,(i+l)),])
      
}

##        x PropScore   UpperCI   LowerCI
## 1 Sample 0.3150690 0.3141585 0.3159795
## 6     WL 0.3145039 0.3135939 0.3154140

##        x PropScore   UpperCI   LowerCI
## 2 Sample 0.5592280 0.5582549 0.5602011
## 7     WL 0.5591086 0.5581355 0.5600817

##        x PropScore   UpperCI   LowerCI
## 3 Sample 0.1106030 0.1099883 0.1112177
## 8     WL 0.1113157 0.1106992 0.1119322

##        x PropScore    UpperCI    LowerCI
## 4 Sample 0.0140500 0.01381932 0.01428068
## 9     WL 0.0140769 0.01384600 0.01430780

##         x   PropScore      UpperCI     LowerCI
## 5  Sample 0.001050000 0.0009865233 0.001113477
## 10     WL 0.000994912 0.0009331211 0.001056703

#setwd("/media/ekaterina/Data/Study/Magistracy/10 term/")
# setwd("E:/??????/????? ????. ???????????? (?????)/????? 4 2.05.17/Prog")
# write(s_res, file = "ScoreWL", ncolumns = 1)
# write(s, file = "ScoreSample", ncolumns = 1)