R Markdown

read.ms.output <- function( file.ms.output=NA ) {
  
  txt=NA
  
  if( !is.na(file.ms.output) ) txt <- scan(file=file.ms.output,
                                           what="character", sep="\n", quiet=TRUE)
  if( is.na(txt[1]) ){
    print("Usage: read.ms.output(txt), or read.ms.output(file=filename)")
    return()
  }
  nsam <- as.integer( strsplit(txt[1], split=" ")[[1]][2] )
  ndraws <- as.integer( strsplit( txt[1], split=" ")[[1]][3] )
  
  h <- numeric()
  result <- list()
  gamlist <- list()
  positions <- list()
  
  marker <- grep("prob",txt)
  probs <- sapply(strsplit(txt[marker], split=":"), function(vec) as.numeric(vec[2]))
  marker <- grep("time",txt)
  times <- sapply(strsplit(txt[marker], split="\t"), function(vec){ as.numeric(vec[2:3])} )
  
  
  ## THE OUTPUT TEXT FOR EACH DRAW SHOULD CONTAIN THE WORD "segsites"
  marker <- grep("segsites", txt)
  
  if( length(marker) != ndraws){
    stop( paste("length: ", length(marker), " ndraws: ", ndraws) )
    stopifnot(length(marker) == ndraws)
  }
  
  
  ## GET NUMBERS OF SEGREGATING SITES IN EACH DRAW
  segsites <- sapply(strsplit(txt[marker], split=" "), function(vec) as.integer(vec[2]) )
  for(draw in seq(along=marker)) {
    if(!(draw %% 100)) cat(draw, " ")
    if(segsites[draw] > 0) {
      tpos <- strsplit(txt[marker[draw]+1], split=" ")
      positions[[draw]] <- as.numeric( tpos[[1]][ 2:(segsites[draw]+1) ] ) 
      haplotypes <- txt[(marker[draw] + 2):(marker[draw] + 2 + nsam - 1)]
      haplotypes <- strsplit(haplotypes, split="")
      h <- sapply(haplotypes, function(el) c(as.integer(el)))
      ## IF THERE'S 1 SEGREGATING SITE, THIS WON'T BE A MATRIX 
      if(segsites[draw] == 1) h <- as.matrix(h)
      ## OTHERWISE, IT NEEDS TO BE TRANSPOSED
      else h <- t(h)
    }
    else {
      h <- matrix(nrow=nsam, ncol=0)
      positions[[draw]]<- NA
    }
    gamlist[[draw]] <- h
    stopifnot(all(dim(h) == c(nsam, segsites[draw]))) 
  }
  cat("\n")
  list(segsites=segsites, gametes=gamlist, probs=probs, times=t(times), positions=positions, nsam=nsam, nreps=ndraws ) 
}

system("ms 10 1 -t 10 -G 10 > ms.out")
ms <- read.ms.output("ms.out")

data = ms$gametes[[1]]
ms$gametes

observation = ncol(ms$gametes[[1]])
observation


pairdif = function(data){
  dif = 0
  for (i in 1:(nrow(data)-1)){
    for(j in (i+1):(nrow(data))){
    dif = dif + sum(data[i,] != data[j,])
    }
  }
  return(2*dif/(nrow(data)*(nrow(data)-1)))
}


stats = c(ncol(ms$gametes[[1]]), pairdif(ms$gametes[[1]]))
stats

reps = 10000
all.sim.stats = matrix(0, nrow=reps, ncol=length(stats)) 
all.sim.stats
all.theta = vector(mode = "numeric", reps)
for(i in 1:reps){
  ## G to kseroume
  ## t oxi
  t = runif(1, 2, 100)
  all.theta[i] = t
  system(paste("ms 10 1 -t ", t, " -G 10 > ms.out", sep=" "))
  simdata = read.ms.output("ms.out")
  sim.stats = c(ncol(simdata$gametes[[1]]), pairdif((simdata$gametes[[1]])))
  all.sim.stats[i,] = sim.stats
  print(i)
}
all.sim.stats


eucl = function(m, obs){
  difs = apply(m, 1, function(x){ sqrt(sum(obs - x)**2)})
  return(difs)
}

difs = eucl(all.sim.stats, stats)
accepted.dif.indexes  = which(difs < 5)

mean(all.theta[accepted.dif.indexes])

plot(density(all.theta), col='blue', ylim=c(0, 0.1))
points(density(all.theta[accepted.dif.indexes]), col='red', type='l')