Done following the instructions underneath “Obtaining samples - the ms software” in the class notes, hyperlink.
In terminal:
ms 20 1 -t 20 > ms.out
fileToList <- function(filename)
{
stringsToList <- function(i, stringData, sampleSize)
{
# ss: subset
ss <- stringData[((i-1)*sampleSize + 1):(i*sampleSize)]
oneString <- paste(ss, collapse = "")
oneVec <- as.numeric(unlist(strsplit(x = oneString, split = "")))
mat.data <- matrix(oneVec, nrow = sampleSize, byrow = TRUE)
return(mat.data)
}
con <- file(filename, open = 'r')
firstLine <- readLines(con = con, n = 1)
reps <- as.integer(gsub(pattern = "ms (\\d+) (\\d+) .*" ,
x = firstLine, replacement = "\\2"))
sampleSize <- as.integer(gsub(pattern = "ms (\\d+) (\\d+) .*",
x = firstLine, replacement = "\\1"))
commandReadLines <- paste("wc -l ", filename, " | awk '{ print $1 }'", sep = '')
numberOfLines <- as.integer(system(command = commandReadLines, intern = TRUE))
tmp.data <- c()
for (i in 2:numberOfLines)
{
tmp <- readLines(con = con, n = 1)
condition = grepl("[^01]", tmp) == FALSE & grepl("[01]", tmp) == TRUE
if (condition) tmp.data <- c(tmp.data, tmp)
}
lapply(1:reps, stringsToList, tmp.data, sampleSize)
}
The working directory is set to the folder the file ms.out is in:
setwd("~/Documents/MSc in Bioinformatics/2nd Semester/Introduction to Programming for Bioinformatics/Delivery/Exercise_4")
The above is an example of my working directory, i.e. it applies only for my case here.
filename = "ms.out"
dataset <- fileToList(filename = filename)
dataset <- dataset[[1]]
manhattan.dist <- function(v1, v2)
{
d <- abs(v1 - v2)
return (sum(d))
}
distances <- c()
for (i in 1:nrow(dataset))
{
# We compute all pairwise combinations:
distances <- c(distances, apply(dataset, 1, manhattan.dist, dataset[i,]))
}
# There must be 20^2 = 400 combinations:
length(distances) == 400
## [1] TRUE
average <- sum(distances)/length(distances)
# Same result if using the mean() function:
average == mean(distances)
## [1] TRUE
pi <- mean(distances)
pi
## [1] 14.165