This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

## Question 1: Random Scatter
set.seed(296)
data = read.csv("hcmv-263hxkx-1qhtfgz.txt", header=T)
site.original = data.matrix(data)
n.base <- 229354 #number of DNA sequence bases
n.site <- 296 #296 palindrome sites
Title1 = 'Palindrome locations'
library(lattice)
#Create a scatter plot of the locations of the original data
site.original.plot =stripplot(site.origianl, pch=5, cex=0.8,col="black", 
           main=Title1, xlab= 'Base Pairs', jitter.data=T, grid="h") 
site.original.plot

#randomly generate 3 different 296 panlindrome locations from 229354 DNA sequence bases
site.random1 = round(runif(n.site,0,n.base))
site.random2 = round(runif(n.site,0,n.base))
site.random3 = round(runif(n.site,0,n.base))
#plot 3 scatter plots of runif
stripplot(site.random1, pch=5, cex=0.8, jitter.data=T, grid = "h", col="blue", #3 different scatter plots
          main = Title1, xlab = 'Base Pairs') #1-dimensional scatter plot

     
stripplot(site.random2, pch=5, cex=0.8, jitter.data=T, grid = "h", col="red", 
          main = Title1, xlab = 'Base Pairs')

     
stripplot(site.random3, pch=5, cex=0.8, jitter.data=T, grid = "h", col="green", 
          main = Title1, xlab = 'Base Pairs', add=T)

#plot histograms comparing sample 1 and sample 2
hist(site.random1, breaks=85, col=rgb(1,1,0,0.7), main="Histogram of sample 1 and sample 2", xlab="Base Pair",xaxt='n')
hist(site.random2, breaks=85, col=rgb(0,1,1,0.4), main="Histogram of sample 2", xlab="Base Pair", add=T, xaxt='n')
axis(1, at=seq(0, n.base+10000, 10000))
legend("topright", c("sample1", "sample2"), col=c(rgb(1,1,0,0.7), rgb(0,1,1,0.4)), lwd=8)

#plot histograms comparing original data and sample3
hist(site.random3, breaks=85, col=rgb(1,0,0,0.5), main="Histogram of original data and sample 3", xlab="BasePair", xaxt='n')
hist(site.origianl, breaks=85, col=rgb(0,0,1,0.5), main="Histogram of original data", xlab="Base Pair", add=T, xaxt='n')
axis(1, at=seq(0, n.base+10000, 10000))
legend("topright", c("sample3", "data"), col=c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)), lwd=8)

for(x in c(51,57,66)){
 count.sample = rep(n.site/x, x)
 count.data = as.vector(table(cut(data, breaks=seq(0, n.base, length.out = x + 1), include.lowest=TRUE)))
 
 hist(data, breaks=x, probability=T, col=rgb(1,0,0,0.5))
}
Error in cut.default(data, breaks = seq(0, n.base, length.out = x + 1),  : 
  'x' must be numeric
#chi-squared
data = read.csv("hcmv-263hxkx-1qhtfgz.txt", header=T)
array.data = data.matrix(data)
n.base <- 229354 #number of DNA sequence bases
n.site <- 296 #296 palindrome sites
gene <- seq(1,n.base)
reg.split <- function(n.region, gene, site){ #function to split regions into n regions
  count.int <- table(cut(site, breaks = seq(1, length(gene), length.out=n.region+1), include.lower=TRUE))
  count.vector <- as.vector(count.int)
  count.tab <-table(count.vector)
  return(count.tab)
}
n.region <- 50 # n=50 regions here
reg.split(n.region, gene, site.random) #generate a table for the dataset
count.vector
 1  2  3  4  5  6  7  8  9 10 12 
 1  2  3  6 14  7  4  7  2  2  2 
chisqtable <- function(n.region, site, n.base){ #chi-squared table for the dataset
  n <- length(site) 
  lambda.est <- n/n.region #estimate for lambda
  count.int <- table(cut(site, breaks = seq(1, length(gene), length.out=n.region+1), include.lowest=TRUE)) #divide into n.region number of non-overlapping intervals
  count.vector <- as.vector(count.int)  #get the count levels range
  count.range <- max(count.vector) - min(count.vector) +1
  
  table <- matrix(rep(NA, count.range*3), count.range, 3) #create a table
  for(i in 1:count.range){
    offset <-min(count.vector) - 1
    table[i, 1] <- i + offset #first column is the count level
    table[i, 2] <- sum(count.vector == i + offset) #2nd column is the observed count
    if ((i + offset == min(count.vector) && (min(count.vector)) != 0))
        table[i, 3] <- ppois(i+offset, lambda.est)*n.region
        else if (i + offset == max(count.vector))
          table[i, 3] <- (1 - ppois(i + offset - 1, lambda.est))*n.region
        else 
          table[i, 3] <- (ppois(i + offset, lambda.est) - ppois(i + offset - 1, lambda.est))*n.region
  }
  
  return(table)
}
site.random.table <- chisqtable(n.region, site.random, n.base)
site.random.table
      [,1] [,2]      [,3]
 [1,]    1    1 0.9290793
 [2,]    2    2 2.3526650
 [3,]    3    3 4.6425922
 [4,]    4    6 6.8710365
 [5,]    5   14 8.1353072
 [6,]    6    7 8.0268365
 [7,]    7    4 6.7884103
 [8,]    8    7 5.0234236
 [9,]    9    2 3.3042964
[10,]   10    2 1.9561435
[11,]   11    0 1.0527609
[12,]   12    2 0.9174487
install.packages("gplots")
also installing the dependencies ‘gtools’, ‘gdata’

trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/gtools_3.8.1.tgz'
Content type 'application/x-gzip' length 262688 bytes (256 KB)
==================================================
downloaded 256 KB

trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/gdata_2.18.0.tgz'
Content type 'application/x-gzip' length 1141723 bytes (1.1 MB)
==================================================
downloaded 1.1 MB

trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/gplots_3.0.1.1.tgz'
Content type 'application/x-gzip' length 509475 bytes (497 KB)
==================================================
downloaded 497 KB

The downloaded binary packages are in
    /var/folders/xy/341ywwj15pq82lydhqf89xr80000gn/T//RtmpWrrJrc/downloaded_packages
#Monte Carlo Practice 
set.seed(12)
n.site 

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBpcyBhbiBbUiBNYXJrZG93bl0oaHR0cDovL3JtYXJrZG93bi5yc3R1ZGlvLmNvbSkgTm90ZWJvb2suIFdoZW4geW91IGV4ZWN1dGUgY29kZSB3aXRoaW4gdGhlIG5vdGVib29rLCB0aGUgcmVzdWx0cyBhcHBlYXIgYmVuZWF0aCB0aGUgY29kZS4gCgpUcnkgZXhlY3V0aW5nIHRoaXMgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpSdW4qIGJ1dHRvbiB3aXRoaW4gdGhlIGNodW5rIG9yIGJ5IHBsYWNpbmcgeW91ciBjdXJzb3IgaW5zaWRlIGl0IGFuZCBwcmVzc2luZyAqQ21kK1NoaWZ0K0VudGVyKi4gCgpgYGB7cn0KCmBgYAoKYGBge3J9CiMjIFF1ZXN0aW9uIDE6IFJhbmRvbSBTY2F0dGVyCgoKc2V0LnNlZWQoMjk2KQpkYXRhID0gcmVhZC5jc3YoImhjbXYtMjYzaHhreC0xcWh0Zmd6LnR4dCIsIGhlYWRlcj1UKQpzaXRlLm9yaWdpbmFsID0gZGF0YS5tYXRyaXgoZGF0YSkKbi5iYXNlIDwtIDIyOTM1NCAjbnVtYmVyIG9mIEROQSBzZXF1ZW5jZSBiYXNlcwpuLnNpdGUgPC0gMjk2ICMyOTYgcGFsaW5kcm9tZSBzaXRlcwpUaXRsZTEgPSAnUGFsaW5kcm9tZSBsb2NhdGlvbnMnCmxpYnJhcnkobGF0dGljZSkKCiNDcmVhdGUgYSBzY2F0dGVyIHBsb3Qgb2YgdGhlIGxvY2F0aW9ucyBvZiB0aGUgb3JpZ2luYWwgZGF0YQpzaXRlLm9yaWdpbmFsLnBsb3QgPXN0cmlwcGxvdChzaXRlLm9yaWdpYW5sLCBwY2g9NSwgY2V4PTAuOCxjb2w9ImJsYWNrIiwgCiAgICAgICAgICAgbWFpbj1UaXRsZTEsIHhsYWI9ICdCYXNlIFBhaXJzJywgaml0dGVyLmRhdGE9VCwgZ3JpZD0iaCIpIApzaXRlLm9yaWdpbmFsLnBsb3QKCiNyYW5kb21seSBnZW5lcmF0ZSAzIGRpZmZlcmVudCAyOTYgcGFubGluZHJvbWUgbG9jYXRpb25zIGZyb20gMjI5MzU0IEROQSBzZXF1ZW5jZSBiYXNlcwpzaXRlLnJhbmRvbTEgPSByb3VuZChydW5pZihuLnNpdGUsMCxuLmJhc2UpKQpzaXRlLnJhbmRvbTIgPSByb3VuZChydW5pZihuLnNpdGUsMCxuLmJhc2UpKQpzaXRlLnJhbmRvbTMgPSByb3VuZChydW5pZihuLnNpdGUsMCxuLmJhc2UpKQoKI3Bsb3QgMyBzY2F0dGVyIHBsb3RzIG9mIHJ1bmlmCnN0cmlwcGxvdChzaXRlLnJhbmRvbTEsIHBjaD01LCBjZXg9MC44LCBqaXR0ZXIuZGF0YT1ULCBncmlkID0gImgiLCBjb2w9ImJsdWUiLCAjMyBkaWZmZXJlbnQgc2NhdHRlciBwbG90cwogICAgICAgICAgbWFpbiA9IFRpdGxlMSwgeGxhYiA9ICdCYXNlIFBhaXJzJykgIzEtZGltZW5zaW9uYWwgc2NhdHRlciBwbG90CiAgICAgCnN0cmlwcGxvdChzaXRlLnJhbmRvbTIsIHBjaD01LCBjZXg9MC44LCBqaXR0ZXIuZGF0YT1ULCBncmlkID0gImgiLCBjb2w9InJlZCIsIAogICAgICAgICAgbWFpbiA9IFRpdGxlMSwgeGxhYiA9ICdCYXNlIFBhaXJzJykKICAgICAKc3RyaXBwbG90KHNpdGUucmFuZG9tMywgcGNoPTUsIGNleD0wLjgsIGppdHRlci5kYXRhPVQsIGdyaWQgPSAiaCIsIGNvbD0iZ3JlZW4iLCAKICAgICAgICAgIG1haW4gPSBUaXRsZTEsIHhsYWIgPSAnQmFzZSBQYWlycycsIGFkZD1UKQoKCiNwbG90IGhpc3RvZ3JhbXMgY29tcGFyaW5nIHNhbXBsZSAxIGFuZCBzYW1wbGUgMgpoaXN0KHNpdGUucmFuZG9tMSwgYnJlYWtzPTg1LCBjb2w9cmdiKDEsMSwwLDAuNyksIG1haW49Ikhpc3RvZ3JhbSBvZiBzYW1wbGUgMSBhbmQgc2FtcGxlIDIiLCB4bGFiPSJCYXNlIFBhaXIiLHhheHQ9J24nKQpoaXN0KHNpdGUucmFuZG9tMiwgYnJlYWtzPTg1LCBjb2w9cmdiKDAsMSwxLDAuNCksIG1haW49Ikhpc3RvZ3JhbSBvZiBzYW1wbGUgMiIsIHhsYWI9IkJhc2UgUGFpciIsIGFkZD1ULCB4YXh0PSduJykKYXhpcygxLCBhdD1zZXEoMCwgbi5iYXNlKzEwMDAwLCAxMDAwMCkpCmxlZ2VuZCgidG9wcmlnaHQiLCBjKCJzYW1wbGUxIiwgInNhbXBsZTIiKSwgY29sPWMocmdiKDEsMSwwLDAuNyksIHJnYigwLDEsMSwwLjQpKSwgbHdkPTgpCgojcGxvdCBoaXN0b2dyYW1zIGNvbXBhcmluZyBvcmlnaW5hbCBkYXRhIGFuZCBzYW1wbGUzCmhpc3Qoc2l0ZS5yYW5kb20zLCBicmVha3M9ODUsIGNvbD1yZ2IoMSwwLDAsMC41KSwgbWFpbj0iSGlzdG9ncmFtIG9mIG9yaWdpbmFsIGRhdGEgYW5kIHNhbXBsZSAzIiwgeGxhYj0iQmFzZVBhaXIiLCB4YXh0PSduJykKaGlzdChzaXRlLm9yaWdpYW5sLCBicmVha3M9ODUsIGNvbD1yZ2IoMCwwLDEsMC41KSwgbWFpbj0iSGlzdG9ncmFtIG9mIG9yaWdpbmFsIGRhdGEiLCB4bGFiPSJCYXNlIFBhaXIiLCBhZGQ9VCwgeGF4dD0nbicpCmF4aXMoMSwgYXQ9c2VxKDAsIG4uYmFzZSsxMDAwMCwgMTAwMDApKQpsZWdlbmQoInRvcHJpZ2h0IiwgYygic2FtcGxlMyIsICJkYXRhIiksIGNvbD1jKHJnYigxLDAsMCwwLjUpLCByZ2IoMCwwLDEsMC41KSksIGx3ZD04KQoKYGBgCmBgYHtyfQojUXVlc3Rpb24gMjogTG9jYXRpb25zIGFuZCBzcGFjaW5ncwojVXNlIGdyYXBoaWNhbCBtZXRob2RzIHRvIGV4YW1pbmUgdGhlIHNwYWNpbmdzIGJldHdlZW4gY29uc2VjdXRpdmUgcGFsaW5kcm9tZXMgYW5kIHN1bSBvZiBjb25zZWN1dGl2ZSBwYWlycywgdHJpcGxldHMsIGV0Yywgc3BhY2luZ3MuIENvbXBhcmUgd2hhdCB5b3UgZmluZCB0byB3aGF0IHlvdSB3b3VsZCBleHBlY3QgdG8gZmluZCBpbiBhIHJhbmRvbSBzY2F0dGVyLiBBbHNvLCB1c2UgZ3JhcGhpY2FsIG1ldGhvZHMgdG8gY29tcGFyZSBsb2NhdGlvbnMgb2YgdGhlIHBhbGluZHJvbWVzLgoKc2V0LnNlZWQoMjk2KQp1LnNhbXBsZSA9IHJ1bmlmKG4uc2l0ZSwgMCwgbi5iYXNlKQoKZm9yKHggaW4gYyg1MSw1Nyw2NikpewogY291bnQuc2FtcGxlID0gcmVwKG4uc2l0ZS94LCB4KQogY291bnQuZGF0YSA9IGFzLnZlY3Rvcih0YWJsZShjdXQoZGF0YSwgYnJlYWtzPXNlcSgwLCBuLmJhc2UsIGxlbmd0aC5vdXQgPSB4ICsgMSksIGluY2x1ZGUubG93ZXN0PVRSVUUpKSkKIAogaGlzdChkYXRhLCBicmVha3M9eCwgcHJvYmFiaWxpdHk9VCwgY29sPXJnYigxLDAsMCwwLjUpKQp9CgpgYGAKCmBgYHtyfQojY2hpLXNxdWFyZWQKCmRhdGEgPSByZWFkLmNzdigiaGNtdi0yNjNoeGt4LTFxaHRmZ3oudHh0IiwgaGVhZGVyPVQpCmFycmF5LmRhdGEgPSBkYXRhLm1hdHJpeChkYXRhKQpuLmJhc2UgPC0gMjI5MzU0ICNudW1iZXIgb2YgRE5BIHNlcXVlbmNlIGJhc2VzCm4uc2l0ZSA8LSAyOTYgIzI5NiBwYWxpbmRyb21lIHNpdGVzCmdlbmUgPC0gc2VxKDEsbi5iYXNlKQpyZWcuc3BsaXQgPC0gZnVuY3Rpb24obi5yZWdpb24sIGdlbmUsIHNpdGUpeyAjZnVuY3Rpb24gdG8gc3BsaXQgcmVnaW9ucyBpbnRvIG4gcmVnaW9ucwogIGNvdW50LmludCA8LSB0YWJsZShjdXQoc2l0ZSwgYnJlYWtzID0gc2VxKDEsIGxlbmd0aChnZW5lKSwgbGVuZ3RoLm91dD1uLnJlZ2lvbisxKSwgaW5jbHVkZS5sb3dlcj1UUlVFKSkKICBjb3VudC52ZWN0b3IgPC0gYXMudmVjdG9yKGNvdW50LmludCkKICBjb3VudC50YWIgPC10YWJsZShjb3VudC52ZWN0b3IpCiAgcmV0dXJuKGNvdW50LnRhYikKfQoKbi5yZWdpb24gPC0gNTAgIyBuPTUwIHJlZ2lvbnMgaGVyZQpyZWcuc3BsaXQobi5yZWdpb24sIGdlbmUsIHNpdGUucmFuZG9tKSAjZ2VuZXJhdGUgYSB0YWJsZSBmb3IgdGhlIGRhdGFzZXQKCgpjaGlzcXRhYmxlIDwtIGZ1bmN0aW9uKG4ucmVnaW9uLCBzaXRlLCBuLmJhc2UpeyAjY2hpLXNxdWFyZWQgdGFibGUgZm9yIHRoZSBkYXRhc2V0CiAgbiA8LSBsZW5ndGgoc2l0ZSkgCiAgbGFtYmRhLmVzdCA8LSBuL24ucmVnaW9uICNlc3RpbWF0ZSBmb3IgbGFtYmRhCiAgY291bnQuaW50IDwtIHRhYmxlKGN1dChzaXRlLCBicmVha3MgPSBzZXEoMSwgbGVuZ3RoKGdlbmUpLCBsZW5ndGgub3V0PW4ucmVnaW9uKzEpLCBpbmNsdWRlLmxvd2VzdD1UUlVFKSkgI2RpdmlkZSBpbnRvIG4ucmVnaW9uIG51bWJlciBvZiBub24tb3ZlcmxhcHBpbmcgaW50ZXJ2YWxzCiAgY291bnQudmVjdG9yIDwtIGFzLnZlY3Rvcihjb3VudC5pbnQpICAjZ2V0IHRoZSBjb3VudCBsZXZlbHMgcmFuZ2UKICBjb3VudC5yYW5nZSA8LSBtYXgoY291bnQudmVjdG9yKSAtIG1pbihjb3VudC52ZWN0b3IpICsxCiAgCiAgdGFibGUgPC0gbWF0cml4KHJlcChOQSwgY291bnQucmFuZ2UqMyksIGNvdW50LnJhbmdlLCAzKSAjY3JlYXRlIGEgdGFibGUKICBmb3IoaSBpbiAxOmNvdW50LnJhbmdlKXsKICAgIG9mZnNldCA8LW1pbihjb3VudC52ZWN0b3IpIC0gMQogICAgdGFibGVbaSwgMV0gPC0gaSArIG9mZnNldCAjZmlyc3QgY29sdW1uIGlzIHRoZSBjb3VudCBsZXZlbAogICAgdGFibGVbaSwgMl0gPC0gc3VtKGNvdW50LnZlY3RvciA9PSBpICsgb2Zmc2V0KSAjMm5kIGNvbHVtbiBpcyB0aGUgb2JzZXJ2ZWQgY291bnQKICAgIGlmICgoaSArIG9mZnNldCA9PSBtaW4oY291bnQudmVjdG9yKSAmJiAobWluKGNvdW50LnZlY3RvcikpICE9IDApKQogICAgICAgIHRhYmxlW2ksIDNdIDwtIHBwb2lzKGkrb2Zmc2V0LCBsYW1iZGEuZXN0KSpuLnJlZ2lvbgogICAgICAgIGVsc2UgaWYgKGkgKyBvZmZzZXQgPT0gbWF4KGNvdW50LnZlY3RvcikpCiAgICAgICAgICB0YWJsZVtpLCAzXSA8LSAoMSAtIHBwb2lzKGkgKyBvZmZzZXQgLSAxLCBsYW1iZGEuZXN0KSkqbi5yZWdpb24KICAgICAgICBlbHNlIAogICAgICAgICAgdGFibGVbaSwgM10gPC0gKHBwb2lzKGkgKyBvZmZzZXQsIGxhbWJkYS5lc3QpIC0gcHBvaXMoaSArIG9mZnNldCAtIDEsIGxhbWJkYS5lc3QpKSpuLnJlZ2lvbgogIH0KICAKICByZXR1cm4odGFibGUpCn0KCnNpdGUucmFuZG9tLnRhYmxlIDwtIGNoaXNxdGFibGUobi5yZWdpb24sIHNpdGUucmFuZG9tLCBuLmJhc2UpCnNpdGUucmFuZG9tLnRhYmxlCmBgYAoKYGBge3J9Cmluc3RhbGwucGFja2FnZXMoImdwbG90cyIpCmBgYAoKYGBge3J9CiNNb250ZSBDYXJsbyBQcmFjdGljZSAKc2V0LnNlZWQoMTIpCm4uc2l0ZSAKCmBgYAoKCkFkZCBhIG5ldyBjaHVuayBieSBjbGlja2luZyB0aGUgKkluc2VydCBDaHVuayogYnV0dG9uIG9uIHRoZSB0b29sYmFyIG9yIGJ5IHByZXNzaW5nICpDbWQrT3B0aW9uK0kqLgoKV2hlbiB5b3Ugc2F2ZSB0aGUgbm90ZWJvb2ssIGFuIEhUTUwgZmlsZSBjb250YWluaW5nIHRoZSBjb2RlIGFuZCBvdXRwdXQgd2lsbCBiZSBzYXZlZCBhbG9uZ3NpZGUgaXQgKGNsaWNrIHRoZSAqUHJldmlldyogYnV0dG9uIG9yIHByZXNzICpDbWQrU2hpZnQrSyogdG8gcHJldmlldyB0aGUgSFRNTCBmaWxlKS4gCgpUaGUgcHJldmlldyBzaG93cyB5b3UgYSByZW5kZXJlZCBIVE1MIGNvcHkgb2YgdGhlIGNvbnRlbnRzIG9mIHRoZSBlZGl0b3IuIENvbnNlcXVlbnRseSwgdW5saWtlICpLbml0KiwgKlByZXZpZXcqIGRvZXMgbm90IHJ1biBhbnkgUiBjb2RlIGNodW5rcy4gSW5zdGVhZCwgdGhlIG91dHB1dCBvZiB0aGUgY2h1bmsgd2hlbiBpdCB3YXMgbGFzdCBydW4gaW4gdGhlIGVkaXRvciBpcyBkaXNwbGF5ZWQuCgo=