This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
## Question 1: Random Scatter
set.seed(296)
data = read.csv("hcmv-263hxkx-1qhtfgz.txt", header=T)
site.original = data.matrix(data)
n.base <- 229354 #number of DNA sequence bases
n.site <- 296 #296 palindrome sites
Title1 = 'Palindrome locations'
library(lattice)
#Create a scatter plot of the locations of the original data
site.original.plot =stripplot(site.origianl, pch=5, cex=0.8,col="black",
main=Title1, xlab= 'Base Pairs', jitter.data=T, grid="h")
site.original.plot

#randomly generate 3 different 296 panlindrome locations from 229354 DNA sequence bases
site.random1 = round(runif(n.site,0,n.base))
site.random2 = round(runif(n.site,0,n.base))
site.random3 = round(runif(n.site,0,n.base))
#plot 3 scatter plots of runif
stripplot(site.random1, pch=5, cex=0.8, jitter.data=T, grid = "h", col="blue", #3 different scatter plots
main = Title1, xlab = 'Base Pairs') #1-dimensional scatter plot

stripplot(site.random2, pch=5, cex=0.8, jitter.data=T, grid = "h", col="red",
main = Title1, xlab = 'Base Pairs')

stripplot(site.random3, pch=5, cex=0.8, jitter.data=T, grid = "h", col="green",
main = Title1, xlab = 'Base Pairs', add=T)

#plot histograms comparing sample 1 and sample 2
hist(site.random1, breaks=85, col=rgb(1,1,0,0.7), main="Histogram of sample 1 and sample 2", xlab="Base Pair",xaxt='n')
hist(site.random2, breaks=85, col=rgb(0,1,1,0.4), main="Histogram of sample 2", xlab="Base Pair", add=T, xaxt='n')
axis(1, at=seq(0, n.base+10000, 10000))
legend("topright", c("sample1", "sample2"), col=c(rgb(1,1,0,0.7), rgb(0,1,1,0.4)), lwd=8)

#plot histograms comparing original data and sample3
hist(site.random3, breaks=85, col=rgb(1,0,0,0.5), main="Histogram of original data and sample 3", xlab="BasePair", xaxt='n')
hist(site.origianl, breaks=85, col=rgb(0,0,1,0.5), main="Histogram of original data", xlab="Base Pair", add=T, xaxt='n')
axis(1, at=seq(0, n.base+10000, 10000))
legend("topright", c("sample3", "data"), col=c(rgb(1,0,0,0.5), rgb(0,0,1,0.5)), lwd=8)

for(x in c(51,57,66)){
count.sample = rep(n.site/x, x)
count.data = as.vector(table(cut(data, breaks=seq(0, n.base, length.out = x + 1), include.lowest=TRUE)))
hist(data, breaks=x, probability=T, col=rgb(1,0,0,0.5))
}
Error in cut.default(data, breaks = seq(0, n.base, length.out = x + 1), :
'x' must be numeric
#chi-squared
data = read.csv("hcmv-263hxkx-1qhtfgz.txt", header=T)
array.data = data.matrix(data)
n.base <- 229354 #number of DNA sequence bases
n.site <- 296 #296 palindrome sites
gene <- seq(1,n.base)
reg.split <- function(n.region, gene, site){ #function to split regions into n regions
count.int <- table(cut(site, breaks = seq(1, length(gene), length.out=n.region+1), include.lower=TRUE))
count.vector <- as.vector(count.int)
count.tab <-table(count.vector)
return(count.tab)
}
n.region <- 50 # n=50 regions here
reg.split(n.region, gene, site.random) #generate a table for the dataset
count.vector
1 2 3 4 5 6 7 8 9 10 12
1 2 3 6 14 7 4 7 2 2 2
chisqtable <- function(n.region, site, n.base){ #chi-squared table for the dataset
n <- length(site)
lambda.est <- n/n.region #estimate for lambda
count.int <- table(cut(site, breaks = seq(1, length(gene), length.out=n.region+1), include.lowest=TRUE)) #divide into n.region number of non-overlapping intervals
count.vector <- as.vector(count.int) #get the count levels range
count.range <- max(count.vector) - min(count.vector) +1
table <- matrix(rep(NA, count.range*3), count.range, 3) #create a table
for(i in 1:count.range){
offset <-min(count.vector) - 1
table[i, 1] <- i + offset #first column is the count level
table[i, 2] <- sum(count.vector == i + offset) #2nd column is the observed count
if ((i + offset == min(count.vector) && (min(count.vector)) != 0))
table[i, 3] <- ppois(i+offset, lambda.est)*n.region
else if (i + offset == max(count.vector))
table[i, 3] <- (1 - ppois(i + offset - 1, lambda.est))*n.region
else
table[i, 3] <- (ppois(i + offset, lambda.est) - ppois(i + offset - 1, lambda.est))*n.region
}
return(table)
}
site.random.table <- chisqtable(n.region, site.random, n.base)
site.random.table
[,1] [,2] [,3]
[1,] 1 1 0.9290793
[2,] 2 2 2.3526650
[3,] 3 3 4.6425922
[4,] 4 6 6.8710365
[5,] 5 14 8.1353072
[6,] 6 7 8.0268365
[7,] 7 4 6.7884103
[8,] 8 7 5.0234236
[9,] 9 2 3.3042964
[10,] 10 2 1.9561435
[11,] 11 0 1.0527609
[12,] 12 2 0.9174487
install.packages("gplots")
also installing the dependencies ‘gtools’, ‘gdata’
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/gtools_3.8.1.tgz'
Content type 'application/x-gzip' length 262688 bytes (256 KB)
==================================================
downloaded 256 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/gdata_2.18.0.tgz'
Content type 'application/x-gzip' length 1141723 bytes (1.1 MB)
==================================================
downloaded 1.1 MB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/gplots_3.0.1.1.tgz'
Content type 'application/x-gzip' length 509475 bytes (497 KB)
==================================================
downloaded 497 KB
The downloaded binary packages are in
/var/folders/xy/341ywwj15pq82lydhqf89xr80000gn/T//RtmpWrrJrc/downloaded_packages
#Monte Carlo Practice
set.seed(12)
n.site
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBpcyBhbiBbUiBNYXJrZG93bl0oaHR0cDovL3JtYXJrZG93bi5yc3R1ZGlvLmNvbSkgTm90ZWJvb2suIFdoZW4geW91IGV4ZWN1dGUgY29kZSB3aXRoaW4gdGhlIG5vdGVib29rLCB0aGUgcmVzdWx0cyBhcHBlYXIgYmVuZWF0aCB0aGUgY29kZS4gCgpUcnkgZXhlY3V0aW5nIHRoaXMgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpSdW4qIGJ1dHRvbiB3aXRoaW4gdGhlIGNodW5rIG9yIGJ5IHBsYWNpbmcgeW91ciBjdXJzb3IgaW5zaWRlIGl0IGFuZCBwcmVzc2luZyAqQ21kK1NoaWZ0K0VudGVyKi4gCgpgYGB7cn0KCmBgYAoKYGBge3J9CiMjIFF1ZXN0aW9uIDE6IFJhbmRvbSBTY2F0dGVyCgoKc2V0LnNlZWQoMjk2KQpkYXRhID0gcmVhZC5jc3YoImhjbXYtMjYzaHhreC0xcWh0Zmd6LnR4dCIsIGhlYWRlcj1UKQpzaXRlLm9yaWdpbmFsID0gZGF0YS5tYXRyaXgoZGF0YSkKbi5iYXNlIDwtIDIyOTM1NCAjbnVtYmVyIG9mIEROQSBzZXF1ZW5jZSBiYXNlcwpuLnNpdGUgPC0gMjk2ICMyOTYgcGFsaW5kcm9tZSBzaXRlcwpUaXRsZTEgPSAnUGFsaW5kcm9tZSBsb2NhdGlvbnMnCmxpYnJhcnkobGF0dGljZSkKCiNDcmVhdGUgYSBzY2F0dGVyIHBsb3Qgb2YgdGhlIGxvY2F0aW9ucyBvZiB0aGUgb3JpZ2luYWwgZGF0YQpzaXRlLm9yaWdpbmFsLnBsb3QgPXN0cmlwcGxvdChzaXRlLm9yaWdpYW5sLCBwY2g9NSwgY2V4PTAuOCxjb2w9ImJsYWNrIiwgCiAgICAgICAgICAgbWFpbj1UaXRsZTEsIHhsYWI9ICdCYXNlIFBhaXJzJywgaml0dGVyLmRhdGE9VCwgZ3JpZD0iaCIpIApzaXRlLm9yaWdpbmFsLnBsb3QKCiNyYW5kb21seSBnZW5lcmF0ZSAzIGRpZmZlcmVudCAyOTYgcGFubGluZHJvbWUgbG9jYXRpb25zIGZyb20gMjI5MzU0IEROQSBzZXF1ZW5jZSBiYXNlcwpzaXRlLnJhbmRvbTEgPSByb3VuZChydW5pZihuLnNpdGUsMCxuLmJhc2UpKQpzaXRlLnJhbmRvbTIgPSByb3VuZChydW5pZihuLnNpdGUsMCxuLmJhc2UpKQpzaXRlLnJhbmRvbTMgPSByb3VuZChydW5pZihuLnNpdGUsMCxuLmJhc2UpKQoKI3Bsb3QgMyBzY2F0dGVyIHBsb3RzIG9mIHJ1bmlmCnN0cmlwcGxvdChzaXRlLnJhbmRvbTEsIHBjaD01LCBjZXg9MC44LCBqaXR0ZXIuZGF0YT1ULCBncmlkID0gImgiLCBjb2w9ImJsdWUiLCAjMyBkaWZmZXJlbnQgc2NhdHRlciBwbG90cwogICAgICAgICAgbWFpbiA9IFRpdGxlMSwgeGxhYiA9ICdCYXNlIFBhaXJzJykgIzEtZGltZW5zaW9uYWwgc2NhdHRlciBwbG90CiAgICAgCnN0cmlwcGxvdChzaXRlLnJhbmRvbTIsIHBjaD01LCBjZXg9MC44LCBqaXR0ZXIuZGF0YT1ULCBncmlkID0gImgiLCBjb2w9InJlZCIsIAogICAgICAgICAgbWFpbiA9IFRpdGxlMSwgeGxhYiA9ICdCYXNlIFBhaXJzJykKICAgICAKc3RyaXBwbG90KHNpdGUucmFuZG9tMywgcGNoPTUsIGNleD0wLjgsIGppdHRlci5kYXRhPVQsIGdyaWQgPSAiaCIsIGNvbD0iZ3JlZW4iLCAKICAgICAgICAgIG1haW4gPSBUaXRsZTEsIHhsYWIgPSAnQmFzZSBQYWlycycsIGFkZD1UKQoKCiNwbG90IGhpc3RvZ3JhbXMgY29tcGFyaW5nIHNhbXBsZSAxIGFuZCBzYW1wbGUgMgpoaXN0KHNpdGUucmFuZG9tMSwgYnJlYWtzPTg1LCBjb2w9cmdiKDEsMSwwLDAuNyksIG1haW49Ikhpc3RvZ3JhbSBvZiBzYW1wbGUgMSBhbmQgc2FtcGxlIDIiLCB4bGFiPSJCYXNlIFBhaXIiLHhheHQ9J24nKQpoaXN0KHNpdGUucmFuZG9tMiwgYnJlYWtzPTg1LCBjb2w9cmdiKDAsMSwxLDAuNCksIG1haW49Ikhpc3RvZ3JhbSBvZiBzYW1wbGUgMiIsIHhsYWI9IkJhc2UgUGFpciIsIGFkZD1ULCB4YXh0PSduJykKYXhpcygxLCBhdD1zZXEoMCwgbi5iYXNlKzEwMDAwLCAxMDAwMCkpCmxlZ2VuZCgidG9wcmlnaHQiLCBjKCJzYW1wbGUxIiwgInNhbXBsZTIiKSwgY29sPWMocmdiKDEsMSwwLDAuNyksIHJnYigwLDEsMSwwLjQpKSwgbHdkPTgpCgojcGxvdCBoaXN0b2dyYW1zIGNvbXBhcmluZyBvcmlnaW5hbCBkYXRhIGFuZCBzYW1wbGUzCmhpc3Qoc2l0ZS5yYW5kb20zLCBicmVha3M9ODUsIGNvbD1yZ2IoMSwwLDAsMC41KSwgbWFpbj0iSGlzdG9ncmFtIG9mIG9yaWdpbmFsIGRhdGEgYW5kIHNhbXBsZSAzIiwgeGxhYj0iQmFzZVBhaXIiLCB4YXh0PSduJykKaGlzdChzaXRlLm9yaWdpYW5sLCBicmVha3M9ODUsIGNvbD1yZ2IoMCwwLDEsMC41KSwgbWFpbj0iSGlzdG9ncmFtIG9mIG9yaWdpbmFsIGRhdGEiLCB4bGFiPSJCYXNlIFBhaXIiLCBhZGQ9VCwgeGF4dD0nbicpCmF4aXMoMSwgYXQ9c2VxKDAsIG4uYmFzZSsxMDAwMCwgMTAwMDApKQpsZWdlbmQoInRvcHJpZ2h0IiwgYygic2FtcGxlMyIsICJkYXRhIiksIGNvbD1jKHJnYigxLDAsMCwwLjUpLCByZ2IoMCwwLDEsMC41KSksIGx3ZD04KQoKYGBgCmBgYHtyfQojUXVlc3Rpb24gMjogTG9jYXRpb25zIGFuZCBzcGFjaW5ncwojVXNlIGdyYXBoaWNhbCBtZXRob2RzIHRvIGV4YW1pbmUgdGhlIHNwYWNpbmdzIGJldHdlZW4gY29uc2VjdXRpdmUgcGFsaW5kcm9tZXMgYW5kIHN1bSBvZiBjb25zZWN1dGl2ZSBwYWlycywgdHJpcGxldHMsIGV0Yywgc3BhY2luZ3MuIENvbXBhcmUgd2hhdCB5b3UgZmluZCB0byB3aGF0IHlvdSB3b3VsZCBleHBlY3QgdG8gZmluZCBpbiBhIHJhbmRvbSBzY2F0dGVyLiBBbHNvLCB1c2UgZ3JhcGhpY2FsIG1ldGhvZHMgdG8gY29tcGFyZSBsb2NhdGlvbnMgb2YgdGhlIHBhbGluZHJvbWVzLgoKc2V0LnNlZWQoMjk2KQp1LnNhbXBsZSA9IHJ1bmlmKG4uc2l0ZSwgMCwgbi5iYXNlKQoKZm9yKHggaW4gYyg1MSw1Nyw2NikpewogY291bnQuc2FtcGxlID0gcmVwKG4uc2l0ZS94LCB4KQogY291bnQuZGF0YSA9IGFzLnZlY3Rvcih0YWJsZShjdXQoZGF0YSwgYnJlYWtzPXNlcSgwLCBuLmJhc2UsIGxlbmd0aC5vdXQgPSB4ICsgMSksIGluY2x1ZGUubG93ZXN0PVRSVUUpKSkKIAogaGlzdChkYXRhLCBicmVha3M9eCwgcHJvYmFiaWxpdHk9VCwgY29sPXJnYigxLDAsMCwwLjUpKQp9CgpgYGAKCmBgYHtyfQojY2hpLXNxdWFyZWQKCmRhdGEgPSByZWFkLmNzdigiaGNtdi0yNjNoeGt4LTFxaHRmZ3oudHh0IiwgaGVhZGVyPVQpCmFycmF5LmRhdGEgPSBkYXRhLm1hdHJpeChkYXRhKQpuLmJhc2UgPC0gMjI5MzU0ICNudW1iZXIgb2YgRE5BIHNlcXVlbmNlIGJhc2VzCm4uc2l0ZSA8LSAyOTYgIzI5NiBwYWxpbmRyb21lIHNpdGVzCmdlbmUgPC0gc2VxKDEsbi5iYXNlKQpyZWcuc3BsaXQgPC0gZnVuY3Rpb24obi5yZWdpb24sIGdlbmUsIHNpdGUpeyAjZnVuY3Rpb24gdG8gc3BsaXQgcmVnaW9ucyBpbnRvIG4gcmVnaW9ucwogIGNvdW50LmludCA8LSB0YWJsZShjdXQoc2l0ZSwgYnJlYWtzID0gc2VxKDEsIGxlbmd0aChnZW5lKSwgbGVuZ3RoLm91dD1uLnJlZ2lvbisxKSwgaW5jbHVkZS5sb3dlcj1UUlVFKSkKICBjb3VudC52ZWN0b3IgPC0gYXMudmVjdG9yKGNvdW50LmludCkKICBjb3VudC50YWIgPC10YWJsZShjb3VudC52ZWN0b3IpCiAgcmV0dXJuKGNvdW50LnRhYikKfQoKbi5yZWdpb24gPC0gNTAgIyBuPTUwIHJlZ2lvbnMgaGVyZQpyZWcuc3BsaXQobi5yZWdpb24sIGdlbmUsIHNpdGUucmFuZG9tKSAjZ2VuZXJhdGUgYSB0YWJsZSBmb3IgdGhlIGRhdGFzZXQKCgpjaGlzcXRhYmxlIDwtIGZ1bmN0aW9uKG4ucmVnaW9uLCBzaXRlLCBuLmJhc2UpeyAjY2hpLXNxdWFyZWQgdGFibGUgZm9yIHRoZSBkYXRhc2V0CiAgbiA8LSBsZW5ndGgoc2l0ZSkgCiAgbGFtYmRhLmVzdCA8LSBuL24ucmVnaW9uICNlc3RpbWF0ZSBmb3IgbGFtYmRhCiAgY291bnQuaW50IDwtIHRhYmxlKGN1dChzaXRlLCBicmVha3MgPSBzZXEoMSwgbGVuZ3RoKGdlbmUpLCBsZW5ndGgub3V0PW4ucmVnaW9uKzEpLCBpbmNsdWRlLmxvd2VzdD1UUlVFKSkgI2RpdmlkZSBpbnRvIG4ucmVnaW9uIG51bWJlciBvZiBub24tb3ZlcmxhcHBpbmcgaW50ZXJ2YWxzCiAgY291bnQudmVjdG9yIDwtIGFzLnZlY3Rvcihjb3VudC5pbnQpICAjZ2V0IHRoZSBjb3VudCBsZXZlbHMgcmFuZ2UKICBjb3VudC5yYW5nZSA8LSBtYXgoY291bnQudmVjdG9yKSAtIG1pbihjb3VudC52ZWN0b3IpICsxCiAgCiAgdGFibGUgPC0gbWF0cml4KHJlcChOQSwgY291bnQucmFuZ2UqMyksIGNvdW50LnJhbmdlLCAzKSAjY3JlYXRlIGEgdGFibGUKICBmb3IoaSBpbiAxOmNvdW50LnJhbmdlKXsKICAgIG9mZnNldCA8LW1pbihjb3VudC52ZWN0b3IpIC0gMQogICAgdGFibGVbaSwgMV0gPC0gaSArIG9mZnNldCAjZmlyc3QgY29sdW1uIGlzIHRoZSBjb3VudCBsZXZlbAogICAgdGFibGVbaSwgMl0gPC0gc3VtKGNvdW50LnZlY3RvciA9PSBpICsgb2Zmc2V0KSAjMm5kIGNvbHVtbiBpcyB0aGUgb2JzZXJ2ZWQgY291bnQKICAgIGlmICgoaSArIG9mZnNldCA9PSBtaW4oY291bnQudmVjdG9yKSAmJiAobWluKGNvdW50LnZlY3RvcikpICE9IDApKQogICAgICAgIHRhYmxlW2ksIDNdIDwtIHBwb2lzKGkrb2Zmc2V0LCBsYW1iZGEuZXN0KSpuLnJlZ2lvbgogICAgICAgIGVsc2UgaWYgKGkgKyBvZmZzZXQgPT0gbWF4KGNvdW50LnZlY3RvcikpCiAgICAgICAgICB0YWJsZVtpLCAzXSA8LSAoMSAtIHBwb2lzKGkgKyBvZmZzZXQgLSAxLCBsYW1iZGEuZXN0KSkqbi5yZWdpb24KICAgICAgICBlbHNlIAogICAgICAgICAgdGFibGVbaSwgM10gPC0gKHBwb2lzKGkgKyBvZmZzZXQsIGxhbWJkYS5lc3QpIC0gcHBvaXMoaSArIG9mZnNldCAtIDEsIGxhbWJkYS5lc3QpKSpuLnJlZ2lvbgogIH0KICAKICByZXR1cm4odGFibGUpCn0KCnNpdGUucmFuZG9tLnRhYmxlIDwtIGNoaXNxdGFibGUobi5yZWdpb24sIHNpdGUucmFuZG9tLCBuLmJhc2UpCnNpdGUucmFuZG9tLnRhYmxlCmBgYAoKYGBge3J9Cmluc3RhbGwucGFja2FnZXMoImdwbG90cyIpCmBgYAoKYGBge3J9CiNNb250ZSBDYXJsbyBQcmFjdGljZSAKc2V0LnNlZWQoMTIpCm4uc2l0ZSAKCmBgYAoKCkFkZCBhIG5ldyBjaHVuayBieSBjbGlja2luZyB0aGUgKkluc2VydCBDaHVuayogYnV0dG9uIG9uIHRoZSB0b29sYmFyIG9yIGJ5IHByZXNzaW5nICpDbWQrT3B0aW9uK0kqLgoKV2hlbiB5b3Ugc2F2ZSB0aGUgbm90ZWJvb2ssIGFuIEhUTUwgZmlsZSBjb250YWluaW5nIHRoZSBjb2RlIGFuZCBvdXRwdXQgd2lsbCBiZSBzYXZlZCBhbG9uZ3NpZGUgaXQgKGNsaWNrIHRoZSAqUHJldmlldyogYnV0dG9uIG9yIHByZXNzICpDbWQrU2hpZnQrSyogdG8gcHJldmlldyB0aGUgSFRNTCBmaWxlKS4gCgpUaGUgcHJldmlldyBzaG93cyB5b3UgYSByZW5kZXJlZCBIVE1MIGNvcHkgb2YgdGhlIGNvbnRlbnRzIG9mIHRoZSBlZGl0b3IuIENvbnNlcXVlbnRseSwgdW5saWtlICpLbml0KiwgKlByZXZpZXcqIGRvZXMgbm90IHJ1biBhbnkgUiBjb2RlIGNodW5rcy4gSW5zdGVhZCwgdGhlIG91dHB1dCBvZiB0aGUgY2h1bmsgd2hlbiBpdCB3YXMgbGFzdCBydW4gaW4gdGhlIGVkaXRvciBpcyBkaXNwbGF5ZWQuCgo=