# Load the samples table (tab-separated, first row holds the column names)
# and show the five-number summary plus mean of the Call.Rate column.
dat <- read.table(
  file = "peds5000_vespa8000_original_call_Samples_Table.txt",
  header = TRUE,
  sep = "\t"
)
summary(dat$Call.Rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.990 0.991 0.980 0.991 0.992
# Tabulate how many samples fall below each candidate call-rate cutoff.
#
# Cutoffs are built as whole percentages divided by 100 rather than with
# seq(from = 0.9, to = 0.99, by = 0.01): accumulating a 0.01 step in floating
# point can leave a threshold one ulp away from the decimal value shown in
# its label, which could misclassify a sample whose call rate sits exactly on
# the cutoff. Counts may therefore differ from output recorded with the
# seq()-based thresholds if any samples lie exactly on a boundary.
cutoff <- c(80, 90:99) / 100

# One row per cutoff: a "<value" label and the number of samples strictly
# below it. Any NA in Call.Rate propagates to an NA count, as sum() is called
# without na.rm.
exclude.samples <- data.frame(
  cutoff = paste0("<", cutoff),
  number = vapply(
    cutoff,
    function(threshold) sum(dat$Call.Rate < threshold),
    integer(1)
  )
)
print(exclude.samples)
## cutoff number
## 1 <0.8 259
## 2 <0.9 363
## 3 <0.91 387
## 4 <0.92 419
## 5 <0.93 446
## 6 <0.94 485
## 7 <0.95 549
## 8 <0.96 661
## 9 <0.97 831
## 10 <0.98 1035
## 11 <0.99 4278
# Plot the distribution of per-sample call rates; `breaks = 50` is the
# requested number of cells, which hist() treats as a suggestion.
hist(
  x = dat$Call.Rate,
  breaks = 50,
  main = "Histogram of sample call rate",
  xlab = "Call Rate"
)