Group:
Install package that reads tables much faster
if (!require('data.table')){
install.packages('data.table')
library('data.table')
}
## Loading required package: data.table
if (!require('manipulate')){
install.packages('manipulate')
library('manipulate')
}
## Loading required package: manipulate
library('tictoc')
##
## Attaching package: 'tictoc'
## The following object is masked from 'package:data.table':
##
## shift
# install.packages("Rcpp")
library('Rcpp')
library('ggplot2')
reads_file_aviv = 'TCGA-13-0723-01A_lib2_all_chr1.forward'
chr1_reads = fread(reads_file_aviv)
colnames(chr1_reads) = c("Chrom","Loc","FragLen")
head(chr1_reads)
## Chrom Loc FragLen
## <int> <int> <int>
## 1: 1 23 198
## 2: 1 25 262
## 3: 1 27 200
## 4: 1 31 326
## 5: 1 32 197
## 6: 1 32 199
summary(chr1_reads)
## Chrom Loc FragLen
## Min. :1 Min. : 23 Min. :-244929232
## 1st Qu.:1 1st Qu.: 49877108 1st Qu.: 261
## Median :1 Median :111467644 Median : 288
## Mean :1 Mean :119647343 Mean : 2424
## 3rd Qu.:1 3rd Qu.:188314728 3rd Qu.: 309
## Max. :1 Max. :247199265 Max. : 245934169
“Slow” function:
# Function to get read line
getReadLine = function(locations, beg_region, end_region) {
# Initialize read_line
N = end_region - beg_region + 1
read_line = numeric(N)
# Loop through the locations
first_read = min(which(locations >= beg_region))
r = first_read
while (locations[r] <= end_region && r <= length(locations)) {
read_line[locations[r] - beg_region + 1] = read_line[
locations[r] - beg_region + 1
] + 1
r = r + 1
}
return(read_line)
}
# Define the regions
beg_region = 1
end_region = 20000000
# Measure the time
tic()
read_line = getReadLine(chr1_reads$Loc, beg_region, end_region)
toc()
## 0.95 sec elapsed
# Print the time taken and fragments
time_taken = toc()
num_fragments = sum(read_line)
cat("Time taken for 20 million base area: ", time_taken$toc - time_taken$tic, " seconds\n")
## Time taken for 20 million base area: seconds
cat("Number of fragments in the area: ", num_fragments, "\n")
## Number of fragments in the area: 1840733
read_lines=read_line
head(read_line,50)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
## [26] 0 1 0 0 0 1 4 3 0 7 11 6 3 1 0 5 5 3 1 3 1 6 1 4 1
print(read_line[510])
## [1] 0
tail(read_line,50)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 1 1
“Fast” function:
library(Rcpp)
Rcpp::cppFunction('
NumericVector getReadLineCpp(NumericVector locations, int beg_region, int end_region) {
int N = end_region - beg_region + 1;
NumericVector read_line(N);
int r = 0;
while (r < locations.size() && locations[r] < beg_region) {
r++;
}
while (r < locations.size() && locations[r] <= end_region) {
read_line[locations[r] - beg_region] += 1;
r++;
}
return read_line;
}')
# Define the regions
beg_region = 1
end_region = 20000000
# Measure the time
tic()
read_line = getReadLineCpp(chr1_reads$Loc, beg_region, end_region)
toc()
## 0.37 sec elapsed
# Print some of the output for verification
head(read_line,50)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1
## [26] 0 1 0 0 0 1 4 3 0 7 11 6 3 1 0 5 5 3 1 3 1 6 1 4 1
print(read_line[510])
## [1] 0
tail(read_line,50)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 1 1
# Create and display histogram
max_val = max(read_line)
read_hist = hist(read_line, breaks = seq(0, max_val + 3, by = 1),
main = "Histogram of Reads Distribution", xlab = "Read Line Values", ylab = "Frequency", col = "skyblue")
# Annotate the counts on top of each bin
text(read_hist$mids, read_hist$counts, labels = read_hist$counts,
pos = 3, cex = 0.65, srt = 90)
The histogram reveals that the read line values are predominantly zero, with very few positions having any reads mapped to them. This highly skewed distribution highlights the sparsity of mapped reads across the genome and suggests that further analysis might be needed to understand the regions with non-zero reads.
# Function to sum fragments in bins of 10k size
sumFragmentsInBins = function(locations, beg_region, end_region, bin_size = 10000) {
# Get read_line vector
read_line = getReadLineCpp(locations, beg_region, end_region)
# Number of bins
num_bins = ceiling((end_region - beg_region + 1) / bin_size)
print(paste("num of bins: ",num_bins))
# Initialize bin_counts vector
bin_counts = numeric(num_bins)
# Sum fragments in each bin
for (i in 1:num_bins) {
bin_start = (i - 1) * bin_size + 1
bin_end = min(i * bin_size, length(read_line))
bin_counts[i] = sum(read_line[bin_start:bin_end])
}
return(bin_counts)
}
# Define the regions
beg_region = 1
end_region = 20000000
# Measure the time and sum fragments in bins
tic()
bin_counts = sumFragmentsInBins(chr1_reads$Loc, beg_region, end_region,bin_size = 10000)
## [1] "num of bins: 2000"
toc()
## 1.04 sec elapsed
# Print the first few bin counts for verification
head(bin_counts)
## [1] 472 43 31 33 115 83
# Load necessary libraries
library(ggplot2)
# Create a data frame for plotting
data <- data.frame(
Index = 1:length(bin_counts),
BinCounts = bin_counts
)
# Add a column to indicate outliers
data$Outlier <- ifelse(data$BinCounts > 1500 | data$BinCounts < 500, "Outlier", "Normal")
# Plot using ggplot2
ggplot(data, aes(x = Index, y = BinCounts)) +
theme_minimal() +
theme(panel.grid.major = element_line(size = 0.25, linetype = 'solid', colour = "gray"),
panel.grid.minor = element_line(size = 0.25, linetype = 'solid', colour = "gray")) +
geom_point(aes(color = Outlier), alpha = 0.5) +
geom_smooth() +
scale_color_manual(values = c("Outlier" = "red", "Normal" = "black")) +
labs(
x = "Location (bin size 10k)",
y = "Number of Fragments",
title = "Fragments Count vs Location (Bin Size 10k)",
color = "Point Type"
)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Outliers: The presence of outliers may indicate regions
of the genome with significantly different fragment densities. These
could be areas of biological interest or technical artifacts. Some
outliers are significantly high, reaching counts above 3000, while
others are low, close to zero.
Variability: The plot demonstrates substantial variability in fragment counts across different bins. The regions with high densities of fragments might correspond to areas with higher sequencing coverage or regions of interest in the genomic study.
Interpretation: Biological Insights: The varying densities and the presence of outliers might point to biologically significant regions, such as regions with high gene expression or regulatory elements. Technical Considerations: The presence of outliers and variability might also reflect technical aspects of the sequencing process, such as differences in coverage or mapping efficiency.
# Test the normality of the counts distribution using a Q-Q plot
ggplot(data, aes(sample = BinCounts)) +
stat_qq() +
stat_qq_line() +
theme_minimal() +
labs(
title = "Q-Q Plot of Bin Counts",
x = "Theoretical Quantiles",
y = "Sample Quantiles"
)
We can see that the points deviate significantly from the straight
diagonal line. This is a clear indication that the set of data is not
normally distributed.