HW2

Group:

Ayelet Shani, 207207218, ayelet.shani@mail.huji.ac.il
Noam Salomon, 206481152, noam.salomon1@mail.huji.ac.il
Aviv Gelfand, 206958068, Aviv.gelfand@mail.huji.ac.il

Libraries used

Install package that reads tables much faster

if (!require('data.table')){
  install.packages('data.table')
  library('data.table')
}

## Loading required package: data.table

if (!require('manipulate')){
  install.packages('manipulate')
  library('manipulate')
}

## Loading required package: manipulate

library('tictoc')

## 
## Attaching package: 'tictoc'

## The following object is masked from 'package:data.table':
## 
##     shift

# install.packages("Rcpp")
library('Rcpp')
library('ggplot2')

Paths and Data

reads_file_aviv = 'TCGA-13-0723-01A_lib2_all_chr1.forward'
chr1_reads = fread(reads_file_aviv) 
colnames(chr1_reads) = c("Chrom","Loc","FragLen")   
head(chr1_reads)

##    Chrom   Loc FragLen
##    <int> <int>   <int>
## 1:     1    23     198
## 2:     1    25     262
## 3:     1    27     200
## 4:     1    31     326
## 5:     1    32     197
## 6:     1    32     199

summary(chr1_reads)

##      Chrom        Loc               FragLen          
##  Min.   :1   Min.   :       23   Min.   :-244929232  
##  1st Qu.:1   1st Qu.: 49877108   1st Qu.:       261  
##  Median :1   Median :111467644   Median :       288  
##  Mean   :1   Mean   :119647343   Mean   :      2424  
##  3rd Qu.:1   3rd Qu.:188314728   3rd Qu.:       309  
##  Max.   :1   Max.   :247199265   Max.   : 245934169

“Slow” function:

# Function to get read line
getReadLine = function(locations, beg_region, end_region) {
  # Initialize read_line
  N = end_region - beg_region + 1
  read_line = numeric(N)
  
  # Loop through the locations
  first_read = min(which(locations >= beg_region))
  r = first_read
  while (locations[r] <= end_region && r <= length(locations)) {
    read_line[locations[r] - beg_region + 1] = read_line[
        locations[r] - beg_region + 1
      ] + 1
    r = r + 1
  }
  
  return(read_line)
}

# Define the regions
beg_region = 1
end_region = 20000000

# Measure the time
tic()
read_line = getReadLine(chr1_reads$Loc, beg_region, end_region)
toc()

## 0.95 sec elapsed

# Print the time taken and fragments
time_taken = toc()
num_fragments = sum(read_line)
cat("Time taken for 20 million base area: ", time_taken$toc - time_taken$tic, " seconds\n")

## Time taken for 20 million base area:    seconds

cat("Number of fragments in the area: ", num_fragments, "\n")

## Number of fragments in the area:  1840733

read_lines=read_line
head(read_line,50)

##  [1]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1
## [26]  0  1  0  0  0  1  4  3  0  7 11  6  3  1  0  5  5  3  1  3  1  6  1  4  1

print(read_line[510])

## [1] 0

tail(read_line,50)

##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 1 1

“Fast” function:

library(Rcpp)

Rcpp::cppFunction('
NumericVector getReadLineCpp(NumericVector locations, int beg_region, int end_region) {
  int N = end_region - beg_region + 1;
  NumericVector read_line(N);
  
  int r = 0;
  while (r < locations.size() && locations[r] < beg_region) {
    r++;
  }
  
  while (r < locations.size() && locations[r] <= end_region) {
    read_line[locations[r] - beg_region] += 1;
    r++;
  }
  
  return read_line;
}')

# Define the regions
beg_region = 1
end_region = 20000000

# Measure the time
tic()
read_line = getReadLineCpp(chr1_reads$Loc, beg_region, end_region)
toc()

## 0.37 sec elapsed

# Print some of the output for verification
head(read_line,50)

##  [1]  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1
## [26]  0  1  0  0  0  1  4  3  0  7 11  6  3  1  0  5  5  3  1  3  1  6  1  4  1

print(read_line[510])

## [1] 0

tail(read_line,50)

##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 1 1

Advantages of Using Rcpp

Performance: C++ is generally faster for low-level operations and can handle loops more efficiently than R. Using Rcpp allows us to leverage the speed of C++ while still using R for data manipulation and visualization. -
Integration: Rcpp seamlessly integrates with R, making it easier to call C++ functions from R without the need for complex interfacing code. By using Rcpp, we can significantly improve the performance of the getReadLine function, especially for large datasets. This method should provide a noticeable speedup compared to the pure R implementation.

b

# Create and display histogram
max_val = max(read_line)
read_hist = hist(read_line, breaks = seq(0, max_val + 3, by = 1), 
                 main = "Histogram of Reads Distribution", xlab = "Read Line Values", ylab = "Frequency", col = "skyblue")

# Annotate the counts on top of each bin
text(read_hist$mids, read_hist$counts, labels = read_hist$counts, 
     pos = 3, cex = 0.65, srt = 90)

The histogram reveals that the read line values are predominantly zero, with very few positions having any reads mapped to them. This highly skewed distribution highlights the sparsity of mapped reads across the genome and suggests that further analysis might be needed to understand the regions with non-zero reads.

c

(0)

# Function to sum fragments in bins of 10k size
sumFragmentsInBins = function(locations, beg_region, end_region, bin_size = 10000) {
  
  # Get read_line vector
  read_line = getReadLineCpp(locations, beg_region, end_region)
  
  # Number of bins
  num_bins = ceiling((end_region - beg_region + 1) / bin_size)
  print(paste("num of bins: ",num_bins))
  
  # Initialize bin_counts vector
  bin_counts = numeric(num_bins)

  # Sum fragments in each bin
  for (i in 1:num_bins) {
    bin_start = (i - 1) * bin_size + 1
    bin_end = min(i * bin_size, length(read_line))
    bin_counts[i] = sum(read_line[bin_start:bin_end])
  }
  
  return(bin_counts)
}

# Define the regions
beg_region = 1
end_region = 20000000

# Measure the time and sum fragments in bins
tic()
bin_counts = sumFragmentsInBins(chr1_reads$Loc, beg_region, end_region,bin_size = 10000)

## [1] "num of bins:  2000"

toc()

## 1.04 sec elapsed

# Print the first few bin counts for verification
head(bin_counts)

## [1] 472  43  31  33 115  83

(1)

# Load necessary libraries
library(ggplot2)

# Create a data frame for plotting
data <- data.frame(
  Index = 1:length(bin_counts),
  BinCounts = bin_counts
)

# Add a column to indicate outliers
data$Outlier <- ifelse(data$BinCounts > 1500 | data$BinCounts < 500, "Outlier", "Normal")

# Plot using ggplot2
ggplot(data, aes(x = Index, y = BinCounts)) +
  theme_minimal() +
  theme(panel.grid.major = element_line(size = 0.25, linetype = 'solid', colour = "gray"),
        panel.grid.minor = element_line(size = 0.25, linetype = 'solid', colour = "gray")) +
  geom_point(aes(color = Outlier), alpha = 0.5) +
  geom_smooth() +
  scale_color_manual(values = c("Outlier" = "red", "Normal" = "black")) +
  labs(
    x = "Location (bin size 10k)",
    y = "Number of Fragments",
    title = "Fragments Count vs Location (Bin Size 10k)",
    color = "Point Type"
  )

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Outliers: The presence of outliers may indicate regions of the genome with significantly different fragment densities. These could be areas of biological interest or technical artifacts. Some outliers are significantly high, reaching counts above 3000, while others are low, close to zero.

Variability: The plot demonstrates substantial variability in fragment counts across different bins. The regions with high densities of fragments might correspond to areas with higher sequencing coverage or regions of interest in the genomic study.

Interpretation: Biological Insights: The varying densities and the presence of outliers might point to biologically significant regions, such as regions with high gene expression or regulatory elements. Technical Considerations: The presence of outliers and variability might also reflect technical aspects of the sequencing process, such as differences in coverage or mapping efficiency.

(2) Waiting for Explanation from Yuval

d

# Test the normality of the counts distribution using a Q-Q plot
ggplot(data, aes(sample = BinCounts)) +
  stat_qq() +
  stat_qq_line() +
  theme_minimal() +
  labs(
    title = "Q-Q Plot of Bin Counts",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  )

We can see that the points deviate significantly from the straight diagonal line. This is a clear indication that the set of data is not normally distributed.

HW2

Aviv, Ayelet & Noam

23 May 2024

Libraries used

Paths and Data

Advantages of Using Rcpp

b

c

(0)

(1)

(2) Waiting for Explanation from Yuval

d