Group:

Libraries used

if (!require('data.table')){
  install.packages('data.table')
  library('data.table')
}
## Loading required package: data.table
if (!require('manipulate')){
  install.packages('manipulate')
  library('manipulate')
}
## Loading required package: manipulate
# install.packages("Rcpp")
# library('Rcpp')
library('ggplot2')

# new in this week (4)
library(segmented) # for building a segmented model
## Loading required package: MASS
## Loading required package: nlme
library('tictoc')
## 
## Attaching package: 'tictoc'
## The following object is masked from 'package:data.table':
## 
##     shift
library(splines) # for building a splined model
library(Metrics) # for testing R^2 and rmse of models
library(lmtest) # Breusch-Pagan test for formally checking heteroscedasticity
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:data.table':
## 
##     yearmon, yearqtr
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
## 
##     collapse
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

Paths and Data

# getwd() # returned "C:/Users/avivg/OneDrive/Documents/R/Statistics-Lab"
# loading the chromosome 1 line file
load("chr1_line.rda")
chr1_seq <- chr1_line
# loading the base reads (change the path to what suits you with getwd() ❤️)
reads_file_aviv <- "TCGA-13-0723-01A_lib2_all_chr1.forward"
chr1_reads <- fread(reads_file_aviv)
colnames(chr1_reads) = c("Chrom", "Loc", "FragLen")

Counting base occurrences

# defining the counting function
count_CG_occurrences <- function(genome_region, bin_size) {
  total_bases <- length(genome_region)
  num_bins <- ceiling(total_bases / bin_size)
  G_counts <- numeric(num_bins)
  C_counts <- numeric(num_bins)
  
  for (cell_index in 1:num_bins) {
    start_index <- 1 + (cell_index - 1) * bin_size
    end_index <- min(cell_index * bin_size, total_bases)
    bin_sequence <- genome_region[start_index:end_index]
    
    G_counts[cell_index] <- sum(bin_sequence == "G",na.rm = TRUE)
    C_counts[cell_index] <- sum(bin_sequence == "C",na.rm = TRUE)
  }
    GC <-  G_counts + C_counts
  return(GC)
}

# running the function to GC in 5k bins (unlike of 10K previous week)
bin_size = 5000 # a global variable to use further on
GC_5K <- count_CG_occurrences(chr1_seq, bin_size)
# function for counting reads in each bin / cell at a constant bin_size
count_reads_in_bins <- function(locations, beg_region, end_region, bin_size) {
  num_bins <- ceiling((end_region - beg_region + 1) / bin_size)
  read_count <- integer(num_bins)
  filtered_locations <- locations[locations >= beg_region & locations <= end_region]
  bin_indices <- floor((filtered_locations - beg_region) / bin_size) + 1
  read_count <- tabulate(bin_indices, nbins = num_bins)
  return(read_count)
}

# defining region variables
beg_region = 1
end_region = length(chr1_seq)

# running func to count reads
reads_5K <- count_reads_in_bins(chr1_reads$Loc, beg_region, end_region, bin_size)

# combining data into a df for building regression models
reg_df <-data.frame(GC_5K, reads_5K)
reg_df <-reg_df
# taking a look at the data frame
head(reg_df)
##   GC_5K reads_5K
## 1  2915      281
## 2  2957      191
## 3  2689       21
## 4  2492       22
## 5  2309       18
## 6  2410       13

Reminder to how the data looks like (before filtering):

ggplot(reg_df, aes(x = GC_5K, y = reads_5K)) +
  # setting opacity
  geom_point(alpha = 0.1) + 
  # limmiting y coverege in the plot to avoid distrution by outliers
  scale_y_continuous(limits = c(-10, 800), breaks = seq(-10, 800, by = 50)) + 
  labs(title = "Reads count in a bin vs. G+C base counts in a bin",
       x = "G+C counts",
       y = "Reads count") +
  theme_minimal()
## Warning: Removed 120 rows containing missing values or values outside the scale range
## (`geom_point()`).

Now we will remove the outliers and show the data one more time. We will remove outlires for every window size of 500 in the above plot

# Define a function to remove outliers
remove_outliers <- function(x) {
  q1 <- quantile(x, 0.25)
  q3 <- quantile(x, 0.75)
  iqr <- q3 - q1
  lower_bound <- q1 - 1.5 * iqr
  upper_bound <- q3 + 1.5 * iqr
  x[x >= lower_bound & x <= upper_bound]
}


# Sort the data frame by 'GC_5K'
cleaned_reg_df <- reg_df %>%
  arrange(GC_5K)

# Create a windowing variable
cleaned_reg_df <- cleaned_reg_df %>%
  mutate(window = ceiling(row_number() / 1000))

# Apply the function to each window
cleaned_reg_df <- cleaned_reg_df %>%
  group_by(window) %>%
  filter(reads_5K %in% remove_outliers(reads_5K)) %>%
  ungroup() 
  # select(-window)  # Remove the window column if not needed
ggplot(cleaned_reg_df, aes(x = GC_5K, y = reads_5K)) +
  # setting opacity
  geom_point(alpha = 0.1) + 
  # limmiting y coverege in the plot to avoid distrution by outliers
  scale_y_continuous(limits = c(-10, 750), breaks = seq(-10, 750, by = 50)) + 
  labs(title = "Reads count in a bin vs. G+C base counts in a bin",
       x = "G+C counts",
       y = "Reads count") +
  theme_minimal()

Qa Building Regression Models

# Perform a simple linear regression
simple_model <- lm(reads_5K ~ GC_5K, data = cleaned_reg_df)

# Summary of the simple linear regression model
summary(simple_model)
## 
## Call:
## lm(formula = reads_5K ~ GC_5K, data = cleaned_reg_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -598.10  -44.08   11.13   46.88  237.35 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.113e+01  1.007e+00  -11.05   <2e-16 ***
## GC_5K        2.122e-01  5.024e-04  422.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73.75 on 46875 degrees of freedom
## Multiple R-squared:  0.7919, Adjusted R-squared:  0.7919 
## F-statistic: 1.784e+05 on 1 and 46875 DF,  p-value: < 2.2e-16
# piece wise model
# Define the breakpoints
breakpoints <- c(1500, 2500)

# Fit the segmented regression model
seg_model <- segmented(simple_model, seg.Z = ~GC_5K, psi = breakpoints)

# Print the summary of the segmented model
summary(seg_model)
## 
##  ***Regression Model with Segmented Relationship(s)***
## 
## Call: 
## segmented.lm(obj = simple_model, seg.Z = ~GC_5K, psi = breakpoints)
## 
## Estimated Break-Point(s):
##                 Est. St.Err
## psi1.GC_5K 1293.999 34.406
## psi2.GC_5K 2303.178  1.598
## 
## Coefficients of the linear terms:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  0.026393   0.770298    0.034    0.973    
## GC_5K        0.078400   0.010119    7.748 9.53e-15 ***
## U1.GC_5K     0.380431   0.010230   37.189       NA    
## U2.GC_5K    -0.634811   0.002982 -212.911       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.23 on 46871 degrees of freedom
## Multiple R-Squared: 0.8996,  Adjusted R-squared: 0.8996 
## 
## Boot restarting based on 6 samples. Last fit:
## Convergence attained in 1 iterations (rel. change 2.7281e-09)
# Make predictions using the segmented model
cleaned_reg_df$segmented_prediction <- predict(seg_model)

# Perform a splined regression model
# Define the knots for the splines
knots <- c(1500, 2400)

# Create the spline basis
spline_basis <- ns(cleaned_reg_df$GC_5K, knots = knots)

# Fit the splined regression model
splined_model <- lm(reads_5K ~ spline_basis, data = cleaned_reg_df)

# Summary of the splined regression model
summary(splined_model)
## 
## Call:
## lm(formula = reads_5K ~ spline_basis, data = cleaned_reg_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -234.778  -27.175    0.232   30.594  207.269 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.2318     0.7456  -0.311    0.756    
## spline_basis1 785.4508     1.6157 486.125   <2e-16 ***
## spline_basis2 317.3276     2.8750 110.375   <2e-16 ***
## spline_basis3 103.5274     2.6959  38.401   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.62 on 46873 degrees of freedom
## Multiple R-squared:  0.9058, Adjusted R-squared:  0.9058 
## F-statistic: 1.503e+05 on 3 and 46873 DF,  p-value: < 2.2e-16
# Plot the splined regression results
cleaned_reg_df$predicted_reads_5K <- predict(splined_model, newdata = cleaned_reg_df)
library(ggplot2)

ggplot(cleaned_reg_df, aes(x = GC_5K, y = reads_5K)) +
  geom_point(alpha = 0.1, aes(color = "Data Points")) +
  geom_smooth(method = "lm", aes(color = "Simple Linear Model"), se = FALSE) +
  geom_line(aes(y = segmented_prediction, color = "Segmented Linear Model"),size = 1.5) +
  geom_line(aes(y = predicted_reads_5K, color = "Splined Model"),size = 1.5) +
  labs(title = "Regression Models of Read Counts VS G+C Counts",
       x = "G+C counts",
       y = "Reads count",
       color = "Legend") +
  scale_color_manual(values = c("Data Points" = "black",
                                "Simple Linear Model" = "blue",
                                "Segmented Linear Model" = "red",
                                "Splined Model" = "pink")) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

The plot illustrates that while a simple linear model provides a basic understanding of the relationship between Read Counts and G+C Counts, it is inadequate for capturing the complexity of the data. The segmented linear model offers a better fit by addressing changes in the trend. However, the splined model is the most accurate, effectively modeling the non-linear relationship and capturing the peak in Read Counts. This highlights the importance of selecting appropriate models to accurately represent data relationships, particularly in cases with evident non-linear patterns.

# Calculate residuals
cleaned_reg_df$residuals <- residuals(splined_model)

# Plot residuals against GC_5K
ggplot(cleaned_reg_df, aes(x = GC_5K, y = residuals)) +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs G+C Counts",
       x = "G+C counts",
       y = "Residuals") +
  theme_minimal()

Based on the residual plot, there does not appear to be significant bias for specific GC_5K values. The residuals are fairly evenly distributed around the horizontal line at y = 0 without showing clear patterns, suggesting that the segmented regression model is a good fit for the data.

Qc

# Calculate R-squared for the simple model
r_squared_simple <- summary(simple_model)$r.squared

# Calculate R-squared for the splined model
r_squared_splined <- summary(splined_model)$r.squared

# Calculate RMSE for the simple model
rmse_simple <- rmse(cleaned_reg_df$reads_5K, predict(simple_model))

# Calculate RMSE for the splined model
rmse_splined <- rmse(cleaned_reg_df$reads_5K, predict(splined_model))

# Calculate the improvement in fit
r_squared_improvement <- r_squared_splined - r_squared_simple
rmse_improvement <- rmse_simple - rmse_splined

# Calculate the ratio of improvements
r_squared_ratio <- r_squared_splined / r_squared_simple
rmse_ratio <- rmse_splined / rmse_simple

# Print the results
cat("R-squared (Simple Model):", r_squared_simple, "\n")
## R-squared (Simple Model): 0.7919303
cat("R-squared (Splined Model):", r_squared_splined, "\n")
## R-squared (Splined Model): 0.905811
cat("R-squared Improvement:", r_squared_improvement, "\n")
## R-squared Improvement: 0.1138807
cat("R-squared improved by a ratio of:", r_squared_ratio, "\n\n")
## R-squared improved by a ratio of: 1.143801
cat("RMSE (Simple Model):", rmse_simple, "\n")
## RMSE (Simple Model): 73.74742
cat("RMSE (Splined Model):", rmse_splined, "\n")
## RMSE (Splined Model): 49.61837
cat("RMSE Improvement:", rmse_improvement, "\n")
## RMSE Improvement: 24.12905
cat("RMSE improved by a ratio of", rmse_ratio, "\n")
## RMSE improved by a ratio of 0.6728149

Qd

The residuals appear to have consistent variability across most GC values, but there may be slightly more spread in residuals at the higher end of GC values. This can sometimes indicate heteroscedasticity (changing variance) but doesn’t appear to be severe in this case.

However, a small increase in spread at higher GC values could warrant further investigation, possibly by:

Checking for heteroscedasticity formally using statistical test: Breusch-Pagan test.

The output of the bptest will include a p-value. If the p-value is small (typically less than 0.05), it indicates that there is evidence of heteroscedasticity in the model.

# Perform the Breusch-Pagan test for heteroscedasticity on the simple model
bp_test_simple <- bptest(simple_model)
cat("Breusch-Pagan test for Simple Model:\n")
## Breusch-Pagan test for Simple Model:
print(bp_test_simple)
## 
##  studentized Breusch-Pagan test
## 
## data:  simple_model
## BP = 3061.1, df = 1, p-value < 2.2e-16
# Perform the Breusch-Pagan test for heteroscedasticity on the segmented model
# Note: For the segmented model, use the residuals from the segmented fit
bp_test_splined <- bptest(splined_model)
cat("\nBreusch-Pagan test for Splined Model:\n")
## 
## Breusch-Pagan test for Splined Model:
print(bp_test_splined)
## 
##  studentized Breusch-Pagan test
## 
## data:  splined_model
## BP = 3732.4, df = 3, p-value < 2.2e-16

As we can see, there is heteroscedasticity in both the models.