# Load necessary libraries
# Install the extra packages only when they are not already available, so
# that re-running the script does not download and reinstall them each time.
# requireNamespace() checks availability without attaching the package.
if (!requireNamespace("ggcorrplot", quietly = TRUE)) {
  install.packages("ggcorrplot")
}
## Installing package into '/home/catherinetaylor35/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
if (!requireNamespace("kableExtra", quietly = TRUE)) {
  install.packages("kableExtra")
}
## Installing package into '/home/catherinetaylor35/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(readr)
library(ggcorrplot)
## Loading required package: ggplot2
library(tidyr)
library(readxl) # For reading Excel files
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # For data visualisation
library(knitr) # For rendering tables in Markdown
library(kableExtra) # For enhanced table aesthetics
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(gridExtra) # For multi-plot layout
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Read the guide RNA table from its CSV file into `guides_df`
# (the file name suggests a sheet exported from an Excel workbook).
guides_df <- read_csv(file = "Guide_RNA_Analysis_RAS2.xlsx - Sheet1.csv")
## Rows: 4 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Target Sequence, Genomic Location, Strand, Off-Targets, Restriction...
## dbl (9): Rank, GC Content (%), Self-Complementarity, MM0 (Exact Match), MM1,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the data table.
# Auto-printing the object shows its dimensions, column types and rows;
# columns that do not fit the console width are listed in the footer.
guides_df
## # A tibble: 4 × 14
## Rank `Target Sequence` `Genomic Location` Strand `GC Content (%)`
## <dbl> <chr> <chr> <chr> <dbl>
## 1 1 TAGGCGATAAGAGAAGACAACGG chrXIV:440767 + 40
## 2 2 CAGAAGAGTCAGATAAAGAGTGG chrXIV:440685 + 40
## 3 3 ATTTCAAGCGTAACGCAATCCGG chrXIV:440735 - 40
## 4 4 CGCTTGAAATGAGCGATTCTAGG chrXIV:440748 + 45
## # ℹ 9 more variables: `Self-Complementarity` <dbl>, `MM0 (Exact Match)` <dbl>,
## # MM1 <dbl>, MM2 <dbl>, MM3 <dbl>, `Doench '16 Efficiency` <dbl>,
## # `Mor.-Mateos Efficiency` <dbl>, `Off-Targets` <chr>,
## # `Restriction Enzymes` <chr>
# Specificity visualisation: one bar per guide rank, with the bar height
# taken directly from the `Self-Complementarity` column.
guides_df %>%
  ggplot(aes(x = factor(Rank), y = `Self-Complementarity`, fill = factor(Rank))) +
  geom_col() +
  ggtitle("Self-Complementarity by Guide Rank") +
  xlab("Guide Rank") +
  ylab("Self-Complementarity") +
  theme_minimal() +
  # The fill only separates the bars visually, so the legend is redundant.
  theme(legend.position = "none")
# Reshape the mismatch counts to long format: one row per guide rank and
# mismatch column (MM0 to MM3), with the value stored in `Count`.
mm_long <- pivot_longer(
  select(guides_df, Rank, `MM0 (Exact Match)`, MM1, MM2, MM3),
  cols = starts_with("MM"),
  names_to = "Mismatch",
  values_to = "Count"
)
# Preview the first rows to confirm the long layout
head(mm_long, n = 6)
## # A tibble: 6 × 3
## Rank Mismatch Count
## <dbl> <chr> <dbl>
## 1 1 MM0 (Exact Match) 0
## 2 1 MM1 0
## 3 1 MM2 0
## 4 1 MM3 1
## 5 2 MM0 (Exact Match) 0
## 6 2 MM1 0
# Dodged bars: the mismatch columns sit side by side within each guide rank.
mm_long %>%
  ggplot(aes(x = factor(Rank), y = Count, fill = Mismatch)) +
  geom_col(position = "dodge") +
  ggtitle("Mismatch Distribution by Guide Rank") +
  xlab("Guide Rank") +
  ylab("Mismatch Count") +
  theme_minimal()
# Efficiency Comparison Plot
# Reshape BOTH efficiency columns to long format (one row per guide rank and
# metric). Pivoting only the column that starts with "Doench" would leave
# `Mor.-Mateos Efficiency` behind as an unused wide column, and the plot
# would show a single metric instead of a comparison.
# NOTE(review): any figure caption that describes only the Doench '16 score
# should be revisited now that both metrics are drawn.
efficiency_long <- guides_df %>%
  select(Rank, `Doench '16 Efficiency`, `Mor.-Mateos Efficiency`) %>%
  pivot_longer(
    cols = c(`Doench '16 Efficiency`, `Mor.-Mateos Efficiency`),
    names_to = "Efficiency_Metric",
    values_to = "Score"
  )
# Dodged bars: the two efficiency metrics sit side by side for each guide rank.
ggplot(efficiency_long, aes(x = factor(Rank), y = Score, fill = Efficiency_Metric)) +
  geom_col(position = "dodge") +
  labs(title = "Efficiency Scores by Guide Rank", x = "Guide Rank", y = "Efficiency Score") +
  theme_minimal()
##### Figure 3: This bar chart illustrates the efficiency scores of guide RNAs
(gRNAs) ranked by their suitability for targeting. The x-axis represents
the guide rank (1 to 4). The y-axis shows the efficiency score derived
from the Doench ’16 scoring model, which predicts the likelihood of
effective cleavage by the gRNA. The bars are colour-coded to correspond
to the efficiency metric, emphasising rank differences. Higher-ranked
gRNAs display superior efficiency scores, with Rank 1 achieving the
highest value, reflecting its greater suitability for precise gene
targeting.
# GC content by rank: one bar per guide, with the bar height taken directly
# from the `GC Content (%)` column.
guides_df %>%
  ggplot(aes(x = factor(Rank), y = `GC Content (%)`, fill = factor(Rank))) +
  geom_col() +
  ggtitle("GC Content by Guide Rank") +
  xlab("Guide Rank") +
  ylab("GC Content (%)") +
  theme_minimal() +
  # The fill duplicates the x-axis, so the legend is hidden.
  theme(legend.position = "none")
# Off-target summary table: keep only the rank and the off-target
# description of each guide, then render it as a styled table.
off_target_summary <- select(guides_df, Rank, `Off-Targets`)
kable_styling(
  kable(off_target_summary, col.names = c("Rank", "Off-Target Summary")),
  bootstrap_options = c("striped", "hover"),
  full_width = FALSE
)
Rank | Off-Target Summary |
---|---|
1 | 1 (CDC9) |
2 | 0 |
3 | 1 (HSP31-FIT1 Intergenic) |
4 | 0 |
# Combined plot: Doench '16 efficiency and GC content drawn as two panels.
# Left panel: Doench '16 efficiency score per guide rank.
p1 <- guides_df %>%
  ggplot(aes(x = factor(Rank), y = `Doench '16 Efficiency`, fill = factor(Rank))) +
  geom_col() +
  ggtitle("Doench '16 Efficiency") +
  xlab("Guide Rank") +
  ylab("Efficiency Score") +
  theme_minimal() +
  theme(legend.position = "none")
# Right panel: GC content (%) per guide rank.
p2 <- guides_df %>%
  ggplot(aes(x = factor(Rank), y = `GC Content (%)`, fill = factor(Rank))) +
  geom_col() +
  ggtitle("GC Content (%)") +
  xlab("Guide Rank") +
  ylab("GC Content (%)") +
  theme_minimal() +
  theme(legend.position = "none")
# Lay the two panels out in a single row.
grid.arrange(p1, p2, ncol = 2)