007 phase 2 native gene target identifier

2025-10-06

Ebrima SM Kolley

knitr::opts_chunk$set(echo = TRUE, fig.align = β€œcenter”, out.width = β€œ80%”)

Set working directory

setwd("/Users/esmk/Documents/PhD/PhD_projects/007/007_phase2/R_scripts/CNE_sgRNA_target_finder")

Load Relevant Packages

library(readxl) 
library(rtracklayer) 
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggthemes) 
library(GenomicRanges) 
library(ggrepel)
library(viridis)
## Loading required package: viridisLite
library(stringr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(ggforce)
library(forcats)

Read in single-cell significant features data from Page et al., 2023

all_significant_feature <- read_excel("/Users/esmk/Documents/dataAnalysis/dataFiles/input_files/An_gambiae_single_cell/42003_2023_5224_MOESM4_ESM.xlsx",sheet = 1 )

papa = read_excel("/Users/esmk/Documents/dataAnalysis/dataFiles/input_files/An_gambiae_single_cell/supp_gr.217216.116_Supplemental_Table_S1.xlsx")

Import genome annotation (GFF) file

gff_file <- "/Users/esmk/Downloads/VectorBase-59_AgambiaePEST.gff" 

gff <- import(gff_file, format = "gff") %>% as.data.frame()

chromosomes <- c("2R", "2L", "3R", "3L", "X")

Import the conservation score data from Kranjc et al 2021

CNE_data <- lapply(chromosomes, function(chr) {
  read.csv(paste0("./", chr, "_CNE.csv")) %>% mutate(chromosome = paste0("AgamP4_", chr))
}) %>% bind_rows()

Find conserved sgRNA regions

find_consecutive_regions <- function(df, min_len = 18, max_len = 20) {
  df <- df %>% arrange(pos) %>% mutate(diff = c(NA, diff(pos)))
  results <- list()
  start_idx <- NULL
  count <- 1
  
  for (i in seq_len(nrow(df) - 1)) {
    if (df$diff[i + 1] == 1) {
      if (is.null(start_idx)) start_idx <- df$pos[i]
      count <- count + 1
    } else {
      if (!is.null(start_idx) && count >= min_len && count <= max_len) {
        mean_Cs <- mean(df$Cs[df$pos >= start_idx & df$pos <= df$pos[i]], na.rm = TRUE)
        results <- append(results, list(data.frame(start = start_idx, end = df$pos[i], length = count, mean_Cs = mean_Cs)))
      }
      start_idx <- NULL
      count <- 1
    }
  }
  
  if (!is.null(start_idx) && count >= min_len && count <= max_len) {
    mean_Cs <- mean(df$Cs[df$pos >= start_idx & df$pos <= df$pos[nrow(df)]], na.rm = TRUE)
    results <- append(results, list(data.frame(start = start_idx, end = df$pos[nrow(df)], length = count, mean_Cs = mean_Cs)))
  }
  
  return(bind_rows(results))
}

Apply function

consecutive_regions_df <- CNE_data %>% group_by(chromosome) %>% group_modify(~ find_consecutive_regions(.x))

Convert to GRanges

gff_gr <- GRanges(seqnames = gff$seqnames, ranges = IRanges(start = gff$start, end = gff$end), gene_id = gff$ID)
regions_gr <- GRanges(seqnames = consecutive_regions_df$chromosome, ranges = IRanges(start = consecutive_regions_df$start, end = consecutive_regions_df$end))

Find overlaps

overlaps <- findOverlaps(regions_gr, gff_gr)

gene_names <- gff$ID[subjectHits(overlaps)]
consecutive_regions_df$gene_names <- NA
consecutive_regions_df$gene_names[queryHits(overlaps)] <- gene_names

Filter non-coding regions (exclude UTRs)

consecutive_regions_df <- consecutive_regions_df %>%
filter(!is.na(gene_names) & !grepl("^utr", gene_names, ignore.case =
TRUE))

Select top 10 most conserved regions based on mean_Cs

top_conserved_regions <- consecutive_regions_df %>%
  arrange(desc(mean_Cs)) %>%
  head(10)

Main plot

ultraconserved_regions_Angam <- ggplot(consecutive_regions_df, 
                                       aes(x = start, xend = end, y = chromosome, yend = chromosome, color = mean_Cs)) +
  
  # Main segments (conserved regions)
  geom_segment(linewidth = 0.8, alpha = 0.8) +
  
  # Larger points for region centers
  geom_point(aes(x = (start + end) / 2, y = chromosome), size = 3.5, shape = 21, fill = "white", stroke = 1) +
  
  # Label only the top 10 most conserved regions
  geom_text_repel(data = top_conserved_regions, 
                  aes(x = (start + end) / 2, y = chromosome, label = gene_names), 
                  size = 3, 
                  fontface = "bold",
                  color = "black",
                  box.padding = 0.6, 
                  segment.color = "gray40",
                  segment.size = 0.4) +  
  
  # High-contrast color scale
  scale_color_viridis(option = "magma", direction = -1) +  
  
  # Axis and title formatting
  labs(x = "Genomic Position (bp)", y = "Chromosome", color = "Mean Cs",
       title = expression(bold("Ultraconserved Genomic Regions in " * italic("Anopheles gambiae")))) +  
  theme_classic(base_size = 18) +  
  theme(
    axis.text.x = element_text(size = 14, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 18, face = "bold"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    legend.position = "right",
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 16, face = "bold"),
    plot.margin = margin(10, 10, 10, 10)
  )
ultraconserved_regions_Angam

ggsave("ultraconserved_regions_Angam.png", plot = ultraconserved_regions_Angam, width = 16, height = 8, dpi = 600)

#Why did i stop looking at only the top 10 most consevered genes and focused on all the gene????#

all_conserved_regions <- consecutive_regions_df %>%
  arrange(desc(mean_Cs)) %>%
  head(168)
# Ensure gene_id and Cs_group are defined
cs_summary <- all_conserved_regions %>%
  mutate(gene_id = str_extract(gene_names, "^AGAP\\d+")) %>%
  group_by(gene_id) %>%
  summarise(mean_Cs = mean(mean_Cs, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(mean_Cs)) %>%
  mutate(
    rank = row_number(),
    Cs_group = ntile(mean_Cs, 3),
    Cs_group = factor(Cs_group, labels = c("Low", "Medium", "High")),
    hjust = ifelse(rank %% 2 == 0, -0.1, 1.1)
  )
#Nature Biotechnology–ready plot
# allGenesCs <- ggplot(cs_summary, aes(x = mean_Cs, y = reorder(gene_id, mean_Cs), color = Cs_group)) +
#   # Grey lollipop stems
#   geom_segment(aes(x = 0.25, xend = mean_Cs, yend = gene_id), color = "grey75", size = 0.3) + #0.5 to return the lines
#   # Colored dot
#   geom_point(size = 2) +
#   # Gene label alternating sides
#   geom_text(
#     aes(label = gene_id, hjust = hjust),
#     size = 2.7, fontface = "italic", family = "Times", color = "black"
#   ) +
#   # Subtle color palette
#   scale_color_manual(
#     values = c("Low" = "#91bfdb", "Medium" = "#fee08b", "High" = "#fc8d59"),
#     guide = guide_legend(title.position = "top", title.hjust = 0.5)
#   ) +
#   # Axes and scale
#   scale_x_continuous(
#     breaks = seq(0.5, max(cs_summary$mean_Cs, na.rm = TRUE), by = 0.125),
#     limits = c(0.5, NA),
#     expand = expansion(mult = c(0, 0.12))
#   ) +
#   labs(
#     x = expression("Mean C"["s"]),
#     y = NULL,
#     color = "Cs Quantile"
#   ) +
#   theme_minimal(base_size = 11, base_family = "Times") +
#   theme(
#     panel.grid.major.y = element_blank(),
#     panel.grid.major.x = element_blank(),
#     panel.grid.minor = element_blank(),
#     axis.text.y = element_blank(),
#     axis.ticks.y = element_blank(),
#     axis.text.x = element_text(size = 9.5, color = "black"),
#     axis.title.x = element_text(size = 11, face = "bold", margin = margin(t = 10)),
#     legend.position = "bottom",
#     legend.title = element_text(size = 10, face = "bold"),
#     legend.text = element_text(size = 9),
#     plot.margin = margin(10, 10, 10, 10)
#   )
# allGenesCs
#ggsave("allGenesCs.png", plot = allGenesCs, width = 10, height = 6, dpi = 600)
#Extract gene ids
all_conserved_regions <- all_conserved_regions %>%
  mutate(gene_id = str_extract(gene_names, "AGAP[0-9]+"))

ids <- unique(all_conserved_regions$gene_id)

Function to calculate the average expression for female and male samples and replace the original columns

replace_with_ave_expression <- function(data) {
  data %>%
    mutate(
      Female_Car = rowMeans(select(., Female_Car1:Female_Car3), na.rm = TRUE),
      
      Female_RT = rowMeans(select(., Female_RT1:Female_RT3), na.rm = TRUE),
      
      Male_Car = rowMeans(select(., Male_Car1:Male_Car3), na.rm = TRUE),
      
      Male_RT = rowMeans(select(., Male_RT1:Male_RT3), na.rm = TRUE),
      
    ) %>%
    select(-Female_Car1, -Female_Car2, -Female_Car3, -Female_RT1, -Female_RT2, -Female_RT3, -Male_Car1, -Male_Car2, -Male_Car3, -Male_RT1, -Male_RT2, -Male_RT3,
           -log2FC_Carc, -padj_Carc, -log2FC_RT, -padj_RT, -log2FC_M, -padj_M, -log2FC_F, -padj_F) 
}

papa <- replace_with_ave_expression(papa)

#add double sex and 7280 from hammond 2016 zpg vasa yob 007 and fle#

subset_papa <- papa %>%
  filter(geneID %in% c(ids, "AGAP004050", "AGAP007280","AGAP006241", "AGAP008578", "AGAP029221", "AGAP005748", "AGAP013051")) 
# Convert to long format
expression_long <- subset_papa %>%
  pivot_longer(
    cols = starts_with("Female") | starts_with("Male"),
    names_to = "Sample",
    values_to = "Expression"
  )
# ggplot(expression_long, aes(x = Sample, y = Expression, fill = Sample)) +
#   geom_boxplot(outlier.shape = NA, alpha = 0.8, width = 0.6) +
#   geom_jitter(aes(color = Sample), size = 2, width = 0.2, alpha = 0.8) +
#   scale_fill_brewer(palette = "Set2") +
#   scale_color_brewer(palette = "Set2") +
#   labs(
#     title = "Expression Profile of Conserved Genes Across Tissues",
#     x = "Tissue Sample",
#     y = "Expression (Normalized Counts)"
#   ) +
#   theme_minimal(base_size = 14) +
#   theme(
#     legend.position = "none",
#     panel.grid.major = element_blank(),
#     panel.grid.minor = element_blank(),
#     axis.title.x = element_text(margin = margin(t = 10)),
#     axis.title.y = element_text(margin = margin(r = 10))
#   )

First, create the ggplot object with paginate support

# Define custom gene labels
gene_labels <- c(
  "AGAP004050" = "dsx",
  "AGAP007280" = "7280",
  "AGAP006241" = "zpg",
  "AGAP008578" = "vasa",
  "AGAP029221" = "yob",
  "AGAP005748" = "007 aka SOA",
  "AGAP013051" = "fle",
  "AGAP009164" = "Vg",
  "AGAP004648" = "Pb"
)

# Define highlight color for special genes
highlight_genes <- names(gene_labels)

# Prepare data: set geneID as factor to control order
expression_long <- expression_long %>%
  mutate(
    geneID_label = ifelse(geneID %in% highlight_genes, gene_labels[geneID], geneID),
    highlight = ifelse(geneID %in% highlight_genes, "highlight", "normal"),
    geneID = forcats::fct_reorder(geneID, Expression, .fun = median)
  )

# Build base plot
p <- ggplot(expression_long, aes(x = Sample, y = Expression, fill = Sample)) +
  geom_col(position = "dodge", color = "grey20") +
  facet_wrap_paginate(
    ~ geneID,
    scales = "free_y",
    ncol = 3,
    nrow = 3,
    page = 1,
    labeller = labeller(geneID = label_parsed)  # allows italics from label code
  ) +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    title = expression("Gene-wise Expression Profiles"),
    x = NULL,
    y = "Expression"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 9, face = "italic")
  )

# Compute number of pages
n_pages_total <- n_pages(p)
cat("Total number of pages:", n_pages_total, "\n\n")
## Total number of pages: 15
# Print all pages with highlighted facets
for (i in seq_len(n_pages_total)) {
  print(
    p + facet_wrap_paginate(
      ~ geneID,
      scales = "free_y",
      ncol = 3,
      nrow = 3,
      page = i,
      labeller = labeller(
        geneID = function(x) {
          ifelse(x %in% highlight_genes, gene_labels[x], x)
        }
      )
    )
  )
}

# Get vectors of gene IDs # selected_gene_ids <- unique(selected_genes$FeatureID) # expression_gene_ids <- unique(expression_long$geneID)

extra_genes <- setdiff(expression_gene_ids, selected_gene_ids)

extra_genes

#[1] β€œAGAP000254” β€œAGAP003448” β€œAGAP008605” β€œAGAP008618” check y β€œAGAP000254” was omitted answer these genes were omitted because they are not in the page dataset note that yob #is a also not in the papa dataset. the absence of AGAP000254 from the page data set could be because it not expressed at all in the male germline so could not be identified.

selected_genes <-  all_significant_feature %>%
  filter(FeatureID %in% expression_long$geneID)
# Get column names containing "Average"
average_cols <- grep("Average", names(selected_genes), value = TRUE)

average_cols <- average_cols[average_cols != "HC Average"]
# Sum all "Average" columns row-wise, then group by gene
expression_sums <- selected_genes %>%
  rowwise() %>%
  mutate(total_row_expression = sum(c_across(all_of(average_cols)), na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(FeatureID) %>%
  summarise(total_expression = sum(total_row_expression, na.rm = TRUE)) %>%
  arrange(desc(total_expression))
# Sum all "Average" columns row-wise, then group by gene
expression_sums <- selected_genes %>%
  rowwise() %>%
  mutate(total_row_expression = sum(c_across(all_of(average_cols)), na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(FeatureID) %>%
  summarise(total_expression = sum(total_row_expression, na.rm = TRUE)) %>%
  arrange(desc(total_expression))

ggplot(expression_sums, aes(x = reorder(FeatureID, -total_expression), y = total_expression)) +
  geom_col(fill = "#2C7BB6", width = 0.7) +
  labs(
    x = NULL,
    y = "Summed Average Expression"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_minimal(base_size = 14, base_family = "Helvetica") +
  theme(
    plot.title = element_blank(),
    axis.text.x = element_text(
      angle = 45, hjust = 1, vjust = 1, size = 10, color = "black"
    ),
    axis.text.y = element_text(size = 11, color = "black"),
    axis.title.y = element_text(size = 12, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_line(color = "grey90"),
    plot.margin = margin(10, 10, 10, 10)
  )

#Select Genes with low expression in the male germline#

low_expression_genes <- expression_sums %>%
  filter(total_expression < 1.5)

# View result
print(low_expression_genes)
## # A tibble: 55 Γ— 2
##    FeatureID  total_expression
##    <chr>                 <dbl>
##  1 AGAP009036             1.49
##  2 AGAP010391             1.48
##  3 AGAP002041             1.43
##  4 AGAP011333             1.43
##  5 AGAP004369             1.33
##  6 AGAP000720             1.27
##  7 AGAP008662             1.20
##  8 AGAP006452             1.17
##  9 AGAP009542             1.11
## 10 AGAP005165             1.10
## # β„Ή 45 more rows

#Why don’t we need genes that are expressed in the germline????#

reference_genes <- c("AGAP004050", "AGAP007280", "AGAP006241", "AGAP008578", "AGAP029221", "AGAP005748", "AGAP013051") 
#dsx 7280 zpg vasa yob 007 fle

# Add label for low expression genes
low_expr_labeled <- low_expression_genes %>%
  mutate(category = if_else(FeatureID %in% reference_genes, "Reference", "Low Expression"))

# Add any reference genes not already in low_expression_genes
reference_only <- tibble(
  FeatureID = setdiff(reference_genes, low_expression_genes$FeatureID),
  total_expression = NA,
  category = "Reference"
)

# Combine into one data frame
putative_targets <- bind_rows(low_expr_labeled, reference_only)


expression_targets <- expression_long %>%
  filter(geneID %in% putative_targets$FeatureID)



# Add indicator column
expression_targets <- expression_targets %>%
  mutate(is_reference = ifelse(geneID %in% reference_genes, "Reference", "Other"))
# Mapping AGAP IDs to simplified gene names
gene_labels <- c(
  "AGAP004050" = "dsx",
  "AGAP007280" = "7280",
  "AGAP006241" = "zpg",
  "AGAP008578" = "vasa",
  "AGAP029221" = "yob",
  "AGAP005748" = "007 aka SOA",
  "AGAP013051" = "fle",
  "AGAP009164" = "Vg",
  "AGAP004648" = "Pb"
)

# Prepare data for highlighting and labeling
expression_targets <- expression_targets %>%
  mutate(
    is_highlight = ifelse(geneID %in% names(gene_labels), "highlight", "normal"),
    display_name = ifelse(geneID %in% names(gene_labels),
                          gene_labels[geneID], geneID)
  )

# Plot
p <- ggplot(expression_targets, aes(x = Sample, y = Expression, fill = Sample)) +
  geom_col(
    position = "dodge",
    aes(color = is_highlight),
    linewidth = 0.5
  ) +
  facet_wrap(
    ~ geneID,
    scales = "free_y",
    labeller = labeller(geneID = as_labeller(gene_labels, default = identity))
  ) +
  scale_fill_brewer(palette = "Dark2") +
  scale_color_manual(values = c("highlight" = "black", "normal" = NA), guide = "none") +
  labs(
    title = "Genes with Low Germline Expression (Reference Highlighted)",
    x = NULL,
    y = "Expression"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 9, face = "bold.italic"),  # italic gene labels
    panel.border = element_rect(color = "grey80", fill = NA, linewidth = 0.4)
  )

# Print
p

#ggsave("lowMaleGermlineExp2.png", p, width = 10, height = 8, dpi = 600)
selected_genes2 <-  all_significant_feature %>%
  filter(FeatureID %in% expression_targets$geneID)
# Get column names containing "Average"
average_cols <- grep("Average", names(selected_genes2), value = TRUE)

average_cols <- average_cols[average_cols != "HC Average"]


# Sum all "Average" columns row-wise, then group by gene
expression_sums <- selected_genes2 %>%
  rowwise() %>%
  mutate(total_row_expression = sum(c_across(all_of(average_cols)), na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(FeatureID) %>%
  summarise(total_expression = sum(total_row_expression, na.rm = TRUE)) %>%
  arrange(desc(total_expression))

# Add is_reference column
expression_sums <- expression_sums %>%
  mutate(is_reference = ifelse(FeatureID %in% reference_genes, "Reference", "Other"))
# Mapping AGAP IDs to names
gene_labels <- c(
  "AGAP004050" = "dsx",
  "AGAP007280" = "7280",
  "AGAP006241" = "zpg",
  "AGAP008578" = "vasa",
  "AGAP029221" = "yob",
  "AGAP005748" = "007 aka SOA",
  "AGAP013051" = "fle",
  "AGAP009164" = "Vg",
  "AGAP004648" = "Pb"
)

# Replace FeatureID labels where applicable
expression_sums$FeatureID_label <- ifelse(
  expression_sums$FeatureID %in% names(gene_labels),
  gene_labels[expression_sums$FeatureID],
  expression_sums$FeatureID
)

# Plot with relabeled x-axis
plot <- ggplot(expression_sums, aes(x = reorder(FeatureID_label, -total_expression), 
                                    y = total_expression, 
                                    fill = is_reference)) +
  geom_col(width = 0.7) +
  scale_fill_manual(values = c("Reference" = "#D7191C", "Other" = "#2C7BB6")) +
  labs(
    x = NULL,
    y = "Summed Average Expression",
    fill = NULL
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_minimal(base_size = 14, base_family = "Helvetica") +
  theme(
    plot.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10, color = "black"),
    axis.text.y = element_text(size = 11, color = "black"),
    axis.title.y = element_text(size = 12, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_line(color = "grey90"),
    legend.position = "bottom",
    plot.margin = margin(10, 10, 10, 10)
  )

plot

#ggsave("lowMaleGermlineExp.png", plot, width = 10, height = 8, dpi = 600)
expression_sums$FeatureID
##  [1] "AGAP006241" "AGAP008578" "AGAP013051" "AGAP005748" "AGAP009036"
##  [6] "AGAP010391" "AGAP002041" "AGAP011333" "AGAP004369" "AGAP000720"
## [11] "AGAP008662" "AGAP006452" "AGAP009542" "AGAP005165" "AGAP004494"
## [16] "AGAP005010" "AGAP010519" "AGAP005288" "AGAP004902" "AGAP000974"
## [21] "AGAP009757" "AGAP004446" "AGAP001291" "AGAP010438" "AGAP003945"
## [26] "AGAP012280" "AGAP000255" "AGAP004835" "AGAP002199" "AGAP011414"
## [31] "AGAP004050" "AGAP000061" "AGAP002529" "AGAP000138" "AGAP004664"
## [36] "AGAP001239" "AGAP002235" "AGAP005356" "AGAP009002" "AGAP004244"
## [41] "AGAP005096" "AGAP011927" "AGAP008563" "AGAP013557" "AGAP008919"
## [46] "AGAP004648" "AGAP013391" "AGAP007280" "AGAP006654" "AGAP001690"
## [51] "AGAP009055" "AGAP001201" "AGAP013708" "AGAP005716" "AGAP007973"
## [56] "AGAP011188" "AGAP003685" "AGAP003144" "AGAP001622"
#Cardinal sgRNAs

cardinal_sgRNAs <- read.csv("/Users/esmk/AgamP4_conservation_score/AgamP4_2R extraction Cardinal Annotations sgRNA.csv")
# Filter the data
cardinal_sgRNAs <- cardinal_sgRNAs[
  cardinal_sgRNAs$Type == "CRISPR" & grepl("0 in CDS", cardinal_sgRNAs$X..Local.Off.target.Sites),
]

cardinal_sgRNAs <- cardinal_sgRNAs[
  cardinal_sgRNAs$Type == "CRISPR" & !grepl("10 in CDS|20 in CDS", cardinal_sgRNAs$X..Local.Off.target.Sites),
]


cardinal_cne <- read.csv("/Users/esmk/AgamP4_conservation_score/cardinal_CNE.csv")

# First, make sure both datasets are sorted
cardinal_cne <- cardinal_cne[order(cardinal_cne$pos), ]
cardinal_sgRNAs <- cardinal_sgRNAs[order(cardinal_sgRNAs$Min..original.sequence.), ]

# Create an empty vector to store mean Cs scores
mean_Cs <- numeric(nrow(cardinal_sgRNAs))

# Loop through each sgRNA and calculate mean Cs score
for (i in 1:nrow(cardinal_sgRNAs)) {
  min_pos <- cardinal_sgRNAs$Min..original.sequence.[i]
  max_pos <- cardinal_sgRNAs$Max..original.sequence.[i]
  
  # Subset the Cs scores in that range
  cs_subset <- cardinal_cne$Cs[cardinal_cne$pos >= min_pos & cardinal_cne$pos <= max_pos]
  
  # Calculate mean (or NA if no overlap)
  mean_Cs[i] <- if(length(cs_subset) > 0) mean(cs_subset, na.rm = TRUE) else NA
}

# Add the new column to sgRNAs
cardinal_sgRNAs$mean_Cs <- mean_Cs
snps_cardinal <- read.csv("/Users/esmk/Downloads/AGAP003502-RA_snp_allele_freqs.csv")
# Extract numeric position and chromosome from the SNP label
snps_cardinal_cleaned <- snps_cardinal %>%
  mutate(
    chr = str_extract(label, "^[^:]+"),
    pos = as.numeric(gsub(",", "", str_extract(label, "(?<=:)[0-9,]+")))
  )

# For each sgRNA, find matching SNPs within its genomic range
guide_snp_matches <- lapply(1:nrow(cardinal_sgRNAs), function(i) {
  guide <- cardinal_sgRNAs[i, ]
  
  guide_chr <- "2R"  # Replace with actual chromosome if available per guide
  min_pos <- guide$Min..original.sequence.
  max_pos <- guide$Max..original.sequence.
  
  matching_snps <- snps_cardinal_cleaned %>%
    filter(chr == guide_chr & pos >= min_pos & pos <= max_pos) %>%
    mutate(guide_name = guide$Name)
  
  return(matching_snps)
})

# Combine all matches into one data frame
all_guide_snp_matches <- bind_rows(guide_snp_matches)

summary_table <- all_guide_snp_matches %>%
  group_by(guide_name)

cardinal_sgRNAs <- cardinal_sgRNAs %>%
  left_join(summary_table, by = c("Name" = "guide_name"))
## Warning in left_join(., summary_table, by = c(Name = "guide_name")): Detected an unexpected many-to-many relationship between `x` and `y`.
## β„Ή Row 1 of `x` matches multiple rows in `y`.
## β„Ή Row 829 of `y` matches multiple rows in `x`.
## β„Ή If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Select the relevant columns from putative_guides and rename 'Name' to match 'guide_name'
guide_info <- cardinal_sgRNAs %>%
  select(
    guide_name = Name,
    Direction,
    X..Local.Off.target.Sites,
    Doench..2016..Activity.Score,
    Zhang..2013..Specificity.Score,
    Min..original.sequence.,
    Max..original.sequence.,
    Target.Sequence,
    mean_Cs
  )

# Join the guide info into all_guide_snp_matches
all_guide_snp_matches_augmented <- all_guide_snp_matches %>%
  left_join(guide_info, by = "guide_name")
## Warning in left_join(., guide_info, by = "guide_name"): Detected an unexpected many-to-many relationship between `x` and `y`.
## β„Ή Row 1 of `x` matches multiple rows in `y`.
## β„Ή Row 1 of `y` matches multiple rows in `x`.
## β„Ή If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
library(ggplot2)
library(dplyr)

# Optional: summarize to get one row per guide if needed (e.g., using max or mean values)
cardinal_sgRNAs_summary <- all_guide_snp_matches_augmented %>%
  group_by(guide_name, Direction) %>%
  summarise(
    Doench_Score = mean(Doench..2016..Activity.Score, na.rm = TRUE),
    Specificity = as.numeric(gsub("%", "", unique(Zhang..2013..Specificity.Score))),
    mean_Cs = mean(mean_Cs, na.rm = TRUE),
    max_af = max(max_af, na.rm = TRUE),
    Target.Sequence = paste(unique(Target.Sequence), collapse = "; "),  # keep all
    .groups = "drop"
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## β„Ή Please use `reframe()` instead.
## β„Ή When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Plot: Doench score vs Specificity Score per guide
ggplot(cardinal_sgRNAs_summary, aes(x = Specificity, y = Doench_Score)) +
  geom_point(aes(color = Direction, size = max_af)) +
  geom_text(aes(label = guide_name), hjust = 1.1, vjust = 0.5, size = 2.5, check_overlap = TRUE) +
  scale_color_manual(values = c("forward" = "steelblue", "reverse" = "firebrick")) +
  scale_size_continuous(range = c(2, 6), name = "Max Allele Freq") +
  labs(
    title = "sgRNA Characteristics by Guide",
    x = "Zhang (2013) Specificity Score (%)",
    y = "Doench (2016) Activity Score",
    color = "Strand Direction"
  ) +
  theme_minimal()

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.