1. load libraries

2. perform infercnv operations to reveal cnv signal


##this is to match the chr-regions to the chr-arms
hg38 = read.delim("../cytoBand_hg38.txt", header = F)
cnv = read.table("../L1/HMM_CNV_predictions.HMMi6.rand_trees.hmm_mode-subclusters.Pnorm_0.5.pred_cnv_regions.dat",header = T) ##from inferCNV resulting files

cytoband = hg38
cytoband <- data.frame(V1=gsub("chr", "", cytoband[,1]), V2=cytoband[,2], V3=cytoband[,3], V4=substring(cytoband$V4, 1, 1), stringsAsFactors=F)
start <- do.call(rbind, lapply(split(cytoband$V2, paste0(cytoband$V1, cytoband$V4)), min))
end <- do.call(rbind, lapply(split(cytoband$V3, paste0(cytoband$V1, cytoband$V4)), max))
cytoband <- data.frame(V1=gsub("p", "", gsub("q", "", rownames(start))), V2=start, V3=end, V4=rownames(start), stringsAsFactors=F)
cytoband <- cytoband [as.vector(unlist(sapply(c(1:22, "X"), function(x) which(cytoband$V1 %in% x)))), ]
cytoband$V4[grep("q", cytoband$V4)] <- "q"
cytoband$V4[grep("p", cytoband$V4)] <- "p"
rownames(cytoband) <- NULL
names(cytoband) = c("chr", "start", "end", "arm")
cytoband$chr_arm = paste0(cytoband$chr, cytoband$arm)
cytoband$chromosome = paste0("chr", cytoband$chr)
cytoband$chr = cytoband$chromosome
cytoband = cytoband[,1:5]


library(dplyr)
cnv$chr = as.character(cnv$chr)
cytoband$chr = as.character(cytoband$chr)
cnv$mid = (as.numeric(cnv$start) + as.numeric(cnv$end))/2
    new = inner_join(cnv,cytoband, by=c("chr"))%>%
          mutate(arm = ifelse(mid >= as.numeric(start.y) & mid <= as.numeric(end.y), chr_arm, NA))%>%
          group_by(chr)%>%
          filter(!is.na(arm)|n()==1)
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
          final_cnv = new[,c(1:6, 10)]

names(final_cnv) = c("cell_group_name", "cnv_name", "state", "chr", "start", "end", "arm")

final_cnv$event = ifelse(final_cnv$state >=4, "gain", "loss")

final_cnv$large_event = paste(final_cnv$arm, final_cnv$event, sep = " ")

final_cnv = final_cnv[!duplicated(final_cnv[,c('cell_group_name', 'large_event')]),]

large_events = unique(final_cnv$large_event)

final_cnv$group = gsub("sample.", "", final_cnv$cell_group_name)

##change the data frame to wide to summarize the event
library(reshape2)
wide = reshape2::dcast(final_cnv, large_event ~ group, value.var = "event", fun.aggregate = length)
##change the table to 0/1
rownames(wide) = wide$large_event
wide = wide[,-1]
wide = as.matrix(ifelse(wide >= 1, 1, 0))
wide = as.data.frame(cbind(wide, total = rowSums(wide)))
new = wide[order(-wide$total),]
new$"1.1.1" = ifelse((new$`1.1.1.1` + new$`1.1.1.2`) >=2, 1, 0)
Error in `$<-.data.frame`(`*tmp*`, "1.1.1", value = logical(0)) : 
  replacement has 0 rows, data has 64

3. perform infercnv operations to reveal cnv signal


library(dplyr)
library(reshape2)

# Read input files
hg38 = read.delim("../cytoBand_hg38.txt", header = F)
cnv = read.table("../L1/HMM_CNV_predictions.HMMi6.rand_trees.hmm_mode-subclusters.Pnorm_0.5.pred_cnv_regions.dat", header = T)

# Process cytoband data
cytoband = hg38
cytoband <- data.frame(V1=gsub("chr", "", cytoband[,1]), V2=cytoband[,2], V3=cytoband[,3], V4=substring(cytoband$V4, 1, 1), stringsAsFactors=F)
start <- do.call(rbind, lapply(split(cytoband$V2, paste0(cytoband$V1, cytoband$V4)), min))
end <- do.call(rbind, lapply(split(cytoband$V3, paste0(cytoband$V1, cytoband$V4)), max))
cytoband <- data.frame(V1=gsub("p", "", gsub("q", "", rownames(start))), V2=start, V3=end, V4=rownames(start), stringsAsFactors=F)
cytoband <- cytoband[as.vector(unlist(sapply(c(1:22, "X"), function(x) which(cytoband$V1 %in% x)))), ]
cytoband$V4[grep("q", cytoband$V4)] <- "q"
cytoband$V4[grep("p", cytoband$V4)] <- "p"
rownames(cytoband) <- NULL
names(cytoband) = c("chr", "start", "end", "arm")
cytoband$chr_arm = paste0(cytoband$chr, cytoband$arm)
cytoband$chromosome = paste0("chr", cytoband$chr)
cytoband$chr = cytoband$chromosome
cytoband = cytoband[,1:5]

# Process CNV data
cnv$chr = as.character(cnv$chr)
cytoband$chr = as.character(cytoband$chr)
cnv$mid = (as.numeric(cnv$start) + as.numeric(cnv$end))/2

# Join CNV and cytoband data
new = inner_join(cnv, cytoband, by=c("chr")) %>%
      mutate(arm = ifelse(mid >= as.numeric(start.y) & mid <= as.numeric(end.y), chr_arm, NA)) %>%
      group_by(chr) %>%
      filter(!is.na(arm) | n()==1)
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
final_cnv = new[,c(1:6, 10)]

names(final_cnv) = c("cell_group_name", "cnv_name", "state", "chr", "start", "end", "arm")

final_cnv$event = ifelse(final_cnv$state >= 4, "gain", "loss")
final_cnv$large_event = paste(final_cnv$arm, final_cnv$event, sep = " ")
final_cnv = final_cnv[!duplicated(final_cnv[,c('cell_group_name', 'large_event')]),]
final_cnv$group = gsub("sample.", "", final_cnv$cell_group_name)

# Create wide format data
wide = dcast(final_cnv, large_event ~ group, value.var = "event", fun.aggregate = length)
rownames(wide) = wide$large_event
wide = wide[,-1]
wide = as.matrix(ifelse(wide >= 1, 1, 0))
wide = as.data.frame(cbind(wide, total = rowSums(wide)))
new = wide[order(-wide$total),]

# Create new columns based on existing ones
new$"L1.L1.1.1" = ifelse((new$L1.L1.1.1.1.1 + new$L1.L1.1.1.1.2) >= 2, 1, 0)
new$"L1.L1.1.2" = ifelse((new$L1.L1.1.2.1.1 + new$L1.L1.1.2.1.2) >= 2, 1, 0)
new$"L1.L1.1" = ifelse((new$L1.L1.1.1 + new$L1.L1.1.2) >= 2, 1, 0)
new$"L1.L1.2.1" = ifelse((new$L1.L1.1.2.2.1 + new$L1.L1.1.2.2.2) >= 2, 1, 0)
new$"L1.L1.2" = new$L1.L1.2.1
new$"L1.L1" = ifelse((new$L1.L1.1 + new$L1.L1.2) >= 2, 1, 0)

# Reorder columns
new_order = c("L1.L1", "L1.L1.1", "L1.L1.1.1", "L1.L1.1.1.1.1", "L1.L1.1.1.1.2", 
              "L1.L1.1.2", "L1.L1.1.2.1.1", "L1.L1.1.2.1.2", 
              "L1.L1.2", "L1.L1.2.1", "L1.L1.1.2.2.1", "L1.L1.1.2.2.2", "total")
new = new[, c(new_order, setdiff(names(new), new_order))]

write.csv(new, "../L1/events.csv", row.names = TRUE)

# Calculate percentage of each CNV event
tryCatch({
  # Read the file as a single column
  cell_group = read.csv("../../uphyloplot2-2.3/Inputs/L1.cell_groupings", 
                        header = FALSE, stringsAsFactors = FALSE)
  
  # Split the column into two based on the tab character
  cell_group = data.frame(do.call('rbind', strsplit(as.character(cell_group$V1), '\t', fixed=TRUE)))
  colnames(cell_group) = c("cell_group_name", "cell")
  
  # Remove the header row
  cell_group = cell_group[-1,]
  
  print("Structure of cell_group:")
  print(str(cell_group))
  
  # Create group dataframe
  group = as.data.frame(table(cell_group$cell_group_name))
  colnames(group) = c("cell_group_name", "count")
  rownames(group) = group$cell_group_name
  
  print("Structure of group:")
  print(str(group))
  
  # Assuming 'new' is your dataframe with CNV events
  per = new[, setdiff(names(new), "total")]
  
  print("Columns of per:")
  print(colnames(per))
  
  print("Rows of group:")
  print(rownames(group))
  
  matching_rows = intersect(rownames(group), colnames(per))
  if(length(matching_rows) == 0) {
    stop("No matching row names between group and columns of per.")
  }
  
  # Calculate percentages
  percentage = per[, matching_rows] * group[matching_rows, "count"]
  
  # Calculate total cells for each CNV event
  total_cells = rowSums(percentage)
  
  # Convert counts to percentages
  percentage = sweep(percentage, 1, total_cells, "/") * 100
  
  # Add total percentage column
  percentage$sample = rowSums(percentage)
  
  write.csv(percentage, "../L1/sample_events_percentage.csv", row.names = TRUE)
  
  print("Percentage calculation completed successfully.")
}, error = function(e) {
  print(paste("An error occurred:", e$message))
})
[1] "Structure of cell_group:"
'data.frame':   5825 obs. of  2 variables:
 $ cell_group_name: chr  "L1.L1.1.1.1.1" "L1.L1.1.1.1.1" "L1.L1.1.1.1.1" "L1.L1.1.1.1.1" ...
 $ cell           : chr  "L1_AAACCTGAGGGCTTCC-1" "L1_AAACCTGGTGCAGGTA-1" "L1_AAACCTGTCCCTGACT-1" "L1_AAACCTGTCCTTCAAT-1" ...
NULL
[1] "Structure of group:"
'data.frame':   8 obs. of  2 variables:
 $ cell_group_name: Factor w/ 8 levels "L1.L1.1.1.1.1",..: 1 2 3 4 5 6 7 8
 $ count          : int  1462 14 211 151 2564 1025 325 73
NULL
[1] "Columns of per:"
 [1] "L1.L1"             "L1.L1.1"           "L1.L1.1.1"         "L1.L1.1.1.1.1"     "L1.L1.1.1.1.2"     "L1.L1.1.2"         "L1.L1.1.2.1.1"     "L1.L1.1.2.1.2"    
 [9] "L1.L1.2"           "L1.L1.2.1"         "L1.L1.1.2.2.1"     "L1.L1.1.2.2.2"     "L1.L1.1.1.2.1"     "L1.L1.1.1.2.2"     "PBMC.PBMC.1.1.1.2" "PBMC.PBMC.1.2.1.1"
[17] "PBMC.PBMC.1.2.1.2" "PBMC.PBMC.1.2.2"  
[1] "Rows of group:"
[1] "L1.L1.1.1.1.1" "L1.L1.1.1.1.2" "L1.L1.1.1.2.1" "L1.L1.1.1.2.2" "L1.L1.1.2.1.1" "L1.L1.1.2.1.2" "L1.L1.1.2.2.1" "L1.L1.1.2.2.2"
[1] "Percentage calculation completed successfully."

2. perform infercnv operations to reveal cnv signal



library(dplyr)
library(readr)

# Read the .cell_groupings file, skipping the first row which is an extra header
cell_groupings_file <- "../L7/L7.cell_groupings"
cell_mapping <- read_tsv(cell_groupings_file, col_names = c("cell_group_name", "cell_id"))
Rows: 5332 Columns: 2── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): cell_group_name, cell_id
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Filter for only L1 cells
cell_mapping <- cell_mapping %>%
  filter(grepl("^L7_", cell_id))

# Write the mapping to a CSV file
write_csv(cell_mapping, "../L7/cell_to_group_mapping.csv")

# Print the first few rows to verify
print(head(cell_mapping))

# Print the number of rows to verify it's not empty
print(paste("Number of rows in cell_mapping:", nrow(cell_mapping)))
[1] "Number of rows in cell_mapping: 5331"
library(dplyr)
library(tidyr)

# Read the CNV data
cnv_data <- read.csv("../L7/L7_infercnv_with_chr_bands.csv", stringsAsFactors = FALSE)

# Read the cell to cell group mapping file
# This file should have columns: cell_id, cell_group_name
cell_mapping <- read.csv("../L7/cell_to_group_mapping.csv", stringsAsFactors = FALSE)

# Join the CNV data with the cell mapping
expanded_cnv_data <- cnv_data %>%
  left_join(cell_mapping, by = "cell_group_name") %>%
  select(-cell_group_name) %>%  # Remove the cell group column
  rename(cell_id = cell_id)  # Rename for clarity

# Count total number of cells
total_cells <- nrow(cell_mapping)

# Process the expanded data
cnv_summary <- expanded_cnv_data %>%
  # Group by chromosome band and CNV state
  group_by(chr_band, state) %>%
  # Count cells for each CNV event
  summarize(
    cell_count = n(),
    .groups = 'drop'
  ) %>%
  # Calculate percentage
  mutate(
    percentage = (cell_count / total_cells) * 100,
    cnv_type = case_when(
      state == 1 ~ "Complete Loss (0x)",
      state == 2 ~ "Loss of One Copy (0.5x)",
      state == 3 ~ "Neutral (1x)",
      state == 4 ~ "Addition of One Copy (1.5x)",
      state == 5 ~ "Addition of Two Copies (2x)",
      state == 6 ~ "Placeholder for >2x Copies (3x)"
    )
  ) %>%
  # Select relevant columns and sort
  select(chr_band, cnv_type, cell_count, percentage) %>%
  arrange(desc(percentage))

# Display the results
print(cnv_summary)

# Optionally, write to a CSV file
write.csv(cnv_summary, "../L7/cnv_event_percentages_by_cell.csv", row.names = FALSE)

Visualization_L1


# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L1/cnv_event_percentages_by_cell.csv")
Rows: 1598 Columns: 4── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): chr_band, cnv_type
dbl (2): cell_count, percentage
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 10)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >10% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >10% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

NA
NA
NA
NA

Visualization_L2


# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L2/cnv_event_percentages_by_cell.csv")
Rows: 1631 Columns: 4── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): chr_band, cnv_type
dbl (2): cell_count, percentage
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 15)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >15% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >15% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

NA
NA
NA
NA

Visualization_L3


# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L3/cnv_event_percentages_by_cell.csv")
Rows: 1569 Columns: 4── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): chr_band, cnv_type
dbl (2): cell_count, percentage
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 15)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >15% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >15% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

NA
NA
NA
NA

Visualization_L4


# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L4/cnv_event_percentages_by_cell.csv")
Rows: 2090 Columns: 4── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): chr_band, cnv_type
dbl (2): cell_count, percentage
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 15)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >15% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >15% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

NA
NA
NA
NA

Visualization_L7


# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L7/cnv_event_percentages_by_cell.csv")
Rows: 2085 Columns: 4── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): chr_band, cnv_type
dbl (2): cell_count, percentage
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 15)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >15% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >15% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

NA
NA
NA
NA
---
title: "InferCNV Analysis-Silvia-Scripts"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}


# Load required libraries
library(ggplot2)
library(dplyr)
library(readr)


```


# 2. perform infercnv operations to reveal cnv signal
```{r CNV_signal, fig.height=6, fig.width=10}

##this is to match the chr-regions to the chr-arms
# hg38 = read.delim("../cytoBand_hg38.txt", header = F)
# cnv = read.table("../L1/HMM_CNV_predictions.HMMi6.rand_trees.hmm_mode-subclusters.Pnorm_0.5.pred_cnv_regions.dat",header = T) ##from inferCNV resulting files
# 
# cytoband = hg38
# cytoband <- data.frame(V1=gsub("chr", "", cytoband[,1]), V2=cytoband[,2], V3=cytoband[,3], V4=substring(cytoband$V4, 1, 1), stringsAsFactors=F)
# start <- do.call(rbind, lapply(split(cytoband$V2, paste0(cytoband$V1, cytoband$V4)), min))
# end <- do.call(rbind, lapply(split(cytoband$V3, paste0(cytoband$V1, cytoband$V4)), max))
# cytoband <- data.frame(V1=gsub("p", "", gsub("q", "", rownames(start))), V2=start, V3=end, V4=rownames(start), stringsAsFactors=F)
# cytoband <- cytoband [as.vector(unlist(sapply(c(1:22, "X"), function(x) which(cytoband$V1 %in% x)))), ]
# cytoband$V4[grep("q", cytoband$V4)] <- "q"
# cytoband$V4[grep("p", cytoband$V4)] <- "p"
# rownames(cytoband) <- NULL
# names(cytoband) = c("chr", "start", "end", "arm")
# cytoband$chr_arm = paste0(cytoband$chr, cytoband$arm)
# cytoband$chromosome = paste0("chr", cytoband$chr)
# cytoband$chr = cytoband$chromosome
# cytoband = cytoband[,1:5]
# 
# 
# library(dplyr)
# cnv$chr = as.character(cnv$chr)
# cytoband$chr = as.character(cytoband$chr)
# cnv$mid = (as.numeric(cnv$start) + as.numeric(cnv$end))/2
#     new = inner_join(cnv,cytoband, by=c("chr"))%>%
#           mutate(arm = ifelse(mid >= as.numeric(start.y) & mid <= as.numeric(end.y), chr_arm, NA))%>%
#           group_by(chr)%>%
#           filter(!is.na(arm)|n()==1)
#           final_cnv = new[,c(1:6, 10)]
# 
# names(final_cnv) = c("cell_group_name", "cnv_name", "state", "chr", "start", "end", "arm")
# 
# final_cnv$event = ifelse(final_cnv$state >=4, "gain", "loss")
# 
# final_cnv$large_event = paste(final_cnv$arm, final_cnv$event, sep = " ")
# 
# final_cnv = final_cnv[!duplicated(final_cnv[,c('cell_group_name', 'large_event')]),]
# 
# large_events = unique(final_cnv$large_event)
# 
# final_cnv$group = gsub("sample.", "", final_cnv$cell_group_name)
# 
# ##change the data frame to wide to summarize the event
# library(reshape2)
# wide = reshape2::dcast(final_cnv, large_event ~ group, value.var = "event", fun.aggregate = length)
# ##change the table to 0/1
# rownames(wide) = wide$large_event
# wide = wide[,-1]
# wide = as.matrix(ifelse(wide >= 1, 1, 0))
# wide = as.data.frame(cbind(wide, total = rowSums(wide)))
# new = wide[order(-wide$total),]
# new$"1.1.1" = ifelse((new$`1.1.1.1` + new$`1.1.1.2`) >=2, 1, 0)
# new$"1.1.2" = ifelse((new$`1.1.2.1` + new$`1.1.2.2`) >=2, 1, 0)
# new$"1.1" = ifelse((new$`1.1.1` + new$`1.1.2`) >=2, 1, 0)
# new$"1.2.1" = ifelse((new$`1.2.1.1` + new$`1.2.1.2`) >= 2, 1, 0)
# #new$"1.2.2" = ifelse((new$`1.2.2.1` + new$`1.2.2.2`) >= 2, 1, 0)
# new$"1.2" = ifelse((new$`1.2.1` + new$`1.2.2`) >=2, 1, 0)
# new$"1" = ifelse((new$`1.1` + new$`1.2`) >=2, 1, 0)
# new = new[,c(14,11,9,1,2,10,3,4,13,12,5,6,7,8)]
# write.csv(new, "../L1/events.csv")
# 
# ##read in the cell grouping data from UPhyloplot2 to calculate the cell number in each branch
# cells = read.table("../../uphyloplot2-2.3/Inputs/L1.cell_groupings", header = T)
# sum = as.data.frame(table(cells$cell_group_name))
# write.csv(sum, "../L1/cell_group_frequency.csv")
# ##calculate the percentage of each CNV event
# cell_group = read.csv("17_HMM_predHMMi6.rand_trees.hmm_mode-subclusters.cell_groupings", header = F)
# group = as.data.frame(cell_group[,2])
# rownames(group) = cell_group[,1]
# per = new[,-14]
# percentage = per*group[,1][col(per)]
# percentage$sample = rowSums(percentage)
# write.csv(percentage, "../L1/sample_events_percentage.csv")

```
# 3. perform infercnv operations to reveal cnv signal
```{r CNV_signal2, fig.height=6, fig.width=10}

library(dplyr)
library(reshape2)

# Read input files
hg38 = read.delim("../cytoBand_hg38.txt", header = F)
cnv = read.table("../L1/HMM_CNV_predictions.HMMi6.rand_trees.hmm_mode-subclusters.Pnorm_0.5.pred_cnv_regions.dat", header = T)

# Process cytoband data
cytoband = hg38
cytoband <- data.frame(V1=gsub("chr", "", cytoband[,1]), V2=cytoband[,2], V3=cytoband[,3], V4=substring(cytoband$V4, 1, 1), stringsAsFactors=F)
start <- do.call(rbind, lapply(split(cytoband$V2, paste0(cytoband$V1, cytoband$V4)), min))
end <- do.call(rbind, lapply(split(cytoband$V3, paste0(cytoband$V1, cytoband$V4)), max))
cytoband <- data.frame(V1=gsub("p", "", gsub("q", "", rownames(start))), V2=start, V3=end, V4=rownames(start), stringsAsFactors=F)
cytoband <- cytoband[as.vector(unlist(sapply(c(1:22, "X"), function(x) which(cytoband$V1 %in% x)))), ]
cytoband$V4[grep("q", cytoband$V4)] <- "q"
cytoband$V4[grep("p", cytoband$V4)] <- "p"
rownames(cytoband) <- NULL
names(cytoband) = c("chr", "start", "end", "arm")
cytoband$chr_arm = paste0(cytoband$chr, cytoband$arm)
cytoband$chromosome = paste0("chr", cytoband$chr)
cytoband$chr = cytoband$chromosome
cytoband = cytoband[,1:5]

# Process CNV data
cnv$chr = as.character(cnv$chr)
cytoband$chr = as.character(cytoband$chr)
cnv$mid = (as.numeric(cnv$start) + as.numeric(cnv$end))/2

# Join CNV and cytoband data
new = inner_join(cnv, cytoband, by=c("chr")) %>%
      mutate(arm = ifelse(mid >= as.numeric(start.y) & mid <= as.numeric(end.y), chr_arm, NA)) %>%
      group_by(chr) %>%
      filter(!is.na(arm) | n()==1)
final_cnv = new[,c(1:6, 10)]

names(final_cnv) = c("cell_group_name", "cnv_name", "state", "chr", "start", "end", "arm")

final_cnv$event = ifelse(final_cnv$state >= 4, "gain", "loss")
final_cnv$large_event = paste(final_cnv$arm, final_cnv$event, sep = " ")
final_cnv = final_cnv[!duplicated(final_cnv[,c('cell_group_name', 'large_event')]),]
final_cnv$group = gsub("sample.", "", final_cnv$cell_group_name)

# Create wide format data
wide = dcast(final_cnv, large_event ~ group, value.var = "event", fun.aggregate = length)
rownames(wide) = wide$large_event
wide = wide[,-1]
wide = as.matrix(ifelse(wide >= 1, 1, 0))
wide = as.data.frame(cbind(wide, total = rowSums(wide)))
new = wide[order(-wide$total),]

# Create new columns based on existing ones
new$"L1.L1.1.1" = ifelse((new$L1.L1.1.1.1.1 + new$L1.L1.1.1.1.2) >= 2, 1, 0)
new$"L1.L1.1.2" = ifelse((new$L1.L1.1.2.1.1 + new$L1.L1.1.2.1.2) >= 2, 1, 0)
new$"L1.L1.1" = ifelse((new$L1.L1.1.1 + new$L1.L1.1.2) >= 2, 1, 0)
new$"L1.L1.2.1" = ifelse((new$L1.L1.1.2.2.1 + new$L1.L1.1.2.2.2) >= 2, 1, 0)
new$"L1.L1.2" = new$L1.L1.2.1
new$"L1.L1" = ifelse((new$L1.L1.1 + new$L1.L1.2) >= 2, 1, 0)

# Reorder columns
new_order = c("L1.L1", "L1.L1.1", "L1.L1.1.1", "L1.L1.1.1.1.1", "L1.L1.1.1.1.2", 
              "L1.L1.1.2", "L1.L1.1.2.1.1", "L1.L1.1.2.1.2", 
              "L1.L1.2", "L1.L1.2.1", "L1.L1.1.2.2.1", "L1.L1.1.2.2.2", "total")
new = new[, c(new_order, setdiff(names(new), new_order))]

write.csv(new, "../L1/events.csv", row.names = TRUE)

# Calculate percentage of each CNV event
tryCatch({
  # Read the file as a single column
  cell_group = read.csv("../../uphyloplot2-2.3/Inputs/L1.cell_groupings", 
                        header = FALSE, stringsAsFactors = FALSE)
  
  # Split the column into two based on the tab character
  cell_group = data.frame(do.call('rbind', strsplit(as.character(cell_group$V1), '\t', fixed=TRUE)))
  colnames(cell_group) = c("cell_group_name", "cell")
  
  # Remove the header row
  cell_group = cell_group[-1,]
  
  print("Structure of cell_group:")
  print(str(cell_group))
  
  # Create group dataframe
  group = as.data.frame(table(cell_group$cell_group_name))
  colnames(group) = c("cell_group_name", "count")
  rownames(group) = group$cell_group_name
  
  print("Structure of group:")
  print(str(group))
  
  # Assuming 'new' is your dataframe with CNV events
  per = new[, setdiff(names(new), "total")]
  
  print("Columns of per:")
  print(colnames(per))
  
  print("Rows of group:")
  print(rownames(group))
  
  matching_rows = intersect(rownames(group), colnames(per))
  if(length(matching_rows) == 0) {
    stop("No matching row names between group and columns of per.")
  }
  
  # Calculate percentages
  percentage = per[, matching_rows] * group[matching_rows, "count"]
  
  # Calculate total cells for each CNV event
  total_cells = rowSums(percentage)
  
  # Convert counts to percentages
  percentage = sweep(percentage, 1, total_cells, "/") * 100
  
  # Add total percentage column
  percentage$sample = rowSums(percentage)
  
  write.csv(percentage, "../L1/sample_events_percentage.csv", row.names = TRUE)
  
  print("Percentage calculation completed successfully.")
}, error = function(e) {
  print(paste("An error occurred:", e$message))
})
```

# 2. perform infercnv operations to reveal cnv signal
```{r CNV_signal3, fig.height=6, fig.width=10}


library(dplyr)
library(readr)

# Read the .cell_groupings file, skipping the first row which is an extra header
cell_groupings_file <- "../L7/L7.cell_groupings"
cell_mapping <- read_tsv(cell_groupings_file, col_names = c("cell_group_name", "cell_id"))

# Filter for only L1 cells
cell_mapping <- cell_mapping %>%
  filter(grepl("^L7_", cell_id))

# Write the mapping to a CSV file
write_csv(cell_mapping, "../L7/cell_to_group_mapping.csv")

# Print the first few rows to verify
print(head(cell_mapping))

# Print the number of rows to verify it's not empty
print(paste("Number of rows in cell_mapping:", nrow(cell_mapping)))






library(dplyr)
library(tidyr)

# Read the CNV data
cnv_data <- read.csv("../L7/L7_infercnv_with_chr_bands.csv", stringsAsFactors = FALSE)

# Read the cell to cell group mapping file
# This file should have columns: cell_id, cell_group_name
cell_mapping <- read.csv("../L7/cell_to_group_mapping.csv", stringsAsFactors = FALSE)

# Join the CNV data with the cell mapping
expanded_cnv_data <- cnv_data %>%
  left_join(cell_mapping, by = "cell_group_name") %>%
  select(-cell_group_name) %>%  # Remove the cell group column
  rename(cell_id = cell_id)  # Rename for clarity

# Count total number of cells
total_cells <- nrow(cell_mapping)

# Process the expanded data
cnv_summary <- expanded_cnv_data %>%
  # Group by chromosome band and CNV state
  group_by(chr_band, state) %>%
  # Count cells for each CNV event
  summarize(
    cell_count = n(),
    .groups = 'drop'
  ) %>%
  # Calculate percentage
  mutate(
    percentage = (cell_count / total_cells) * 100,
    cnv_type = case_when(
      state == 1 ~ "Complete Loss (0x)",
      state == 2 ~ "Loss of One Copy (0.5x)",
      state == 3 ~ "Neutral (1x)",
      state == 4 ~ "Addition of One Copy (1.5x)",
      state == 5 ~ "Addition of Two Copies (2x)",
      state == 6 ~ "Placeholder for >2x Copies (3x)"
    )
  ) %>%
  # Select relevant columns and sort
  select(chr_band, cnv_type, cell_count, percentage) %>%
  arrange(desc(percentage))

# Display the results
print(cnv_summary)

# Optionally, write to a CSV file
write.csv(cnv_summary, "../L7/cnv_event_percentages_by_cell.csv", row.names = FALSE)

```


## Visualization_L1
```{r visualize1, fig.height=8, fig.width=12}

# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L1/cnv_event_percentages_by_cell.csv")

# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 10)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >10% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >10% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )




```


## Visualization_L2
```{r visualize2, fig.height=8, fig.width=12}

# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L2/cnv_event_percentages_by_cell.csv")

# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 15)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >15% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >15% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )




```


## Visualization_L3
```{r visualize3, fig.height=8, fig.width=12}

# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L3/cnv_event_percentages_by_cell.csv")

# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 15)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >15% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >15% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )




```



## Visualization_L4
```{r visualize4, fig.height=8, fig.width=12}

# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L4/cnv_event_percentages_by_cell.csv")

# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 15)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >15% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >15% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )




```

## Visualization_L7
```{r visualize7, fig.height=8, fig.width=12}

# Load necessary libraries
library(dplyr)
library(readr)
library(ggplot2)
library(gtable)

# Step 1: Read the CNV data file
significant_cnv_data <- read_csv("../L7/cnv_event_percentages_by_cell.csv")

# Verify that the data was read correctly
print(head(significant_cnv_data))

# Step 2: Filter data to only include rows with a percentage > 5%
filtered_cnv_data <- significant_cnv_data %>%
  filter(percentage > 15)  # Change 5 to 90 if you want to filter for >90%

# Step 3: Create the bar plot using the filtered data
ggplot(filtered_cnv_data, aes(x = percentage, y = reorder(chr_band, percentage), fill = cnv_type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +  # Adjusted bar width
  scale_fill_manual(values = c("Complete Loss (0x)" = "#fdae61",  # Orange for complete loss
                               "Loss of One Copy (0.5x)" = "#313695",  # Blue for partial loss
                               "Addition of One Copy (1.5x)" = "red",  # Red for gain
                               "Addition of Two Copies (2x)" = "darkgreen")) +  # Green for higher gain
  labs(
    title = "Significant CNVs Affecting >15% of Cells",  # Update title accordingly
    subtitle = "Chromosomal Bands with CNVs in >15% of Cells",  # Update subtitle accordingly
    x = "Percentage of Cells (%)",
    y = "Chromosomal Band",
    fill = "CNV Type"
  ) +
  theme_minimal(base_size = 14) +  # Adjust text size for readability
  theme(
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 12),
    axis.text.x = element_text(size = 12),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )




```





