# Load Dataset
dataset_s2 <- read.csv("DatasetS2.csv")
# List of PGSC Gene IDs of Interest
pgsc_gene_ids <- c(
"PGSC0003DMG402029202", "PGSC0003DMG400032587", "PGSC0003DMG400009854",
"PGSC0003DMG400029200", "PGSC0003DMG400029201", "PGSC0003DMG402029198",
"PGSC0003DMG400005598", "PGSC0003DMG400031568", "PGSC0003DMG400031559",
"PGSC0003DMG400016402", "PGSC0003DMG400014722", "PGSC0003DMG400006855",
"PGSC0003DMG400026049", "PGSC0003DMG400041045", "PGSC0003DMG400025991",
"PGSC0003DMG400046672", "PGSC0003DMG400025989", "PGSC0003DMG400026048",
"PGSC0003DMG400040573", "PGSC0003DMG400012763", "PGSC0003DMG400018245",
"PGSC0003DMG400018286", "PGSC0003DMG400018284", "PGSC0003DMG400005105",
"PGSC0003DMG400025871", "PGSC0003DMG400025873", "PGSC0003DMG400025874",
"PGSC0003DMG400018641", "PGSC0003DMG400011801", "PGSC0003DMG400003461",
"PGSC0003DMG400003324", "PGSC0003DMG400037578", "PGSC0003DMG400021142",
"PGSC0003DMG400013663", "PGSC0003DMG400001374", "PGSC0003DMG400018679",
"PGSC0003DMG400028702", "PGSC0003DMG400012790", "PGSC0003DMG401031204",
"PGSC0003DMG400020965", "PGSC0003DMG400004923", "PGSC0003DMG400009924",
"PGSC0003DMG400043233", "PGSC0003DMG400003022", "PGSC0003DMG400007027",
"PGSC0003DMG400002156", "PGSC0003DMG400013034", "PGSC0003DMG400018066",
"PGSC0003DMG400009832", "PGSC0003DMG400019784", "PGSC0003DMG400019783",
"PGSC0003DMG400019782", "PGSC0003DMG400019773", "PGSC0003DMG400024420",
"PGSC0003DMG400005931", "PGSC0003DMG400030422", "PGSC0003DMG400027684",
"PGSC0003DMG400011740", "PGSC0003DMG400011749", "PGSC0003DMG400017379",
"PGSC0003DMG400035925", "PGSC0003DMG401015022", "PGSC0003DMG402015022",
"PGSC0003DMG400024442", "PGSC0003DMG400024441", "PGSC0003DMG400024440",
"PGSC0003DMG400010593", "PGSC0003DMG400010592", "PGSC0003DMG400010585",
"PGSC0003DMG400004305", "PGSC0003DMG400017508", "PGSC0003DMG400003929",
"PGSC0003DMG400022749", "PGSC0003DMG400022750", "PGSC0003DMG400002720",
"PGSC0003DMG400035185", "PGSC0003DMG400042285", "PGSC0003DMG400020939",
"PGSC0003DMG400003408", "PGSC0003DMG400018937", "PGSC0003DMG400034947",
"PGSC0003DMG400039005", "PGSC0003DMG400008184", "PGSC0003DMG401031070",
"PGSC0003DMG402031070", "PGSC0003DMG400034115", "PGSC0003DMG400034116",
"PGSC0003DMG400027337", "PGSC0003DMG400047267", "PGSC0003DMG400046343",
"PGSC0003DMG400020680", "PGSC0003DMG401018966", "PGSC0003DMG400030301",
"PGSC0003DMG400045253", "PGSC0003DMG400039789", "PGSC0003DMG400015437",
"PGSC0003DMG400015438", "PGSC0003DMG400015439", "PGSC0003DMG400004633"
)
columns_of_interest1 <- c(
"PI292110_stn", "PI230512_stn", "PI365344_stn", "PI558142_adg", "PI320355_phu",
"PI243469_phu", "PI225710_phu", "PI234011_stn", "PI195191_phu", "PI214421_adg",
"PI234013_stn", "PI245940_tub", "PI258874_adg", "PI258885_adg", "PI245935_tub",
"PI245847_tub", "PI546023_adg", "PI365345_adg", "PI607886_adg", "Spunta",
"EarlyRose", "RussetBurbank", "Missaukee", "Burbank", "Norland", "PurpleMajesty",
"Katahdin", "PremierRusset", "Shepody", "Snowden", "YukonGold", "Atlantic",
"RussetNorkotah", "SierraGold", "RioGrandeRusset", "IrishCobbler", "DakotaDiamond",
"Kennebec", "Kalkaska", "Superior", "GarnetChili", "MountainRose", "PI210044_cnd",
"PI195204_stn", "PI265863_cnd", "PI296126_rap", "PI265872_med", "PI546000_blv_mga",
"PI473385_brc_spl", "PI458355_mcd", "PI458368_oka", "PI473305_vrn", "PI275139_chc",
"PI473065_brc_grl", "PI545987_brc_lph", "PI458324_ifd", "PI498359_ktz",
"PI498112_brc_brc", "PI472978_brc_spg", "PI458365_ber", "PI545964_blv",
"PI275260_ver", "PI558050_cmm", "PI365339_chom", "PI275216_ehr", "PI458424_jam",
"PI558288_etub"
)
# Filter
matched_data <- dataset_s2[dataset_s2$Gene.ID %in% pgsc_gene_ids, ]
heatmap_data <- matched_data[, columns_of_interest1, drop = FALSE]
# Numeric Data
heatmap_data <- as.data.frame(lapply(heatmap_data, as.numeric))
# NAs= 0
heatmap_data[is.na(heatmap_data)] <- 0
# Create a matrix
heatmap_matrix <- as.matrix(heatmap_data)
# Generate the heatmap
library(pheatmap)
pheatmap(heatmap_matrix,
color = colorRampPalette(c("blue", "white", "red"))(50),
cluster_rows = TRUE,
cluster_cols = TRUE,
main = "Heatmap of Gene Copies")

# Filter Data
matched_data <- dataset_s2[dataset_s2$Gene.ID %in% pgsc_gene_ids, ]
columns_of_interest <- c(
"RussetBurbank", "Atlantic", "Katahdin",
"PI210044_cnd", "PI265863_cnd", "PI558050_cmm"
)
heatmap_data <- matched_data[, columns_of_interest, drop = FALSE]
# Ensure Numeric Data
heatmap_data <- as.data.frame(lapply(heatmap_data, as.numeric))
heatmap_data[is.na(heatmap_data)] <- 0
rownames(heatmap_data) <- matched_data$Gene.ID
# Filter Data
matched_data <- dataset_s2[dataset_s2$Gene.ID %in% pgsc_gene_ids, ]
columns_of_interest <- c(
"RussetBurbank", "Atlantic", "Katahdin",
"PI210044_cnd", "PI265863_cnd", "PI558050_cmm"
)
heatmap_data <- matched_data[, columns_of_interest, drop = FALSE]
# Ensure Numeric Data
heatmap_data <- as.data.frame(lapply(heatmap_data, as.numeric))
heatmap_data[is.na(heatmap_data)] <- 0
rownames(heatmap_data) <- matched_data$Gene.ID
# Create Heatmap Matrix
heatmap_matrix <- as.matrix(heatmap_data)
# Generate Heatmap
# Create Heatmap Matrix
heatmap_matrix <- as.matrix(heatmap_data)
# Generate Heatmap
pheatmap(
heatmap_matrix,
color = colorRampPalette(c("blue", "white", "red"))(20),
cluster_rows = TRUE,
cluster_cols = TRUE,
main = "Heatmap of Top Genes"
)

# Asignar nombres de genes a las filas de heatmap_data si no estƔn
rownames(heatmap_data) <- matched_data$Gene.ID # matched_data debe contener Gene.ID
# Definir las columnas comerciales y silvestres
commercial_cols <- c("RussetBurbank", "Atlantic", "Katahdin") # Ajustar a las muestras comerciales
wild_cols <- c("PI210044_cnd", "PI265863_cnd", "PI558050_cmm") # Ajustar a las muestras silvestres
# Calcular promedios de genes para variedades comerciales y silvestres
commercial_means <- rowMeans(heatmap_data[, commercial_cols, drop = FALSE], na.rm = TRUE)
wild_means <- rowMeans(heatmap_data[, wild_cols, drop = FALSE], na.rm = TRUE)
# Crear un data frame para la comparación
comparison <- data.frame(
Gene = rownames(heatmap_data),
Commercial = commercial_means,
Wild = wild_means
)
# Revisar estructura de los datos
print(head(comparison))
## Gene Commercial Wild
## PGSC0003DMG402029202 PGSC0003DMG402029202 1.00000000 0.4243333
## PGSC0003DMG400032587 PGSC0003DMG400032587 0.08333333 0.1666667
## PGSC0003DMG400009854 PGSC0003DMG400009854 0.08333333 0.1396667
## PGSC0003DMG400029200 PGSC0003DMG400029200 0.53633333 0.4270000
## PGSC0003DMG400029201 PGSC0003DMG400029201 1.14200000 1.2090000
## PGSC0003DMG402029198 PGSC0003DMG402029198 0.96300000 2.1936667
# Cargar ggplot2 para graficar
library(ggplot2)
# Crear el grƔfico de barras
ggplot(comparison, aes(x = Gene)) +
geom_bar(aes(y = Commercial, fill = "Commercial"), stat = "identity", position = "dodge") +
geom_bar(aes(y = Wild, fill = "Wild"), stat = "identity", position = "dodge") +
scale_fill_manual(values = c("Commercial" = "red", "Wild" = "blue")) +
labs(
title = "Comparison of Gene Copy Numbers Between Commercial and Wild Varieties",
x = "Genes",
y = "Average Copy Number",
fill = "Group"
) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, size = 8), # Rotar nombres de genes
plot.title = element_text(size = 16, face = "bold")
)

# Calculate Averages
commercial_samples <- c("RussetBurbank", "Atlantic", "Katahdin")
wild_samples <- c("PI210044_cnd", "PI265863_cnd", "PI558050_cmm")
comparison <- data.frame(
Gene = rownames(heatmap_data),
Commercial = rowMeans(heatmap_data[, commercial_samples], na.rm = TRUE),
Wild = rowMeans(heatmap_data[, wild_samples], na.rm = TRUE)
)
# Add Difference Column
comparison$Difference <- abs(comparison$Commercial - comparison$Wild)
# Sort and Select Top Genes
top_genes <- head(comparison[order(-comparison$Difference), ], 10)
# Plot
ggplot(top_genes, aes(x = reorder(Gene, -Difference))) +
geom_bar(aes(y = Commercial, fill = "Commercial"), stat = "identity", position = "dodge") +
geom_bar(aes(y = Wild, fill = "Wild"), stat = "identity", position = "dodge") +
scale_fill_manual(values = c("Commercial" = "red", "Wild" = "blue")) +
labs(
title = "Top 10 Genes with Greatest Copy Number Differences",
x = "Genes",
y = "Average Copy Number",
fill = "Group"
) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, size = 8),
plot.title = element_text(size = 16, face = "bold")
)

# Binary Presence/Absence Matrix
binary_matrix <- as.data.frame(t(apply(heatmap_matrix, 1, function(x) as.numeric(x > 0))))
colnames(binary_matrix) <- colnames(heatmap_data)
rownames(binary_matrix) <- rownames(heatmap_data)
# UpSet Plot
upset(
binary_matrix,
sets.bar.color = "tomato",
main.bar.color = "steelblue",
mainbar.y.label = "Gene Overlap Count",
sets.x.label = "Species",
order.by = "freq",
text.scale = c(1.5, 1.2)
)
