Software
library(ggplot2)
library(dplyr)
library(tidyr)
library(knitr)
Data
## CanFam 3.1
setwd("C:/Users/edlarsen/Documents/StructuralVariants")
vcf_Cfam3 <- read.csv("PTCL_StructuralVariants_CanFam3.filtered.ann.csv")
valid_chromosomes <- c(as.character(1:38), "X")
vcf_Cfam3 <- vcf_Cfam3[vcf_Cfam3$X.CHROM %in% valid_chromosomes, ]
## CanFam 4
vcf_Cfam4 <- read.csv("PTCL_StructuralVariants_CanFam4.filtered.ann.csv")
vcf_Cfam4 <- vcf_Cfam4[vcf_Cfam4$X.CHROM %in% valid_chromosomes, ]
Number of variants per chromosome
# CanFam 3.1
variant_counts_Cfam3 <- table(vcf_Cfam3$X.CHROM)
variant_df_Cfam3 <- as.data.frame(variant_counts_Cfam3)
colnames(variant_df_Cfam3) <- c("Chromosome", "VariantCount")
print(ggplot(variant_df_Cfam3, aes(x = Chromosome, y = VariantCount)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(
title = "Structural Variant Calls per Chromosome (CanFam 3.1)",
x = "Chromosome",
y = "Number of Variants"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)))

# CanFam 4
variant_counts_Cfam4 <- table(vcf_Cfam4$X.CHROM)
variant_df_Cfam4 <- as.data.frame(variant_counts_Cfam4)
colnames(variant_df_Cfam4) <- c("Chromosome", "VariantCount")
print(ggplot(variant_df_Cfam4, aes(x = Chromosome, y = VariantCount)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(
title = "Structural Variant Calls per Chromosome (CanFam4)",
x = "Chromosome",
y = "Number of Variants"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)))

Types of variants by chromosome
## CanFam 3.1
# Extract the variant type (SVTYPE) from the INFO column
vcf_Cfam3$SVTYPE <- sapply(vcf_Cfam3$INFO, function(info) {
match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
if (length(match) > 0) {
sub("SVTYPE=", "", match)
} else {
NA
}
})
vcf_Cfam3 <- vcf_Cfam3[!is.na(vcf_Cfam3$SVTYPE),] # remove NAs
# Summarize the counts of each variant type per chromosome
variant_summary_Cfam3 <- vcf_Cfam3 %>%
group_by(X.CHROM, SVTYPE) %>%
summarise(Count = n()) %>%
ungroup()
# Rename chromosome column for better usability in plotting
colnames(variant_summary_Cfam3)[1] <- "Chromosome"
# Ensure the chromosome column is ordered correctly
variant_summary_Cfam3$Chromosome <- factor(variant_summary_Cfam3$Chromosome, levels = c(as.character(1:38), "X"))
# Plot the number of variants per chromosome by type
ggplot(variant_summary_Cfam3, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Structural Variants per Chromosome by Type (CanFam 3.1)",
x = "Chromosome",
y = "Number of Variants",
fill = "Variant Type"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_brewer(palette = "Set2")

kable(variant_summary_Cfam3, caption = "Variant count per chromosome (CanFam 3.1)")
Variant count per chromosome (CanFam 3.1)
1 |
BND |
156 |
1 |
DEL |
2275 |
1 |
DUP |
268 |
1 |
INS |
3552 |
1 |
INV |
25 |
10 |
BND |
46 |
10 |
DEL |
1269 |
10 |
DUP |
141 |
10 |
INS |
1828 |
10 |
INV |
10 |
11 |
BND |
91 |
11 |
DEL |
1396 |
11 |
DUP |
149 |
11 |
INS |
2025 |
11 |
INV |
11 |
12 |
BND |
83 |
12 |
DEL |
1684 |
12 |
DUP |
180 |
12 |
INS |
2594 |
12 |
INV |
14 |
13 |
BND |
56 |
13 |
DEL |
1224 |
13 |
DUP |
161 |
13 |
INS |
1955 |
13 |
INV |
7 |
14 |
BND |
44 |
14 |
DEL |
1213 |
14 |
DUP |
118 |
14 |
INS |
1841 |
14 |
INV |
13 |
15 |
BND |
61 |
15 |
DEL |
1344 |
15 |
DUP |
187 |
15 |
INS |
2085 |
15 |
INV |
4 |
16 |
BND |
67 |
16 |
DEL |
1320 |
16 |
DUP |
161 |
16 |
INS |
1927 |
16 |
INV |
8 |
17 |
BND |
56 |
17 |
DEL |
1369 |
17 |
DUP |
137 |
17 |
INS |
2060 |
17 |
INV |
13 |
18 |
BND |
69 |
18 |
DEL |
1199 |
18 |
DUP |
142 |
18 |
INS |
1710 |
18 |
INV |
16 |
19 |
BND |
71 |
19 |
DEL |
1203 |
19 |
DUP |
160 |
19 |
INS |
2015 |
19 |
INV |
17 |
2 |
BND |
47 |
2 |
DEL |
1587 |
2 |
DUP |
163 |
2 |
INS |
2478 |
2 |
INV |
10 |
20 |
BND |
60 |
20 |
DEL |
1179 |
20 |
DUP |
88 |
20 |
INS |
1729 |
20 |
INV |
12 |
21 |
BND |
61 |
21 |
DEL |
1173 |
21 |
DUP |
126 |
21 |
INS |
1702 |
21 |
INV |
5 |
22 |
BND |
92 |
22 |
DEL |
1208 |
22 |
DUP |
144 |
22 |
INS |
1811 |
22 |
INV |
11 |
23 |
BND |
69 |
23 |
DEL |
1056 |
23 |
DUP |
115 |
23 |
INS |
1761 |
23 |
INV |
6 |
24 |
BND |
40 |
24 |
DEL |
970 |
24 |
DUP |
78 |
24 |
INS |
1475 |
24 |
INV |
5 |
25 |
BND |
68 |
25 |
DEL |
977 |
25 |
DUP |
135 |
25 |
INS |
1501 |
25 |
INV |
2 |
26 |
BND |
53 |
26 |
DEL |
1038 |
26 |
DUP |
78 |
26 |
INS |
1489 |
26 |
INV |
9 |
27 |
BND |
47 |
27 |
DEL |
1132 |
27 |
DUP |
92 |
27 |
INS |
1826 |
27 |
INV |
10 |
28 |
BND |
47 |
28 |
DEL |
734 |
28 |
DUP |
65 |
28 |
INS |
1010 |
28 |
INV |
3 |
29 |
BND |
45 |
29 |
DEL |
979 |
29 |
DUP |
117 |
29 |
INS |
1496 |
29 |
INV |
9 |
3 |
BND |
90 |
3 |
DEL |
1774 |
3 |
DUP |
215 |
3 |
INS |
2681 |
3 |
INV |
8 |
30 |
BND |
41 |
30 |
DEL |
810 |
30 |
DUP |
60 |
30 |
INS |
1378 |
30 |
INV |
6 |
31 |
BND |
51 |
31 |
DEL |
869 |
31 |
DUP |
106 |
31 |
INS |
1389 |
31 |
INV |
2 |
32 |
BND |
58 |
32 |
DEL |
1114 |
32 |
DUP |
84 |
32 |
INS |
1541 |
32 |
INV |
7 |
33 |
BND |
49 |
33 |
DEL |
781 |
33 |
DUP |
128 |
33 |
INS |
1222 |
33 |
INV |
6 |
34 |
BND |
49 |
34 |
DEL |
915 |
34 |
DUP |
97 |
34 |
INS |
1324 |
34 |
INV |
9 |
35 |
BND |
23 |
35 |
DEL |
648 |
35 |
DUP |
74 |
35 |
INS |
950 |
36 |
BND |
19 |
36 |
DEL |
650 |
36 |
DUP |
67 |
36 |
INS |
1040 |
36 |
INV |
6 |
37 |
BND |
20 |
37 |
DEL |
598 |
37 |
DUP |
64 |
37 |
INS |
972 |
37 |
INV |
4 |
38 |
BND |
10 |
38 |
DEL |
578 |
38 |
DUP |
60 |
38 |
INS |
849 |
38 |
INV |
1 |
4 |
BND |
119 |
4 |
DEL |
1669 |
4 |
DUP |
196 |
4 |
INS |
2433 |
4 |
INV |
7 |
5 |
BND |
63 |
5 |
DEL |
1589 |
5 |
DUP |
181 |
5 |
INS |
2416 |
5 |
INV |
7 |
6 |
BND |
55 |
6 |
DEL |
1519 |
6 |
DUP |
175 |
6 |
INS |
2216 |
6 |
INV |
9 |
7 |
BND |
63 |
7 |
DEL |
1476 |
7 |
DUP |
163 |
7 |
INS |
2313 |
7 |
INV |
11 |
8 |
BND |
175 |
8 |
DEL |
1737 |
8 |
DUP |
148 |
8 |
INS |
2697 |
8 |
INV |
16 |
9 |
BND |
58 |
9 |
DEL |
1271 |
9 |
DUP |
113 |
9 |
INS |
1933 |
9 |
INV |
13 |
X |
BND |
218 |
X |
DEL |
1634 |
X |
DUP |
133 |
X |
INS |
2278 |
X |
INV |
18 |
total_variant_summary_Cfam3 <- vcf_Cfam3 %>%
group_by(SVTYPE) %>%
summarise(Total_Count = n()) %>%
ungroup()
kable(total_variant_summary_Cfam3, caption = "Total variant count by type (CanFam 3.1)")
Total variant count by type (CanFam 3.1)
BND |
2590 |
DEL |
47161 |
DUP |
5069 |
INS |
71502 |
INV |
350 |
## CanFam4
# Extract the variant type (SVTYPE) from the INFO column
vcf_Cfam4$SVTYPE <- sapply(vcf_Cfam4$INFO, function(info) {
match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
if (length(match) > 0) {
sub("SVTYPE=", "", match)
} else {
NA
}
})
vcf_Cfam4 <- vcf_Cfam4[!is.na(vcf_Cfam4$SVTYPE),] # remove NAs
# Summarize the counts of each variant type per chromosome
variant_summary_Cfam4 <- vcf_Cfam4 %>%
group_by(X.CHROM, SVTYPE) %>%
summarise(Count = n()) %>%
ungroup()
# Rename chromosome column for better usability in plotting
colnames(variant_summary_Cfam4)[1] <- "Chromosome"
# Ensure the chromosome column is ordered correctly
variant_summary_Cfam4$Chromosome <- factor(variant_summary_Cfam4$Chromosome, levels = c(as.character(1:38), "X"))
# Plot the number of variants per chromosome by type
ggplot(variant_summary_Cfam4, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Structural Variants per Chromosome by Type (CanFam4)",
x = "Chromosome",
y = "Number of Variants",
fill = "Variant Type"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_brewer(palette = "Set2")

kable(variant_summary_Cfam4, caption = "Variant count per chromosome (CanFam4)")
Variant count per chromosome (CanFam4)
1 |
BND |
157 |
1 |
DEL |
2401 |
1 |
DUP |
268 |
1 |
INS |
3386 |
1 |
INV |
26 |
10 |
BND |
48 |
10 |
DEL |
1379 |
10 |
DUP |
174 |
10 |
INS |
1836 |
10 |
INV |
11 |
11 |
BND |
88 |
11 |
DEL |
1475 |
11 |
DUP |
141 |
11 |
INS |
2044 |
11 |
INV |
8 |
12 |
BND |
78 |
12 |
DEL |
1764 |
12 |
DUP |
202 |
12 |
INS |
2590 |
12 |
INV |
12 |
13 |
BND |
75 |
13 |
DEL |
1304 |
13 |
DUP |
155 |
13 |
INS |
1895 |
13 |
INV |
10 |
14 |
BND |
61 |
14 |
DEL |
1238 |
14 |
DUP |
124 |
14 |
INS |
1827 |
14 |
INV |
10 |
15 |
BND |
69 |
15 |
DEL |
1397 |
15 |
DUP |
192 |
15 |
INS |
2044 |
15 |
INV |
9 |
16 |
BND |
56 |
16 |
DEL |
1262 |
16 |
DUP |
131 |
16 |
INS |
1739 |
16 |
INV |
8 |
17 |
BND |
77 |
17 |
DEL |
1372 |
17 |
DUP |
131 |
17 |
INS |
1979 |
17 |
INV |
10 |
18 |
BND |
108 |
18 |
DEL |
1278 |
18 |
DUP |
155 |
18 |
INS |
1709 |
18 |
INV |
18 |
19 |
BND |
100 |
19 |
DEL |
1371 |
19 |
DUP |
195 |
19 |
INS |
1956 |
19 |
INV |
15 |
2 |
BND |
69 |
2 |
DEL |
1626 |
2 |
DUP |
178 |
2 |
INS |
2348 |
2 |
INV |
12 |
20 |
BND |
59 |
20 |
DEL |
1181 |
20 |
DUP |
111 |
20 |
INS |
1741 |
20 |
INV |
9 |
21 |
BND |
45 |
21 |
DEL |
1225 |
21 |
DUP |
130 |
21 |
INS |
1685 |
21 |
INV |
6 |
22 |
BND |
68 |
22 |
DEL |
1286 |
22 |
DUP |
161 |
22 |
INS |
1738 |
22 |
INV |
15 |
23 |
BND |
72 |
23 |
DEL |
1163 |
23 |
DUP |
118 |
23 |
INS |
1699 |
23 |
INV |
8 |
24 |
BND |
39 |
24 |
DEL |
1006 |
24 |
DUP |
83 |
24 |
INS |
1475 |
24 |
INV |
6 |
25 |
BND |
38 |
25 |
DEL |
1034 |
25 |
DUP |
153 |
25 |
INS |
1492 |
25 |
INV |
4 |
26 |
BND |
27 |
26 |
DEL |
1032 |
26 |
DUP |
102 |
26 |
INS |
1382 |
26 |
INV |
10 |
27 |
BND |
44 |
27 |
DEL |
1220 |
27 |
DUP |
118 |
27 |
INS |
1775 |
27 |
INV |
7 |
28 |
BND |
35 |
28 |
DEL |
816 |
28 |
DUP |
73 |
28 |
INS |
971 |
28 |
INV |
6 |
29 |
BND |
48 |
29 |
DEL |
1099 |
29 |
DUP |
131 |
29 |
INS |
1457 |
29 |
INV |
5 |
3 |
BND |
142 |
3 |
DEL |
1814 |
3 |
DUP |
228 |
3 |
INS |
2727 |
3 |
INV |
9 |
30 |
BND |
34 |
30 |
DEL |
926 |
30 |
DUP |
87 |
30 |
INS |
1311 |
30 |
INV |
6 |
31 |
BND |
27 |
31 |
DEL |
1076 |
31 |
DUP |
127 |
31 |
INS |
1340 |
31 |
INV |
5 |
32 |
BND |
41 |
32 |
DEL |
1239 |
32 |
DUP |
116 |
32 |
INS |
1570 |
32 |
INV |
7 |
33 |
BND |
31 |
33 |
DEL |
993 |
33 |
DUP |
159 |
33 |
INS |
1158 |
33 |
INV |
7 |
34 |
BND |
45 |
34 |
DEL |
934 |
34 |
DUP |
125 |
34 |
INS |
1368 |
34 |
INV |
10 |
35 |
BND |
39 |
35 |
DEL |
780 |
35 |
DUP |
104 |
35 |
INS |
1069 |
35 |
INV |
1 |
36 |
BND |
33 |
36 |
DEL |
744 |
36 |
DUP |
110 |
36 |
INS |
1037 |
36 |
INV |
8 |
37 |
BND |
20 |
37 |
DEL |
648 |
37 |
DUP |
74 |
37 |
INS |
945 |
37 |
INV |
3 |
38 |
BND |
20 |
38 |
DEL |
712 |
38 |
DUP |
75 |
38 |
INS |
850 |
38 |
INV |
5 |
4 |
BND |
128 |
4 |
DEL |
1708 |
4 |
DUP |
175 |
4 |
INS |
2393 |
4 |
INV |
11 |
5 |
BND |
76 |
5 |
DEL |
1584 |
5 |
DUP |
192 |
5 |
INS |
2337 |
5 |
INV |
5 |
6 |
BND |
65 |
6 |
DEL |
1549 |
6 |
DUP |
197 |
6 |
INS |
2162 |
6 |
INV |
6 |
7 |
BND |
95 |
7 |
DEL |
1558 |
7 |
DUP |
151 |
7 |
INS |
2289 |
7 |
INV |
9 |
8 |
BND |
110 |
8 |
DEL |
1660 |
8 |
DUP |
157 |
8 |
INS |
2551 |
8 |
INV |
18 |
9 |
BND |
54 |
9 |
DEL |
1144 |
9 |
DUP |
106 |
9 |
INS |
1925 |
9 |
INV |
10 |
X |
BND |
246 |
X |
DEL |
1667 |
X |
DUP |
146 |
X |
INS |
2042 |
X |
INV |
23 |
total_variant_summary_Cfam4 <- vcf_Cfam4 %>%
group_by(SVTYPE) %>%
summarise(Total_Count = n()) %>%
ungroup()
kable(total_variant_summary_Cfam4, caption = "Total variant count by type (CanFam4)")
Total variant count by type (CanFam4)
BND |
2667 |
DEL |
49665 |
DUP |
5555 |
INS |
69842 |
INV |
368 |
Identify shared fusion partners with RNA-seq data
## CanFam 3.1
# read in csv file(s) of RNA fusion calls
star_rna_fusions <- read.csv("CanFam31_StarFusionSummary.csv")
fc_rna_fusions <- read.csv("fusionCatcher-results-ptcl-no-ctrls.csv")
# get list of implicated genes
star_genes_3 <- star_rna_fusions$X3_prime_gene
star_genes_5 <- star_rna_fusions$X5_prime_gene
fc_genes_3 <- fc_rna_fusions$X3_prime_gene
fc_genes_5 <- fc_rna_fusions$X5_prime_gene
all_star_genes <- c(star_genes_3, star_genes_5)
all_star_genes <- unique(all_star_genes)
all_fc_genes <- c(fc_genes_3, fc_genes_5)
all_fc_genes <- unique(all_fc_genes)
# compare to genes in INFO column of BND variant file
bnd_vcf <- read.csv("PTCL_BND_Variants.CanFam3.filtered.ann.bcsq.csv")
extract_gene <- function(info) {
# Find the BCSQ field
bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
# Extract the gene name (second element after "|")
if (!is.na(bcsc_field)) {
gene <- strsplit(bcsc_field, "\\|")[[1]][2]
return(gene)
}
return(NA)
}
bnd_vcf <- bnd_vcf %>%
mutate(Gene = sapply(INFO, extract_gene))
shared_star_bnd_Cfam3 <- bnd_vcf %>%
filter(Gene %in% all_star_genes)
shared_fc_bnd_Cfam3 <- bnd_vcf %>%
filter(Gene %in% all_fc_genes)
# print summary
print(unique(shared_star_bnd_Cfam3$Gene))
## [1] "PALM2AKAP2" "DLA-64" "ENSCAFG00000014478"
## [4] "ENSCAFG00000048749" "FRG1" "CPT1A"
## [7] "MITF" "DOCK3" "NBEA"
## [10] "CPXM2" "ST8SIA4" "HIVEP1"
## [13] "ATXN1" "ENSCAFG00000043983" "UIMC1"
## [16] "ENSCAFG00000047976" "ENSCAFG00000045242" "ENSCAFG00000031023"
## [19] "ENSCAFG00000043621" "ENSCAFG00000046168" "ENSCAFG00000043094"
## [22] "TRAV24" "ENSCAFG00000043049" "ENSCAFG00000025128"
## [25] "ENSCAFG00000030258" "ENSCAFG00000028509" "ENSCAFG00000028642"
## [28] "ENSCAFG00000029953" "ENSCAFG00000029046" "ENSCAFG00000044010"
## [31] "STAG2" "SH2D1A"
print(unique(shared_fc_bnd_Cfam3$Gene))
## [1] "SERPINB5" "MYCT1" "SAE1" "KDM4C" "DNAH8" "KCNQ5"
## [7] "VPS13B" "SND1" "GATB" "TRBV28" "MAGI2" "IGHMBP2"
## [13] "CPT1A" "INPP4B" "VPS13D" "MITF" "NISCH" "TBC1D4"
## [19] "ANKRD28" "SHLD1" "TTC28" "LCOR" "ZFYVE27" "ATE1"
## [25] "CPXM2" "RALYL" "CERT1" "TCF12" "SETD4" "GOLGB1"
## [31] "UBE2E3" "PDGFD" "DYNC2H1" "RABGAP1L" "PTPRM" "RGS6"
## [37] "ACACA" "BCLAF3"
# export
write.csv(shared_star_bnd_Cfam3, file = "shared_PacBio_STARFusion_calls.CanFam3.csv")
write.csv(shared_fc_bnd_Cfam3, file = "shared_PacBio_FusionCatcher_calls.CanFam3.csv")
## CanFam4
# read in csv file of RNA fusion calls
star_rna_fusions <- read.csv("CanFam4_StarFusionSummary.csv")
# get list of implicated genes
genes_3 <- star_rna_fusions$X3_prime_gene
genes_5 <- star_rna_fusions$X5_prime_gene
all_genes <- c(genes_3, genes_5)
all_genes <- unique(all_genes)
# compare to genes in INFO column of BND variant file
bnd_vcf <- read.csv("PTCL_BND_Variants.CanFam4.filtered.ann.bcsq.csv")
extract_gene <- function(info) {
# Find the BCSQ field
bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
# Extract the gene name (second element after "|")
if (!is.na(bcsc_field)) {
gene <- strsplit(bcsc_field, "\\|")[[1]][2]
return(gene)
}
return(NA)
}
bnd_vcf <- bnd_vcf %>%
mutate(Gene = sapply(INFO, extract_gene))
shared_bnd_Cfam4 <- bnd_vcf %>%
filter(Gene %in% all_genes)
# print summary
print(unique(shared_bnd_Cfam4$Gene))
## [1] "PALM2AKAP2" "FBXW7" "EXOC6B"
## [4] "ENSCAFG00805017629" "TSN" "ENSCAFG00805002052"
## [7] "TNFRSF1B" "SUMF1" "DOCK3"
## [10] "ENSCAFG00805023894" "ENSCAFG00805023901" "PLEKHA5"
## [13] "AEBP2" "TMEM117" "CPXM2"
## [16] "ENSCAFG00805004162" "ST8SIA4" "PDLIM5"
## [19] "CASR" "UIMC1" "ENSCAFG00805008215"
## [22] "ENSCAFG00805007848" "ENSCAFG00805008376" "ENSCAFG00805008332"
## [25] "ENSCAFG00805008471" "ENSCAFG00805008844" "TRAV24"
## [28] "ENSCAFG00805009255" "ENSCAFG00805009486" "ENSCAFG00805005681"
## [31] "STAG2" "SH2D1A"
# export
write.csv(shared_bnd_Cfam4, file = "shared_PacBio_STARFusion_calls.CanFam4.csv")
Identify structural variants implicating cancer-associated
genes
# import vcf file of only variants with a BCSQ tag in the INFO field
ann_vcf_Cfam3 <- read.csv("PTCL_StructuralVariants_CanFam3.filtered.ann.bcsq.csv")
ann_vcf_Cfam4 <- read.csv("PTCL_StructuralVariants_CanFam4.filtered.ann.bcsq.csv")
# list of cancer-associated genes from OncoKB
oncokb <- read.csv("cancerGeneList.csv")
oncokb_genes <- oncokb$Hugo.Symbol
extract_gene <- function(info) {
# Find the BCSQ field
bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
# Extract the gene name (second element after "|")
if (!is.na(bcsc_field)) {
gene <- strsplit(bcsc_field, "\\|")[[1]][2]
return(gene)
}
return(NA)
}
onco_vcf_Cfam3 <- ann_vcf_Cfam3 %>%
mutate(Gene = sapply(INFO, extract_gene))
onco_vcf_Cfam3 <- onco_vcf_Cfam3 %>%
filter(Gene %in% oncokb_genes)
onco_vcf_Cfam4 <- ann_vcf_Cfam4 %>%
mutate(Gene = sapply(INFO, extract_gene))
onco_vcf_Cfam4 <- onco_vcf_Cfam4 %>%
filter(Gene %in% oncokb_genes)
# export
write.csv(onco_vcf_Cfam3, file = "CanFam3_SVs_in_Cancer_Genes.csv")
write.csv(onco_vcf_Cfam4, file = "CanFam4_SVs_in_Cancer_Genes.csv")
Identify shared variable genes between CanFam3.1 and CanFam4
alignments
# Extract BCSQ gene annotation and SVTYPE from INFO field and place in separate columns called 'Gene' and 'SVTYPE'
extract_gene <- function(info) {
# Find the BCSQ field
bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
# Extract the gene name (second element after "|")
if (!is.na(bcsc_field)) {
gene <- strsplit(bcsc_field, "\\|")[[1]][2]
return(gene)
}
return(NA)
}
extract_variant_type <- function(info) {
# Find the SVTYPE field
svtype_field <- stringr::str_extract(info, "SVTYPE=[^;]+")
# Extract the variant type (first element after "=")
if (!is.na(svtype_field)) {
var_type <- strsplit(svtype_field, "=")[[1]][2]
return(var_type)
}
return(NA)
}
Cfam3_vcf_alt <- ann_vcf_Cfam3 %>%
mutate(Gene = sapply(INFO, extract_gene))
Cfam3_vcf_alt <- Cfam3_vcf_alt %>%
mutate(SVTYPE = sapply(INFO, extract_variant_type))
Cfam4_vcf_alt <- ann_vcf_Cfam4 %>%
mutate(Gene = sapply(INFO, extract_gene))
Cfam4_vcf_alt <- Cfam4_vcf_alt %>%
mutate(SVTYPE = sapply(INFO, extract_variant_type))
# Combine Gene and SVTYPE info into one column
Cfam3_vcf_alt <- Cfam3_vcf_alt %>%
mutate(
SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
)
Cfam4_vcf_alt <- Cfam4_vcf_alt %>%
mutate(
SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
)
# Create vector of SVTYPE_Gene column contents
Cfam3_variantGenes <- unique(Cfam3_vcf_alt$SVTYPE_Gene)
Cfam4_variantGenes <- unique(Cfam4_vcf_alt$SVTYPE_Gene)
# Filter variant calls of each alignment to include only those of the same variant type in shared genes
Cfam3_shared <- Cfam3_vcf_alt %>%
filter(SVTYPE_Gene %in% Cfam4_variantGenes) %>%
filter(Gene != "NA")
Cfam4_shared <- Cfam4_vcf_alt %>%
filter(SVTYPE_Gene %in% Cfam3_variantGenes) %>%
filter(Gene != "NA")
# print summary of total number of shared variable genes
paste("Number of genes with the same variant type called in CanFam3.1 and CanFam4: ", length(unique(Cfam3_shared$Gene)), sep = " ")
## [1] "Number of genes with the same variant type called in CanFam3.1 and CanFam4: 4717"
# export
write.csv(Cfam3_shared, file = "CanFam3_PacBio_Variants_in_Shared_Genes_with_CanFam4.csv")
write.csv(Cfam4_shared, file = "CanFam4_PacBio_Variants_in_Shared_Genes_with_CanFam3.csv")
# Limit to OncoKB cancer-associated genes
Cfam3_shared_oncokb <- Cfam3_shared %>%
filter(Gene %in% oncokb_genes)
Cfam4_shared_oncokb <- Cfam4_shared %>%
filter(Gene %in% oncokb_genes)
paste("Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4:", length(unique(Cfam3_shared_oncokb$Gene)), sep = " ")
## [1] "Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4: 419"
# export
write.csv(Cfam3_shared_oncokb, file = "CanFam3_PacBio_Variants_in_Shared_OncoKB_Genes_with_CanFam4.csv")
write.csv(Cfam4_shared_oncokb, file = "CanFam4_PacBio_Variants_in_Shared_OncoKB_Genes_with_CanFam3.csv")
Citations
sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.49 tidyr_1.3.1 dplyr_1.1.4 ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.5 cli_3.6.2 rlang_1.1.3 xfun_0.49
## [5] stringi_1.8.4 purrr_1.0.2 generics_0.1.3 jsonlite_1.8.9
## [9] labeling_0.4.3 glue_1.8.0 colorspace_2.1-1 htmltools_0.5.8.1
## [13] sass_0.4.9 scales_1.3.0 rmarkdown_2.29 grid_4.4.0
## [17] evaluate_1.0.3 munsell_0.5.1 jquerylib_0.1.4 tibble_3.2.1
## [21] fastmap_1.2.0 yaml_2.3.10 lifecycle_1.0.4 stringr_1.5.1
## [25] compiler_4.4.0 codetools_0.2-20 RColorBrewer_1.1-3 pkgconfig_2.0.3
## [29] rstudioapi_0.17.1 farver_2.1.2 digest_0.6.35 R6_2.5.1
## [33] tidyselect_1.2.1 pillar_1.10.1 magrittr_2.0.3 bslib_0.8.0
## [37] withr_3.0.2 tools_4.4.0 gtable_0.3.6 cachem_1.1.0
citation()
## To cite R in publications use:
##
## R Core Team (2024). _R: A Language and Environment for Statistical
## Computing_. R Foundation for Statistical Computing, Vienna, Austria.
## <https://www.R-project.org/>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2024},
## url = {https://www.R-project.org/},
## }
##
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.