Software

library(ggplot2)
library(dplyr)
library(tidyr)
library(knitr)

Data

## Every file below is read from (and written to) this working directory.
setwd("C:/Users/edlarsen/Documents/StructuralVariants")

## Chromosome names kept for the analysis: "1" through "38", plus "X".
valid_chromosomes <- c(paste(1:38), "X")

## CanFam 3.1: read the variant table, then drop rows on any other contig
vcf_Cfam3 <- read.csv(file = "PTCL_StructuralVariants_CanFam3.filtered.ann.csv")
vcf_Cfam3 <- vcf_Cfam3[is.element(vcf_Cfam3$X.CHROM, valid_chromosomes), ]

## CanFam 4: same treatment for the second alignment
vcf_Cfam4 <- read.csv(file = "PTCL_StructuralVariants_CanFam4.filtered.ann.csv")
vcf_Cfam4 <- vcf_Cfam4[is.element(vcf_Cfam4$X.CHROM, valid_chromosomes), ]

Number of variants per chromosome

# CanFam 3.1
# Count the variant records on each chromosome.
variant_counts_Cfam3 <- table(vcf_Cfam3$X.CHROM)
variant_df_Cfam3 <- as.data.frame(variant_counts_Cfam3)
colnames(variant_df_Cfam3) <- c("Chromosome", "VariantCount")

# table() sorts chromosome names as strings ("1", "10", "11", ..., "2"), which
# would scramble the x-axis. Re-level so the bars run 1-38 and then X, matching
# the ordering used for the by-type plots further down.
variant_df_Cfam3$Chromosome <- factor(
  as.character(variant_df_Cfam3$Chromosome),
  levels = valid_chromosomes
)

# geom_col() draws bar heights straight from VariantCount
# (equivalent to geom_bar(stat = "identity")).
print(
  ggplot(variant_df_Cfam3, aes(x = Chromosome, y = VariantCount)) +
    geom_col(fill = "steelblue") +
    theme_minimal() +
    labs(
      title = "Structural Variant Calls per Chromosome (CanFam 3.1)",
      x = "Chromosome",
      y = "Number of Variants"
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

# CanFam 4
# Count the variant records on each chromosome.
variant_counts_Cfam4 <- table(vcf_Cfam4$X.CHROM)
variant_df_Cfam4 <- as.data.frame(variant_counts_Cfam4)
colnames(variant_df_Cfam4) <- c("Chromosome", "VariantCount")

# table() sorts chromosome names as strings ("1", "10", "11", ..., "2"), which
# would scramble the x-axis. Re-level so the bars run 1-38 and then X, matching
# the ordering used for the by-type plots further down.
variant_df_Cfam4$Chromosome <- factor(
  as.character(variant_df_Cfam4$Chromosome),
  levels = valid_chromosomes
)

# geom_col() draws bar heights straight from VariantCount
# (equivalent to geom_bar(stat = "identity")).
print(
  ggplot(variant_df_Cfam4, aes(x = Chromosome, y = VariantCount)) +
    geom_col(fill = "steelblue") +
    theme_minimal() +
    labs(
      title = "Structural Variant Calls per Chromosome (CanFam4)",
      x = "Chromosome",
      y = "Number of Variants"
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

Types of variants by chromosome

## CanFam 3.1
# Extract the variant type (SVTYPE) from the INFO column.
# The regex runs over the whole column at once instead of one closure call per
# row; the result is always an unnamed character vector, and rows whose INFO has
# no "SVTYPE=<value>" entry are NA_character_.
vcf_Cfam3$SVTYPE <- local({
  info <- as.character(vcf_Cfam3$INFO)
  hit <- regexpr("SVTYPE=[^;]+", info)
  found <- !is.na(hit) & hit > 0L
  svtype <- rep(NA_character_, length(info))
  # regmatches() yields one string per matching row, in row order, so it lines
  # up with the TRUE positions of `found`
  svtype[found] <- sub("SVTYPE=", "", regmatches(info, hit))
  svtype
})

vcf_Cfam3 <- vcf_Cfam3[!is.na(vcf_Cfam3$SVTYPE), ] # remove NAs

# Summarize the counts of each variant type per chromosome.
# .groups = "drop" returns an ungrouped table without the grouped-output message;
# the chromosome column is renamed for better usability in plotting.
variant_summary_Cfam3 <- vcf_Cfam3 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n(), .groups = "drop") %>%
  rename(Chromosome = X.CHROM)

# Ensure the chromosome column is ordered correctly (1-38, then X)
variant_summary_Cfam3$Chromosome <- factor(
  variant_summary_Cfam3$Chromosome,
  levels = valid_chromosomes
)

# Plot the number of variants per chromosome by type (bars stacked by SVTYPE)
ggplot(variant_summary_Cfam3, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Structural Variants per Chromosome by Type (CanFam 3.1)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set2")

kable(variant_summary_Cfam3, caption = "Variant count per chromosome (CanFam 3.1)")

Variant count per chromosome (CanFam 3.1)
Chromosome	SVTYPE	Count
1	BND	156
1	DEL	2275
1	DUP	268
1	INS	3552
1	INV	25
10	BND	46
10	DEL	1269
10	DUP	141
10	INS	1828
10	INV	10
11	BND	91
11	DEL	1396
11	DUP	149
11	INS	2025
11	INV	11
12	BND	83
12	DEL	1684
12	DUP	180
12	INS	2594
12	INV	14
13	BND	56
13	DEL	1224
13	DUP	161
13	INS	1955
13	INV	7
14	BND	44
14	DEL	1213
14	DUP	118
14	INS	1841
14	INV	13
15	BND	61
15	DEL	1344
15	DUP	187
15	INS	2085
15	INV	4
16	BND	67
16	DEL	1320
16	DUP	161
16	INS	1927
16	INV	8
17	BND	56
17	DEL	1369
17	DUP	137
17	INS	2060
17	INV	13
18	BND	69
18	DEL	1199
18	DUP	142
18	INS	1710
18	INV	16
19	BND	71
19	DEL	1203
19	DUP	160
19	INS	2015
19	INV	17
2	BND	47
2	DEL	1587
2	DUP	163
2	INS	2478
2	INV	10
20	BND	60
20	DEL	1179
20	DUP	88
20	INS	1729
20	INV	12
21	BND	61
21	DEL	1173
21	DUP	126
21	INS	1702
21	INV	5
22	BND	92
22	DEL	1208
22	DUP	144
22	INS	1811
22	INV	11
23	BND	69
23	DEL	1056
23	DUP	115
23	INS	1761
23	INV	6
24	BND	40
24	DEL	970
24	DUP	78
24	INS	1475
24	INV	5
25	BND	68
25	DEL	977
25	DUP	135
25	INS	1501
25	INV	2
26	BND	53
26	DEL	1038
26	DUP	78
26	INS	1489
26	INV	9
27	BND	47
27	DEL	1132
27	DUP	92
27	INS	1826
27	INV	10
28	BND	47
28	DEL	734
28	DUP	65
28	INS	1010
28	INV	3
29	BND	45
29	DEL	979
29	DUP	117
29	INS	1496
29	INV	9
3	BND	90
3	DEL	1774
3	DUP	215
3	INS	2681
3	INV	8
30	BND	41
30	DEL	810
30	DUP	60
30	INS	1378
30	INV	6
31	BND	51
31	DEL	869
31	DUP	106
31	INS	1389
31	INV	2
32	BND	58
32	DEL	1114
32	DUP	84
32	INS	1541
32	INV	7
33	BND	49
33	DEL	781
33	DUP	128
33	INS	1222
33	INV	6
34	BND	49
34	DEL	915
34	DUP	97
34	INS	1324
34	INV	9
35	BND	23
35	DEL	648
35	DUP	74
35	INS	950
36	BND	19
36	DEL	650
36	DUP	67
36	INS	1040
36	INV	6
37	BND	20
37	DEL	598
37	DUP	64
37	INS	972
37	INV	4
38	BND	10
38	DEL	578
38	DUP	60
38	INS	849
38	INV	1
4	BND	119
4	DEL	1669
4	DUP	196
4	INS	2433
4	INV	7
5	BND	63
5	DEL	1589
5	DUP	181
5	INS	2416
5	INV	7
6	BND	55
6	DEL	1519
6	DUP	175
6	INS	2216
6	INV	9
7	BND	63
7	DEL	1476
7	DUP	163
7	INS	2313
7	INV	11
8	BND	175
8	DEL	1737
8	DUP	148
8	INS	2697
8	INV	16
9	BND	58
9	DEL	1271
9	DUP	113
9	INS	1933
9	INV	13
X	BND	218
X	DEL	1634
X	DUP	133
X	INS	2278
X	INV	18

# Tally the rows of vcf_Cfam3 for each SVTYPE value
total_variant_summary_Cfam3 <- ungroup(
  summarise(group_by(vcf_Cfam3, SVTYPE), Total_Count = n())
)

kable(
  x = total_variant_summary_Cfam3,
  caption = "Total variant count by type (CanFam 3.1)"
)

Total variant count by type (CanFam 3.1)
SVTYPE	Total_Count
BND	2590
DEL	47161
DUP	5069
INS	71502
INV	350

## CanFam4
# Extract the variant type (SVTYPE) from the INFO column.
# The regex runs over the whole column at once instead of one closure call per
# row; the result is always an unnamed character vector, and rows whose INFO has
# no "SVTYPE=<value>" entry are NA_character_.
vcf_Cfam4$SVTYPE <- local({
  info <- as.character(vcf_Cfam4$INFO)
  hit <- regexpr("SVTYPE=[^;]+", info)
  found <- !is.na(hit) & hit > 0L
  svtype <- rep(NA_character_, length(info))
  # regmatches() yields one string per matching row, in row order, so it lines
  # up with the TRUE positions of `found`
  svtype[found] <- sub("SVTYPE=", "", regmatches(info, hit))
  svtype
})

vcf_Cfam4 <- vcf_Cfam4[!is.na(vcf_Cfam4$SVTYPE), ] # remove NAs

# Summarize the counts of each variant type per chromosome.
# .groups = "drop" returns an ungrouped table without the grouped-output message;
# the chromosome column is renamed for better usability in plotting.
variant_summary_Cfam4 <- vcf_Cfam4 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n(), .groups = "drop") %>%
  rename(Chromosome = X.CHROM)

# Ensure the chromosome column is ordered correctly (1-38, then X)
variant_summary_Cfam4$Chromosome <- factor(
  variant_summary_Cfam4$Chromosome,
  levels = valid_chromosomes
)

# Plot the number of variants per chromosome by type (bars stacked by SVTYPE)
ggplot(variant_summary_Cfam4, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Structural Variants per Chromosome by Type (CanFam4)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set2")

kable(variant_summary_Cfam4, caption = "Variant count per chromosome (CanFam4)")

Variant count per chromosome (CanFam4)
Chromosome	SVTYPE	Count
1	BND	157
1	DEL	2401
1	DUP	268
1	INS	3386
1	INV	26
10	BND	48
10	DEL	1379
10	DUP	174
10	INS	1836
10	INV	11
11	BND	88
11	DEL	1475
11	DUP	141
11	INS	2044
11	INV	8
12	BND	78
12	DEL	1764
12	DUP	202
12	INS	2590
12	INV	12
13	BND	75
13	DEL	1304
13	DUP	155
13	INS	1895
13	INV	10
14	BND	61
14	DEL	1238
14	DUP	124
14	INS	1827
14	INV	10
15	BND	69
15	DEL	1397
15	DUP	192
15	INS	2044
15	INV	9
16	BND	56
16	DEL	1262
16	DUP	131
16	INS	1739
16	INV	8
17	BND	77
17	DEL	1372
17	DUP	131
17	INS	1979
17	INV	10
18	BND	108
18	DEL	1278
18	DUP	155
18	INS	1709
18	INV	18
19	BND	100
19	DEL	1371
19	DUP	195
19	INS	1956
19	INV	15
2	BND	69
2	DEL	1626
2	DUP	178
2	INS	2348
2	INV	12
20	BND	59
20	DEL	1181
20	DUP	111
20	INS	1741
20	INV	9
21	BND	45
21	DEL	1225
21	DUP	130
21	INS	1685
21	INV	6
22	BND	68
22	DEL	1286
22	DUP	161
22	INS	1738
22	INV	15
23	BND	72
23	DEL	1163
23	DUP	118
23	INS	1699
23	INV	8
24	BND	39
24	DEL	1006
24	DUP	83
24	INS	1475
24	INV	6
25	BND	38
25	DEL	1034
25	DUP	153
25	INS	1492
25	INV	4
26	BND	27
26	DEL	1032
26	DUP	102
26	INS	1382
26	INV	10
27	BND	44
27	DEL	1220
27	DUP	118
27	INS	1775
27	INV	7
28	BND	35
28	DEL	816
28	DUP	73
28	INS	971
28	INV	6
29	BND	48
29	DEL	1099
29	DUP	131
29	INS	1457
29	INV	5
3	BND	142
3	DEL	1814
3	DUP	228
3	INS	2727
3	INV	9
30	BND	34
30	DEL	926
30	DUP	87
30	INS	1311
30	INV	6
31	BND	27
31	DEL	1076
31	DUP	127
31	INS	1340
31	INV	5
32	BND	41
32	DEL	1239
32	DUP	116
32	INS	1570
32	INV	7
33	BND	31
33	DEL	993
33	DUP	159
33	INS	1158
33	INV	7
34	BND	45
34	DEL	934
34	DUP	125
34	INS	1368
34	INV	10
35	BND	39
35	DEL	780
35	DUP	104
35	INS	1069
35	INV	1
36	BND	33
36	DEL	744
36	DUP	110
36	INS	1037
36	INV	8
37	BND	20
37	DEL	648
37	DUP	74
37	INS	945
37	INV	3
38	BND	20
38	DEL	712
38	DUP	75
38	INS	850
38	INV	5
4	BND	128
4	DEL	1708
4	DUP	175
4	INS	2393
4	INV	11
5	BND	76
5	DEL	1584
5	DUP	192
5	INS	2337
5	INV	5
6	BND	65
6	DEL	1549
6	DUP	197
6	INS	2162
6	INV	6
7	BND	95
7	DEL	1558
7	DUP	151
7	INS	2289
7	INV	9
8	BND	110
8	DEL	1660
8	DUP	157
8	INS	2551
8	INV	18
9	BND	54
9	DEL	1144
9	DUP	106
9	INS	1925
9	INV	10
X	BND	246
X	DEL	1667
X	DUP	146
X	INS	2042
X	INV	23

# Tally the rows of vcf_Cfam4 for each SVTYPE value
total_variant_summary_Cfam4 <- ungroup(
  summarise(group_by(vcf_Cfam4, SVTYPE), Total_Count = n())
)

kable(
  x = total_variant_summary_Cfam4,
  caption = "Total variant count by type (CanFam4)"
)

Total variant count by type (CanFam4)
SVTYPE	Total_Count
BND	2667
DEL	49665
DUP	5555
INS	69842
INV	368

Identify shared fusion partners with RNA-seq data

## CanFam 3.1
# Load the RNA fusion call tables (one STAR-Fusion summary, one FusionCatcher result file)
star_rna_fusions <- read.csv(file = "CanFam31_StarFusionSummary.csv")
fc_rna_fusions <- read.csv(file = "fusionCatcher-results-ptcl-no-ctrls.csv")

# Pull the 3' and 5' partner gene columns out of each table
star_genes_3 <- star_rna_fusions$X3_prime_gene
star_genes_5 <- star_rna_fusions$X5_prime_gene
fc_genes_3 <- fc_rna_fusions$X3_prime_gene
fc_genes_5 <- fc_rna_fusions$X5_prime_gene

# De-duplicated list of every gene implicated by each caller
all_star_genes <- unique(c(star_genes_3, star_genes_5))
all_fc_genes <- unique(c(fc_genes_3, fc_genes_5))

# BND variant table whose INFO column is compared against those gene lists
bnd_vcf <- read.csv(file = "PTCL_BND_Variants.CanFam3.filtered.ann.bcsq.csv")

# Extract the gene name from the BCSQ entry of VCF INFO strings.
#
# For each element of `info`, locates the first "BCSQ=<...>" entry (everything
# up to the next ";") and returns its second "|"-separated field, which this
# analysis treats as the gene name.
#
# Args:
#   info: character vector (or factor) of INFO strings; length 0 or more.
#
# Returns:
#   An unnamed character vector the same length as `info`. Elements are
#   NA_character_ where the input is NA, has no BCSQ entry, or the BCSQ entry
#   has no second field.
#
# Vectorised and type-stable, so it can be called directly on a whole column
# (extract_gene(df$INFO)) as well as element-wise through sapply().
# NOTE(review): when a BCSQ entry holds more than one annotation, only the
# second field of the whole entry is returned - confirm that is intended.
extract_gene <- function(info) {
  info <- as.character(info)
  gene <- rep(NA_character_, length(info))

  # Find the BCSQ field
  bcsq_hit <- regexpr("BCSQ=[^;]+", info)
  found <- !is.na(bcsq_hit) & bcsq_hit > 0L

  if (any(found)) {
    # regmatches() returns only the matching elements, in input order
    bcsq_fields <- strsplit(regmatches(info, bcsq_hit), "|", fixed = TRUE)
    # Extract the gene name (second element after "|"); NA if absent
    gene[found] <- vapply(bcsq_fields, function(parts) parts[2], character(1))
  }

  gene
}

# Add a Gene column parsed from each record's INFO string
bnd_vcf <- mutate(bnd_vcf, Gene = sapply(INFO, extract_gene))

# Keep the BND records whose gene also appears in each RNA fusion gene list
shared_star_bnd_Cfam3 <- filter(bnd_vcf, Gene %in% all_star_genes)
shared_fc_bnd_Cfam3 <- filter(bnd_vcf, Gene %in% all_fc_genes)

# print summary: unique genes shared with the STAR-Fusion list
print(unique(shared_star_bnd_Cfam3[["Gene"]]))

##  [1] "PALM2AKAP2"         "DLA-64"             "ENSCAFG00000014478"
##  [4] "ENSCAFG00000048749" "FRG1"               "CPT1A"             
##  [7] "MITF"               "DOCK3"              "NBEA"              
## [10] "CPXM2"              "ST8SIA4"            "HIVEP1"            
## [13] "ATXN1"              "ENSCAFG00000043983" "UIMC1"             
## [16] "ENSCAFG00000047976" "ENSCAFG00000045242" "ENSCAFG00000031023"
## [19] "ENSCAFG00000043621" "ENSCAFG00000046168" "ENSCAFG00000043094"
## [22] "TRAV24"             "ENSCAFG00000043049" "ENSCAFG00000025128"
## [25] "ENSCAFG00000030258" "ENSCAFG00000028509" "ENSCAFG00000028642"
## [28] "ENSCAFG00000029953" "ENSCAFG00000029046" "ENSCAFG00000044010"
## [31] "STAG2"              "SH2D1A"

# unique genes shared with the FusionCatcher list
print(unique(shared_fc_bnd_Cfam3[["Gene"]]))

##  [1] "SERPINB5" "MYCT1"    "SAE1"     "KDM4C"    "DNAH8"    "KCNQ5"   
##  [7] "VPS13B"   "SND1"     "GATB"     "TRBV28"   "MAGI2"    "IGHMBP2" 
## [13] "CPT1A"    "INPP4B"   "VPS13D"   "MITF"     "NISCH"    "TBC1D4"  
## [19] "ANKRD28"  "SHLD1"    "TTC28"    "LCOR"     "ZFYVE27"  "ATE1"    
## [25] "CPXM2"    "RALYL"    "CERT1"    "TCF12"    "SETD4"    "GOLGB1"  
## [31] "UBE2E3"   "PDGFD"    "DYNC2H1"  "RABGAP1L" "PTPRM"    "RGS6"    
## [37] "ACACA"    "BCLAF3"

# Write both overlap tables to CSV in the working directory
utils::write.csv(
  x = shared_star_bnd_Cfam3,
  file = "shared_PacBio_STARFusion_calls.CanFam3.csv"
)
utils::write.csv(
  x = shared_fc_bnd_Cfam3,
  file = "shared_PacBio_FusionCatcher_calls.CanFam3.csv"
)

## CanFam4
# Load the RNA fusion call table for this alignment (replaces star_rna_fusions)
star_rna_fusions <- read.csv(file = "CanFam4_StarFusionSummary.csv")

# Pull the 3' and 5' partner gene columns, then de-duplicate the combined list
genes_3 <- star_rna_fusions$X3_prime_gene
genes_5 <- star_rna_fusions$X5_prime_gene
all_genes <- unique(c(genes_3, genes_5))

# BND variant table whose INFO column is compared against that gene list
# (replaces bnd_vcf)
bnd_vcf <- read.csv(file = "PTCL_BND_Variants.CanFam4.filtered.ann.bcsq.csv")

# Extract the gene name from the BCSQ entry of VCF INFO strings.
#
# For each element of `info`, locates the first "BCSQ=<...>" entry (everything
# up to the next ";") and returns its second "|"-separated field, which this
# analysis treats as the gene name.
#
# Args:
#   info: character vector (or factor) of INFO strings; length 0 or more.
#
# Returns:
#   An unnamed character vector the same length as `info`. Elements are
#   NA_character_ where the input is NA, has no BCSQ entry, or the BCSQ entry
#   has no second field.
#
# Vectorised and type-stable, so it can be called directly on a whole column
# (extract_gene(df$INFO)) as well as element-wise through sapply().
# NOTE(review): when a BCSQ entry holds more than one annotation, only the
# second field of the whole entry is returned - confirm that is intended.
extract_gene <- function(info) {
  info <- as.character(info)
  gene <- rep(NA_character_, length(info))

  # Find the BCSQ field
  bcsq_hit <- regexpr("BCSQ=[^;]+", info)
  found <- !is.na(bcsq_hit) & bcsq_hit > 0L

  if (any(found)) {
    # regmatches() returns only the matching elements, in input order
    bcsq_fields <- strsplit(regmatches(info, bcsq_hit), "|", fixed = TRUE)
    # Extract the gene name (second element after "|"); NA if absent
    gene[found] <- vapply(bcsq_fields, function(parts) parts[2], character(1))
  }

  gene
}

# Add a Gene column parsed from each record's INFO string
bnd_vcf <- mutate(bnd_vcf, Gene = sapply(INFO, extract_gene))

# Keep the BND records whose gene also appears in the RNA fusion gene list
shared_bnd_Cfam4 <- filter(bnd_vcf, Gene %in% all_genes)

# print summary: unique shared genes
print(unique(shared_bnd_Cfam4[["Gene"]]))

##  [1] "PALM2AKAP2"         "FBXW7"              "EXOC6B"            
##  [4] "ENSCAFG00805017629" "TSN"                "ENSCAFG00805002052"
##  [7] "TNFRSF1B"           "SUMF1"              "DOCK3"             
## [10] "ENSCAFG00805023894" "ENSCAFG00805023901" "PLEKHA5"           
## [13] "AEBP2"              "TMEM117"            "CPXM2"             
## [16] "ENSCAFG00805004162" "ST8SIA4"            "PDLIM5"            
## [19] "CASR"               "UIMC1"              "ENSCAFG00805008215"
## [22] "ENSCAFG00805007848" "ENSCAFG00805008376" "ENSCAFG00805008332"
## [25] "ENSCAFG00805008471" "ENSCAFG00805008844" "TRAV24"            
## [28] "ENSCAFG00805009255" "ENSCAFG00805009486" "ENSCAFG00805005681"
## [31] "STAG2"              "SH2D1A"

# Write the overlap table to CSV in the working directory
utils::write.csv(
  x = shared_bnd_Cfam4,
  file = "shared_PacBio_STARFusion_calls.CanFam4.csv"
)

Identify structural variants implicating cancer-associated genes

# Read the BCSQ-annotated variant tables for both alignments
# (files described as holding only variants with a BCSQ tag in the INFO field)
ann_vcf_Cfam3 <- read.csv(file = "PTCL_StructuralVariants_CanFam3.filtered.ann.bcsq.csv")
ann_vcf_Cfam4 <- read.csv(file = "PTCL_StructuralVariants_CanFam4.filtered.ann.bcsq.csv")

# Cancer-associated gene list from OncoKB; gene symbols are in Hugo.Symbol
oncokb <- read.csv(file = "cancerGeneList.csv")
oncokb_genes <- oncokb$Hugo.Symbol

# Extract the gene name from the BCSQ entry of VCF INFO strings.
#
# For each element of `info`, locates the first "BCSQ=<...>" entry (everything
# up to the next ";") and returns its second "|"-separated field, which this
# analysis treats as the gene name.
#
# Args:
#   info: character vector (or factor) of INFO strings; length 0 or more.
#
# Returns:
#   An unnamed character vector the same length as `info`. Elements are
#   NA_character_ where the input is NA, has no BCSQ entry, or the BCSQ entry
#   has no second field.
#
# Vectorised and type-stable, so it can be called directly on a whole column
# (extract_gene(df$INFO)) as well as element-wise through sapply().
# NOTE(review): when a BCSQ entry holds more than one annotation, only the
# second field of the whole entry is returned - confirm that is intended.
extract_gene <- function(info) {
  info <- as.character(info)
  gene <- rep(NA_character_, length(info))

  # Find the BCSQ field
  bcsq_hit <- regexpr("BCSQ=[^;]+", info)
  found <- !is.na(bcsq_hit) & bcsq_hit > 0L

  if (any(found)) {
    # regmatches() returns only the matching elements, in input order
    bcsq_fields <- strsplit(regmatches(info, bcsq_hit), "|", fixed = TRUE)
    # Extract the gene name (second element after "|"); NA if absent
    gene[found] <- vapply(bcsq_fields, function(parts) parts[2], character(1))
  }

  gene
}

# Annotate each alignment's variants with the gene parsed from INFO, then keep
# only the records whose gene is in the OncoKB list
onco_vcf_Cfam3 <- ann_vcf_Cfam3 %>%
  mutate(Gene = sapply(INFO, extract_gene)) %>%
  filter(Gene %in% oncokb_genes)

onco_vcf_Cfam4 <- ann_vcf_Cfam4 %>%
  mutate(Gene = sapply(INFO, extract_gene)) %>%
  filter(Gene %in% oncokb_genes)

# Write both filtered tables to CSV in the working directory
utils::write.csv(x = onco_vcf_Cfam3, file = "CanFam3_SVs_in_Cancer_Genes.csv")
utils::write.csv(x = onco_vcf_Cfam4, file = "CanFam4_SVs_in_Cancer_Genes.csv")

Identify shared variable genes between CanFam3.1 and CanFam4 alignments

# Extract BCSQ gene annotation and SVTYPE from INFO field and place in separate columns called 'Gene' and 'SVTYPE'
# Extract the gene name from the BCSQ entry of VCF INFO strings.
#
# For each element of `info`, locates the first "BCSQ=<...>" entry (everything
# up to the next ";") and returns its second "|"-separated field, which this
# analysis treats as the gene name.
#
# Args:
#   info: character vector (or factor) of INFO strings; length 0 or more.
#
# Returns:
#   An unnamed character vector the same length as `info`. Elements are
#   NA_character_ where the input is NA, has no BCSQ entry, or the BCSQ entry
#   has no second field.
#
# Vectorised and type-stable, so it can be called directly on a whole column
# (extract_gene(df$INFO)) as well as element-wise through sapply().
# NOTE(review): when a BCSQ entry holds more than one annotation, only the
# second field of the whole entry is returned - confirm that is intended.
extract_gene <- function(info) {
  info <- as.character(info)
  gene <- rep(NA_character_, length(info))

  # Find the BCSQ field
  bcsq_hit <- regexpr("BCSQ=[^;]+", info)
  found <- !is.na(bcsq_hit) & bcsq_hit > 0L

  if (any(found)) {
    # regmatches() returns only the matching elements, in input order
    bcsq_fields <- strsplit(regmatches(info, bcsq_hit), "|", fixed = TRUE)
    # Extract the gene name (second element after "|"); NA if absent
    gene[found] <- vapply(bcsq_fields, function(parts) parts[2], character(1))
  }

  gene
}

# Extract the structural-variant type from the SVTYPE entry of VCF INFO strings.
#
# For each element of `info`, locates the first "SVTYPE=<...>" entry
# (everything up to the next ";") and returns the text between the first "="
# and any following "=".
#
# Args:
#   info: character vector (or factor) of INFO strings; length 0 or more.
#
# Returns:
#   An unnamed character vector the same length as `info`. Elements are
#   NA_character_ where the input is NA or has no SVTYPE entry.
#
# Vectorised and type-stable, so it can be called directly on a whole column
# (extract_variant_type(df$INFO)) as well as element-wise through sapply().
extract_variant_type <- function(info) {
  info <- as.character(info)
  var_type <- rep(NA_character_, length(info))

  # Find the SVTYPE field
  svtype_hit <- regexpr("SVTYPE=[^;]+", info)
  found <- !is.na(svtype_hit) & svtype_hit > 0L

  if (any(found)) {
    # regmatches() returns only the matching elements, in input order
    svtype_fields <- strsplit(regmatches(info, svtype_hit), "=", fixed = TRUE)
    # Extract the variant type (first element after "=")
    var_type[found] <- vapply(
      svtype_fields,
      function(parts) parts[2],
      character(1)
    )
  }

  var_type
}

# Annotate each alignment's variants with the gene (BCSQ) and variant type
# (SVTYPE) parsed from INFO, and combine them into one "<SVTYPE>_<Gene>" key.
# USE.NAMES = FALSE stops sapply() from attaching every full INFO string as an
# element name on the new columns.
Cfam3_vcf_alt <- ann_vcf_Cfam3 %>%
  mutate(
    Gene = sapply(INFO, extract_gene, USE.NAMES = FALSE),
    SVTYPE = sapply(INFO, extract_variant_type, USE.NAMES = FALSE),
    SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
  )

Cfam4_vcf_alt <- ann_vcf_Cfam4 %>%
  mutate(
    Gene = sapply(INFO, extract_gene, USE.NAMES = FALSE),
    SVTYPE = sapply(INFO, extract_variant_type, USE.NAMES = FALSE),
    SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
  )

# Create vector of SVTYPE_Gene column contents
Cfam3_variantGenes <- unique(Cfam3_vcf_alt$SVTYPE_Gene)
Cfam4_variantGenes <- unique(Cfam4_vcf_alt$SVTYPE_Gene)

# Filter variant calls of each alignment to include only those of the same
# variant type in shared genes.
# paste() turns a missing Gene into the text "NA" inside the key (e.g.
# "DEL_NA"), so keys of gene-less records match across alignments; those records
# are excluded by testing Gene for missingness explicitly. (The previous
# `Gene != "NA"` comparison only excluded them because filter() drops rows
# whose condition evaluates to NA.)
Cfam3_shared <- Cfam3_vcf_alt %>%
  filter(!is.na(Gene), SVTYPE_Gene %in% Cfam4_variantGenes)

Cfam4_shared <- Cfam4_vcf_alt %>%
  filter(!is.na(Gene), SVTYPE_Gene %in% Cfam3_variantGenes)

# print summary of total number of shared variable genes
paste(
  "Number of genes with the same variant type called in CanFam3.1 and CanFam4: ",
  length(unique(Cfam3_shared$Gene)),
  sep = " "
)

## [1] "Number of genes with the same variant type called in CanFam3.1 and CanFam4:  4717"

# Write the shared-gene variant tables to CSV in the working directory
utils::write.csv(
  x = Cfam3_shared,
  file = "CanFam3_PacBio_Variants_in_Shared_Genes_with_CanFam4.csv"
)
utils::write.csv(
  x = Cfam4_shared,
  file = "CanFam4_PacBio_Variants_in_Shared_Genes_with_CanFam3.csv"
)

# Restrict the shared-gene tables to genes on the OncoKB list
Cfam3_shared_oncokb <- filter(Cfam3_shared, Gene %in% oncokb_genes)
Cfam4_shared_oncokb <- filter(Cfam4_shared, Gene %in% oncokb_genes)

# Report how many distinct OncoKB genes remain in the CanFam3.1 table
paste(
  "Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4:",
  length(unique(Cfam3_shared_oncokb[["Gene"]])),
  sep = " "
)

## [1] "Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4: 419"

# Write the OncoKB-restricted tables to CSV in the working directory
utils::write.csv(
  x = Cfam3_shared_oncokb,
  file = "CanFam3_PacBio_Variants_in_Shared_OncoKB_Genes_with_CanFam4.csv"
)
utils::write.csv(
  x = Cfam4_shared_oncokb,
  file = "CanFam4_PacBio_Variants_in_Shared_OncoKB_Genes_with_CanFam3.csv"
)

Citations

# Print the R version, platform, locale, and attached/loaded package versions
# used for this run
sessionInfo()

## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] knitr_1.49    tidyr_1.3.1   dplyr_1.1.4   ggplot2_3.5.1
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        cli_3.6.2          rlang_1.1.3        xfun_0.49         
##  [5] stringi_1.8.4      purrr_1.0.2        generics_0.1.3     jsonlite_1.8.9    
##  [9] labeling_0.4.3     glue_1.8.0         colorspace_2.1-1   htmltools_0.5.8.1 
## [13] sass_0.4.9         scales_1.3.0       rmarkdown_2.29     grid_4.4.0        
## [17] evaluate_1.0.3     munsell_0.5.1      jquerylib_0.1.4    tibble_3.2.1      
## [21] fastmap_1.2.0      yaml_2.3.10        lifecycle_1.0.4    stringr_1.5.1     
## [25] compiler_4.4.0     codetools_0.2-20   RColorBrewer_1.1-3 pkgconfig_2.0.3   
## [29] rstudioapi_0.17.1  farver_2.1.2       digest_0.6.35      R6_2.5.1          
## [33] tidyselect_1.2.1   pillar_1.10.1      magrittr_2.0.3     bslib_0.8.0       
## [37] withr_3.0.2        tools_4.4.0        gtable_0.3.6       cachem_1.1.0

# Print the recommended citation for R itself
citation()

## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.

Summarize VCF

Eileen Owens

2025-02-14

Software

Data

Number of variants per chromosome

Types of variants by chromosome

Identify shared fusion partners with RNA-seq data

Identify structural variants implicating cancer-associated genes

Identify shared variable genes between CanFam3.1 and CanFam4 alignments

Citations