library(tidyverse)
library(dplyr)
library(ggplot2)
library(gsubfn)
library("yarrr")
library("knitr")
## Load the raw export; text columns stay character so they can be cleaned below.
GDCTF <- read.csv("~/Desktop/GDCAnalysis/GDCTF.csv", stringsAsFactors = FALSE)

## Build the cleaned table: readable column names first, then numeric values.
newGDCTF <- GDCTF %>%
  ## rename the fields
  rename(
    SSMAffectedCasesInCohort = X..SSM.Affected.Cases.in.Cohort,
    SSMAffectedCasesAcrossGDC = X..SSM.Affected.Cases.Across.the.GDC,
    CNVGain = X..CNV.Gain,
    CNVLoss = X..CNV.Loss,
    Mutations = X..Mutations
  ) %>%
  mutate(
    ## drop the "," separators, then parse the count
    Mutations = as.numeric(gsub(",", "", Mutations)),
    ## keep the percentage only: strip everything up to the last "(" and
    ## the trailing "%)" before converting to a number
    SSMAffectedCasesInCohort = as.numeric(
      gsub("%\\)", "", gsub(".+\\(", "", SSMAffectedCasesInCohort))
    ),
    CNVLoss = as.numeric(
      gsub("%\\)", "", gsub(".+\\(", "", CNVLoss))
    ),
    CNVGain = as.numeric(
      gsub("%\\)", "", gsub(".+\\(", "", CNVGain))
    )
  )
## rename empty cell no
## Recode Annotations to "yes" / "no" for the whole column at once.
## A missing value counts as an empty cell ("no"): `NA == ""` is NA and cannot
## drive a scalar `if`, so the NA check has to come first.
has_annotation <- !is.na(newGDCTF$Annotations) & newGDCTF$Annotations != ""
## if_else() keeps the result character even when the table has no rows
newGDCTF$Annotations <- if_else(has_annotation, "yes", "no")
## calculate the percentage
## SSMAffectedCasesAcrossGDC is text that is split on " / "; the first piece
## is divided by the second and scaled to a percentage. The column is
## converted in one step so the result is stored as a numeric vector, rather
## than being written cell by cell into a character column (which turns each
## value back into text and rounds it to 15 significant digits).
ssm_text <- gsub(
  ",", "", as.character(newGDCTF$SSMAffectedCasesAcrossGDC),
  fixed = TRUE
)
ssm_parts <- strsplit(ssm_text, " / ", fixed = TRUE)
## `[` yields NA for a piece that is absent, so a malformed or missing cell
## becomes NA instead of stopping the script
ssm_affected <- as.numeric(vapply(ssm_parts, `[`, character(1), 1L))
ssm_total <- as.numeric(vapply(ssm_parts, `[`, character(1), 2L))
newGDCTF$SSMAffectedCasesAcrossGDC <- ssm_affected / ssm_total * 100
## convert strings to numbers
## Ensure the across-GDC percentage is held as a numeric column.
newGDCTF$SSMAffectedCasesAcrossGDC <-
  as.numeric(newGDCTF$SSMAffectedCasesAcrossGDC)

## Annotations becomes an ordered factor with "yes" placed before "no".
newGDCTF$Annotations <- factor(
  newGDCTF$Annotations,
  levels = c("yes", "no"),
  ordered = TRUE
)
| ENSG00000198300 |
PEG3 |
paternally expressed 3 |
7.53 |
19q13.43 |
6.910410 |
protein_coding |
4.73 |
5.36 |
907 |
no |
NA |
| ENSG00000164256 |
PRDM9 |
PR domain containing 9 |
6.57 |
5p14.2 |
6.028230 |
protein_coding |
4.86 |
1.52 |
704 |
no |
NA |
| ENSG00000116539 |
ASH1L |
ash1 (absent, small, or homeotic)-like (Drosophila) |
6.34 |
1q22 |
5.822388 |
protein_coding |
10.53 |
0.91 |
752 |
no |
NA |
| ENSG00000121297 |
TSHZ3 |
teashirt zinc finger homeobox 3 |
6.24 |
19q12 |
5.724368 |
protein_coding |
6.49 |
1.40 |
672 |
no |
NA |
| ENSG00000080603 |
SRCAP |
Snf2-related CREBBP activator protein |
6.17 |
16p11.2 |
5.665556 |
protein_coding |
4.45 |
0.85 |
743 |
no |
NA |
| ENSG00000103449 |
SALL1 |
spalt-like transcription factor 1 |
5.86 |
16q12.1 |
5.381298 |
protein_coding |
2.48 |
2.74 |
647 |
no |
NA |
| ENSG00000106571 |
GLI3 |
GLI family zinc finger 3 |
5.69 |
7p14.1 |
5.224466 |
protein_coding |
2.41 |
1.86 |
635 |
no |
NA |
| ENSG00000186487 |
MYT1L |
myelin transcription factor 1-like |
5.56 |
2p25.3 |
5.106842 |
protein_coding |
4.79 |
3.72 |
609 |
no |
NA |
| ENSG00000189079 |
ARID2 |
AT rich interactive domain 2 (ARID, RFX-like) |
5.45 |
12q12 |
4.999020 |
protein_coding |
3.28 |
3.44 |
617 |
yes |
NA |
| ENSG00000169946 |
ZFPM2 |
zinc finger protein, FOG family member 2 |
5.20 |
8q22.3, 8q23.1 |
4.773574 |
protein_coding |
10.63 |
2.00 |
552 |
no |
NA |
Plots
## Every figure below compares one cleaned metric between the two Annotations
## groups: a boxplot with the individual rows drawn on top as points.

## Mutations: the number of SSMs (simple somatic mutations) detected in the gene.
ggplot(newGDCTF, aes(Annotations, Mutations)) +
  geom_boxplot() +
  geom_point() +
  xlab("Cancer Gene Census?")

## SSMAffectedCasesInCohort: cases in the cohort affected by SSMs, as the
## percentage extracted from the parentheses of the raw field.
ggplot(newGDCTF, aes(Annotations, SSMAffectedCasesInCohort)) +
  geom_boxplot() +
  geom_point() +
  xlab("Cancer Gene Census?")

## SSMAffectedCasesAcrossGDC: cases across all GDC projects that contain a
## mutation on this gene, as the percentage computed from the "a / b" field.
ggplot(newGDCTF, aes(Annotations, SSMAffectedCasesAcrossGDC)) +
  geom_boxplot() +
  geom_point() +
  xlab("Cancer Gene Census?")

## CNVGain: CNV (copy number variation) events that increased the gene's copy
## number, as the percentage extracted from the raw field.
ggplot(newGDCTF, aes(Annotations, CNVGain)) +
  geom_boxplot() +
  geom_point() +
  xlab("Cancer Gene Census?")

## CNVLoss: CNV events that decreased the gene's copy number, as the
## percentage extracted from the raw field.
ggplot(newGDCTF, aes(Annotations, CNVLoss)) +
  geom_boxplot() +
  geom_point() +
  xlab("Cancer Gene Census?")
