library(tidyverse)
library(dplyr)
library(ggplot2)
library(gsubfn)
library("yarrr")
library("knitr")
## Read the raw GDC gene table export; text columns stay as character vectors.
GDCTF <- read.csv("~/Desktop/GDCAnalysis/GDCTF.csv", stringsAsFactors = FALSE)

## Build the cleaned table: give the auto-generated "X.." column names
## readable names, then turn the text columns into numbers.
newGDCTF <- GDCTF %>%
  rename(
    SSMAffectedCasesInCohort = X..SSM.Affected.Cases.in.Cohort,
    SSMAffectedCasesAcrossGDC = X..SSM.Affected.Cases.Across.the.GDC,
    CNVGain = X..CNV.Gain,
    CNVLoss = X..CNV.Loss,
    Mutations = X..Mutations
  ) %>%
  mutate(
    ## drop the thousands separators before converting the count
    Mutations = as.numeric(gsub(",", "", Mutations)),
    ## keep only the percentage: strip everything up to the last "(" and the
    ## trailing "%)" (presumably cells look like "count (pct%)" -- confirm
    ## against the CSV)
    SSMAffectedCasesInCohort = as.numeric(
      gsub("%\\)", "", gsub(".+\\(", "", SSMAffectedCasesInCohort))
    ),
    CNVLoss = as.numeric(
      gsub("%\\)", "", gsub(".+\\(", "", CNVLoss))
    ),
    CNVGain = as.numeric(
      gsub("%\\)", "", gsub(".+\\(", "", CNVGain))
    )
  )

## Flag each gene as "yes" when its Annotations cell has any content and "no"
## when it is empty. The whole column is recoded at once instead of walking
## the rows with a manual counter. A missing (NA) cell is treated as empty, so
## the comparison can no longer abort the script with
## "missing value where TRUE/FALSE needed".
newGDCTF$Annotations <- ifelse(
  is.na(newGDCTF$Annotations) | newGDCTF$Annotations == "",
  "no",
  "yes"
)

## Convert SSMAffectedCasesAcrossGDC from "affected / total" text into a
## percentage (affected / total * 100).
## The whole column is computed in one pass and stored as numeric directly.
## Writing each result back into the character column row by row forced every
## value through a string (about 15 significant digits) before it was parsed
## back into a number. Cells that do not contain both parts yield NA.
newGDCTF$SSMAffectedCasesAcrossGDC <- local({
  parts <- strsplit(
    as.character(newGDCTF$SSMAffectedCasesAcrossGDC),
    " / ",
    fixed = TRUE
  )
  ## strip thousands separators, then parse
  to_number <- function(x) as.numeric(gsub(",", "", x, fixed = TRUE))
  ## indexing past the end of a split gives NA, so short cells stay length 1
  affected <- to_number(vapply(parts, `[`, character(1), 1L))
  total <- to_number(vapply(parts, `[`, character(1), 2L))
  affected / total * 100
})

## Store Annotations as an ordered factor whose level order is "yes" < "no".
newGDCTF$Annotations <- ordered(newGDCTF$Annotations, levels = c("yes", "no"))
Gene.ID Symbol Name SSMAffectedCasesInCohort Cytoband SSMAffectedCasesAcrossGDC Type CNVGain CNVLoss Mutations Annotations Survival
ENSG00000198300 PEG3 paternally expressed 3 7.53 19q13.43 6.910410 protein_coding 4.73 5.36 907 no NA
ENSG00000164256 PRDM9 PR domain containing 9 6.57 5p14.2 6.028230 protein_coding 4.86 1.52 704 no NA
ENSG00000116539 ASH1L ash1 (absent, small, or homeotic)-like (Drosophila) 6.34 1q22 5.822388 protein_coding 10.53 0.91 752 no NA
ENSG00000121297 TSHZ3 teashirt zinc finger homeobox 3 6.24 19q12 5.724368 protein_coding 6.49 1.40 672 no NA
ENSG00000080603 SRCAP Snf2-related CREBBP activator protein 6.17 16p11.2 5.665556 protein_coding 4.45 0.85 743 no NA
ENSG00000103449 SALL1 spalt-like transcription factor 1 5.86 16q12.1 5.381298 protein_coding 2.48 2.74 647 no NA
ENSG00000106571 GLI3 GLI family zinc finger 3 5.69 7p14.1 5.224466 protein_coding 2.41 1.86 635 no NA
ENSG00000186487 MYT1L myelin transcription factor 1-like 5.56 2p25.3 5.106842 protein_coding 4.79 3.72 609 no NA
ENSG00000189079 ARID2 AT rich interactive domain 2 (ARID, RFX-like) 5.45 12q12 4.999020 protein_coding 3.28 3.44 617 yes NA
ENSG00000169946 ZFPM2 zinc finger protein, FOG family member 2 5.20 8q22.3, 8q23.1 4.773574 protein_coding 10.63 2.00 552 no NA

Plots

## Layers shared by every plot below: a boxplot per Annotations group, the
## individual genes drawn as points on top, and a common x-axis title.
census_layers <- list(
  geom_boxplot(),
  geom_point(),
  labs(x = "Cancer Gene Census?")
)

## Number of SSMs (simple somatic mutations) detected in each gene.
ggplot(newGDCTF, aes(x = Annotations, y = Mutations)) +
  census_layers

## Cases in the cohort affected by SSMs in each gene; after the cleaning step
## this column holds the percentage, not the raw case count.
ggplot(newGDCTF, aes(x = Annotations, y = SSMAffectedCasesInCohort)) +
  census_layers

## Cases across all GDC projects with a mutation in each gene, expressed as
## the percentage computed from the "affected / total" counts.
ggplot(newGDCTF, aes(x = Annotations, y = SSMAffectedCasesAcrossGDC)) +
  census_layers

## CNV (copy number variation) gain events for each gene, i.e. events that
## increased the gene's copy number; this column holds the percentage.
ggplot(newGDCTF, aes(x = Annotations, y = CNVGain)) +
  census_layers