R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

Grouping and combining data.

In this part, I am going to try to reproduce the week 5 session of the R course run by the Bioinformatics Core at the Cancer Research UK Cambridge Institute, using some functions from the dplyr package. The difference is that the data used here come from the Pediatric Brain Cancer study: https://www.cbioportal.org/study/clinicalData?id=brain_cptac_2020.

I’ll also try to customize the plots using ggplot2. Both dplyr and ggplot2 are core packages of the tidyverse, so both are loaded when the tidyverse is loaded.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Read the tab-separated clinical data file into a tibble; column types are
# guessed by readr.
brain <- read_tsv(file = "brain_cptac_2020_clinical_data.tsv")
## Rows: 218 Columns: 63
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (51): Study ID, Patient ID, Sample ID, Age Class, BRAF_RELA Status, BRAF...
## dbl (12): AGE, Age at Chemotherapy Start, Age at Chemotherapy Stop, Age at I...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first four rows of the tibble to check the import.
brain %>%
  head(n = 4)
## # A tibble: 4 × 63
##   Study …¹ Patie…² Sampl…³   AGE Age a…⁴ Age a…⁵ Age a…⁶ Age a…⁷ Age a…⁸ Age a…⁹
##   <chr>    <chr>   <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 brain_c… PT_CWD… 7316-1…     9      NA      NA    3472    4008      NA      NA
## 2 brain_c… PT_5FR… 7316-1…    11      NA      NA    4234    6510      NA      NA
## 3 brain_c… PT_RM5… 7316-1…    12    4630    4720    4519    6220    4556    4600
## 4 brain_c… PT_3X3… 7316-1…    30   11201   11242   11184   11881   11215   11247
## # … with 53 more variables: `Age at Specimen Diagnosis` <dbl>,
## #   `Age Class` <chr>, `BRAF_RELA Status` <chr>, `BRAF Status` <chr>,
## #   `BRAF Status2` <chr>, `Cancer Predispositions` <chr>, `Cancer Type` <chr>,
## #   `Cancer Type Detailed` <chr>, Chemotherapy <chr>,
## #   `Chemotherapy Agents` <chr>, `Chemotherapy Type` <chr>,
## #   `Clinical Status at Collection Event` <chr>,
## #   `Completed Radiation Dose` <chr>, `Completed Total Radiation Dose` <chr>, …
## # ℹ Use `colnames()` to see all variable names
# Tabulate chemotherapy status. table() drops missing values by default, and
# the four counts printed for the original call sum to 209 of the 218 rows
# read, so request an explicit NA count whenever any value is missing.
table(brain$Chemotherapy, useNA = "ifany")
## 
##           No Not Reported  Unavailable          Yes 
##          136           22            3           48
# Numeric summary (quartiles, mean and number of NAs) of the mutation counts.
summary(brain[["Mutation Count"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    3.00   13.00   27.00   44.94   59.50  587.00      18

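We can create a scatter plot to see if there is any relationship between the age of the patient at diagnosis of brain cancer and another numeric variable. The original course session uses the Nottingham prognostic index for this, but the brain cancer dataset has no column with that name.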

# Base-graphics scatter plot of age at initial diagnosis against mutation
# count. The earlier version used `Nottingham prognostic index` for y, but
# `brain` has no such column: the call only raised an "Unknown or
# uninitialised column" warning. `Mutation Count` is a numeric column that
# does exist in this dataset.
# NOTE(review): `Age at Initial Diagnosis` looks like it is recorded in days
# (3472 for a patient with AGE 9) -- confirm before changing the axis label.
plot(brain$`Age at Initial Diagnosis`, brain$`Mutation Count`,
     xlab = "Age at diagnosis", ylab = "Mutation count")
## Warning: Unknown or uninitialised column: `Nottingham prognostic index`.

library(readr)

# Read the semicolon-separated CSV copy of the clinical data into a base
# data frame. header = TRUE, sep = ";", dec = "," and fill = TRUE are already
# the defaults of read.csv2(), so they are not repeated here.
braincsv <- read.csv2(file = "brain_cptac_2020_clinical_data.csv")


# Preview the first four rows of the data frame read from the CSV file.
braincsv %>%
  head(n = 4)
##           Study.ID  Patient.ID Sample.ID AGE Age.at.Chemotherapy.Start
## 1 brain_cptac_2020 PT_CWD717Q0  7316-101   9                        NA
## 2 brain_cptac_2020 PT_5FR2YA6E  7316-109  11                        NA
## 3 brain_cptac_2020 PT_RM5S859Q  7316-111  12                      4630
## 4 brain_cptac_2020 PT_3X3MF8ZD  7316-114  30                     11201
##   Age.at.Chemotherapy.Stop Age.at.Initial.Diagnosis
## 1                       NA                     3472
## 2                       NA                     4234
## 3                     4720                     4519
## 4                    11242                    11184
##   Age.at.Last.Known.Clinical.Status Age.at.Radiation.Start
## 1                              4008                     NA
## 2                              6510                     NA
## 3                              6220                   4556
## 4                             11881                  11215
##   Age.at.Radiation.Stop Age.at.Specimen.Diagnosis Age.Class BRAF_RELA.Status
## 1                    NA                      3472    [5-10)             <NA>
## 2                    NA                      4234   [10-15)             <NA>
## 3                  4600                      4519   [10-15)      RELA Fusion
## 4                 11247                     11184       20+             <NA>
##   BRAF.Status BRAF.Status2 Cancer.Predispositions                    CancerType
## 1        <NA>         <NA>        None documented                        Glioma
## 2        <NA>         <NA>        None documented               Embryonal Tumor
## 3 RELA Fusion         <NA>        None documented                    CNS Cancer
## 4        <NA>         <NA>           Not Reported High-grade glioma/astrocytoma
##           Cancer.Type.Detailed Chemotherapy
## 1                Ganglioglioma           No
## 2              Medulloblastoma          Yes
## 3            Ependymomal Tumor          Yes
## 4 Pediatric High Grade Gliomas          Yes
##                                   Chemotherapy.Agents
## 1                                                <NA>
## 2                                Cisplatin, Lomustine
## 3 Cisplatin, Cyclophosphamide, Etoposide, Vincristine
## 4                                        Temozolamide
##                                                                         Chemotherapy.Type
## 1                                                                                    <NA>
## 2                                Treatment follows a protocol but subject is not enrolled
## 3                      Treatment follows a protocol and subject is enrolled on a protocol
## 4 Treatment follows other standard of care not associated with a current or past protocol
##   Clinical.Status.at.Collection.Event Completed.Radiation.Dose
## 1                               Alive                     <NA>
## 2                               Alive                     <NA>
## 3                               Alive                 5940 cGy
## 4                               Alive                 2340 cGy
##   Completed.Total.Radiation.Dose CTNNB1.Status DFS.Months            DFS.Status
## 1                           <NA>          <NA>         17         0:DiseaseFree
## 2                           <NA>          <NA>         47 1:Recurred/Progressed
## 3                       5940 cGy     CTNNB1 WT         55         0:DiseaseFree
## 4                       4500 cGy          <NA>         22         0:DiseaseFree
##   Ependymoma_RELA.Status              Ethnicity  Extent.of.Tumor.Resection
## 1                   <NA> Not Hispanic or Latino Gross/Near total resection
## 2                   <NA> Not Hispanic or Latino                       <NA>
## 3            RELA Fusion Not Hispanic or Latino Gross/Near total resection
## 4                   <NA> Not Hispanic or Latino                Biopsy only
##   External.Patient.ID
## 1              C23862
## 2              C28905
## 3              C30381
## 4              C30258
##                                                                                                                    Formulation
## 1                                                                                                                         <NA>
## 2 Reduced cisplatin dose by 50% due to high frequency hearing loss and reduced CCNU dose by 25% due to persistent neutropenia.
## 3                                                                                                                 Not Reported
## 4                                                                                                                 Not Reported
##   H3F3A_CTNNB1.Status HGG_H3F3A.Status
## 1                <NA>             <NA>
## 2                <NA>             <NA>
## 3           CTNNB1 WT             <NA>
## 4            H3F3A WT         H3F3A WT
##   Initial.CNS.Tumor.Diagnosis.Related.to.OS
## 1                             Ganglioglioma
## 2                           Medulloblastoma
## 3                                Ependymoma
## 4             High-grade glioma/astrocytoma
##                 Initial.Diagnosis.Type LGG_BRAF.Status
## 1                    Initial CNS Tumor            <NA>
## 2                    Initial CNS Tumor            <NA>
## 3                    Initial CNS Tumor            <NA>
## 4 Second Malignancy (first HGG record)            <NA>
##                                                                                                  Medical.Conditions
## 1                                                                                                  Seizure disorder
## 2                                                                                      Other medical conditions NOS
## 3             Focal neurological deficit (cranial nerve palsies, motor deficits, sensory deficits),Seizure disorder
## 4 Focal neurological deficit (cranial nerve palsies, motor deficits, sensory deficits),Other medical conditions NOS
##   Multiple.Cancer.Predispositions Multiple.Medical.Conditions
## 1                              No                          No
## 2                              No                          No
## 3                              No                         Yes
## 4                            <NA>                         Yes
##   Multiple.Tumor.Locations Mutation.Count Oncotree.Code OS.Months  OS.Status
## 1                       No             69           GNG        17   0:LIVING
## 2                       No             41           MBL        74 1:DECEASED
## 3                       No             40          EPMT        55   0:LIVING
## 4                      Yes             46          PHGG        22 1:DECEASED
##   Protocol.and.Treatment.Arm                      Race Radiation
## 1                       <NA> Black or African American        No
## 2                    CHP 455                     White        No
## 3         COG trial ACNS0831                     White       Yes
## 4               Not Reported                     White       Yes
##                  Radiation.Site                     Radiation.Type
## 1                          <NA>                               <NA>
## 2                          <NA>                               <NA>
## 3               Focal/Tumor bed                            Protons
## 4 Craniospinal with focal boost Combination of Protons and Photons
##   Sample.Annotation Number.of.Samples.Per.Patient             Sample.Origin
## 1   Treatment naive                             1 Initial CNS Tumor Surgery
## 2   Treatment naive                             1                      <NA>
## 3   Treatment naive                             1 Initial CNS Tumor Surgery
## 4   Treatment naive                             1         Second malignancy
##      Sex Surgery Timing.Other.Notes          Treatment Treatment.Changed
## 1   Male     Yes               <NA>                New         Diagnosis
## 2   Male      No       Not Reported Modified Treatment   12 Month Update
## 3 Female     Yes       Not Reported                New         Diagnosis
## 4 Female     Yes       Not Reported                New         Diagnosis
##   Treatment.Status Tumor.Location.Condensed
## 1  Treatment naive            Temporal Lobe
## 2  Treatment naive               Cerebellar
## 3  Treatment naive           Supratentorial
## 4  Treatment naive                  Midline
##                                                                                                    Tumor.Tissue.Site
## 1                                                                                                      Temporal Lobe
## 2                                                                                         Cerebellum/Posterior Fossa
## 3                                                                                                     Occipital Lobe
## 4 Brain Stem- Pons,Brain Stem-Medulla,Spinal Cord- Cervical,Spinal Cord- Thoracic,Suprasellar/Hypothalamic/Pituitary
##          Tumor.Type Updated.Grade
## 1           Primary             I
## 2           Primary            IV
## 3           Primary           III
## 4 Second Malignancy            IV
#library(ggplot2)

# Scatter plot of patient age against cancer type, one point per row.
ggplot(braincsv, aes(x = AGE, y = CancerType)) +
  geom_point()

# Same scatter plot, with the points coloured by overall survival status.
ggplot(braincsv, aes(x = AGE, y = CancerType, colour = OS.Status)) +
  geom_point()

# Scatter plot coloured by a continuous variable. The earlier version mapped
# the quoted string "Nottingham_prognostic_index" inside aes(); ggplot2 treats
# a quoted string as a constant, so every point got the same colour, and no
# column of that name appears in braincsv. Mutation.Count is a numeric column
# that is present, so it is used for the colour gradient instead.
ggplot(data = braincsv) +
  geom_point(mapping = aes(x = AGE, y = CancerType, colour = Mutation.Count))

# Scatter plot distinguishing overall survival status by point shape.
# OS.Status is categorical ("0:LIVING" / "1:DECEASED"); mapping a discrete
# variable to size implies an ordering that is not there and makes ggplot2
# warn that it is not advised, so shape is used instead.
ggplot(data = braincsv) +
  geom_point(mapping = aes(x = AGE, y = CancerType, shape = OS.Status))

# Bar chart of the number of rows per cancer type; x-axis labels are tilted
# by 15 degrees and drawn at size 10.
ggplot(braincsv, aes(x = CancerType)) +
  geom_bar() +
  theme(axis.text.x = element_text(size = 10, angle = 15))

# Same bar chart, with each bar filled by its own cancer type.
ggplot(braincsv, aes(x = CancerType, fill = CancerType)) +
  geom_bar() +
  theme(axis.text.x = element_text(size = 10, angle = 15))

# Bar chart of rows per cancer type, with each bar split by sex using the
# fill colour.
ggplot(braincsv, aes(x = CancerType, fill = Sex)) +
  geom_bar() +
  theme(axis.text.x = element_text(size = 10, angle = 15))

# Scatter plot with a smoothed trend line. geom_smooth() fits a curve along
# the x axis, which is only meaningful for a continuous x variable; the
# earlier version put the categorical CancerType on x. Age is now plotted
# against mutation count, both numeric columns of braincsv. The x-axis label
# rotation was only needed for the long category names and is dropped.
ggplot(data = braincsv, mapping = aes(x = AGE, y = Mutation.Count)) +
  geom_point(size = 2, alpha = 1) +
  geom_smooth()

# Age against cancer type, coloured by overall survival status, with fixed
# point size and opacity and tilted x-axis labels.
ggplot(braincsv) +
  geom_point(
    aes(x = AGE, y = CancerType, colour = OS.Status),
    size = 2,
    alpha = 1
  ) +
  theme(axis.text.x = element_text(size = 10, angle = 15))

## Box plots

# Box plot of the age distribution for each cancer type.
ggplot(braincsv) +
  geom_boxplot(aes(x = AGE, y = CancerType))

See the geom_boxplot help page for an explanation of how the box and whiskers are constructed and how it decides which points are outliers to be displayed individually.

# Box plot of age per cancer type with the individual observations drawn on
# top as points.
ggplot(braincsv, aes(AGE, CancerType)) +
  geom_boxplot() +
  geom_point()

Here we can notice an atypical value: an age of 30 years in a pediatric brain cancer dataset, which I think is outside the pediatric range.

Ideally, we’d like these points to be spread out a bit. The geom_point help points to geom_jitter as more suitable when one of the variables is categorical.

# Box plot of age per cancer type with the observations overlaid as jittered,
# semi-transparent points. The categorical variable is on the y axis here, so
# the points are spread vertically (height) within each category. width = 0
# keeps every point at its true age; the earlier width = 0.3 shifted the
# continuous AGE values instead.
ggplot(data = braincsv, mapping = aes(x = AGE, y = CancerType)) +
  geom_boxplot() +
  geom_jitter(width = 0, height = 0.3, size = 0.8, alpha = 0.25)

# Box plots of age per cancer type, drawn separately and coloured for each
# overall survival status.
ggplot(braincsv, aes(AGE, CancerType, colour = OS.Status)) +
  geom_boxplot()