2 Results

library(recount)

## Warning: package 'recount' was built under R version 4.3.2

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## Warning: package 'matrixStats' was built under R version 4.3.2

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Warning: package 'S4Vectors' was built under R version 4.3.2

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomeInfoDb

## Warning: package 'GenomeInfoDb' was built under R version 4.3.3

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

## Setting options('download.file.method.GEOquery'='auto')

## Setting options('GEOquery.inmemory.gpl'=FALSE)

library(edgeR)

## Warning: package 'edgeR' was built under R version 4.3.2

## Loading required package: limma

## 
## Attaching package: 'limma'

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

library(ggfortify)

## Warning: package 'ggfortify' was built under R version 4.3.2

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.3.2

library(tidyverse)

## Warning: package 'tidyr' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(Homo.sapiens)

## Loading required package: AnnotationDbi

## Warning: package 'AnnotationDbi' was built under R version 4.3.2

## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures

## Warning: package 'GenomicFeatures' was built under R version 4.3.3

## Loading required package: GO.db
## 
## Loading required package: org.Hs.eg.db
## 
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene

load("/Users/alyssayamada/Downloads/rse_gene_skin.Rdata")

rse_gene

## class: RangedSummarizedExperiment 
## dim: 58037 473 
## metadata(0):
## assays(1): counts
## rownames(58037): ENSG00000000003.14 ENSG00000000005.5 ...
##   ENSG00000283698.1 ENSG00000283699.1
## rowData names(3): gene_id bp_length symbol
## colnames(473): 72189C87-53DD-427E-B163-91E404577A7A
##   27496AAB-AC06-4100-BB41-0029F63CC678 ...
##   D868AC1A-11DB-4A07-BF2F-1429FA197F7B
##   4AFBA9FB-3DBA-4271-B20D-40580B89D09E
## colData names(864): project sample ...
##   xml_primary_pathology_history_myasthenia_gravis
##   xml_primary_pathology_section_myasthenia_gravis

counts <- assay(rse_gene)
genes <- rowData(rse_gene)
samples <- as.data.frame(colData(rse_gene))

TCGA <- DGEList(counts = counts, samples = samples, genes = genes)
TCGA <- calcNormFactors(TCGA, method = "TMM")
keep <- filterByExpr(TCGA)

## Warning in filterByExpr.DGEList(TCGA): All samples appear to belong to the same
## group.

TCGA <- TCGA[keep,]
cpm <- cpm(TCGA)
lcpm <- cpm(TCGA, log=TRUE)
pca <- prcomp(t(lcpm), scale = TRUE)
autoplot(pca, data = samples, colour = "cgc_case_age_at_diagnosis")

Figure 1: Principle component analysis on the variable age at diagnosis.

It can be seen that there seems to be a cluster towards the right side of the graph, however there is no distinct clustering of the same colour so subsequently age at diagnosis. PC1 and PC2 explains for 12.69% and 6.28% of the variation in the data respectively.

design <- model.matrix(~ cgc_case_age_at_diagnosis, samples)
v <- voom(TCGA, design, plot=TRUE)

Figure 2: Voom plot of age at diagnosis.

Each point represents a gene and the red line is fitted was fitted to the points. A log tranformation was applied so that each gene contribute equal weight in the data.

fit <- lmFit(v, design)
efit <- eBayes(fit)
age_fdr <- decideTests(efit, adjust.method = "fdr", p.value = 0.05)
summary(age_fdr)

##        (Intercept) cgc_case_age_at_diagnosis
## Down         14291                      1309
## NotSig        2074                     31247
## Up           17078                       887

The table shows that there were 1309 genes that were down regulated and 887 gene that were up regulated when using the false discovery rate.

age_bonferroni <- decideTests(efit, adjust.method = "bonferroni", p.value = 0.05)
summary(age_bonferroni)

##        (Intercept) cgc_case_age_at_diagnosis
## Down         12238                       102
## NotSig        5354                     33325
## Up           15851                        16

Using the Bonferroni method, 102 genes were upregulated and 16 genes were upregulated. This is because the Bonferroni is a stricker in comparison to the false discovery rate method.

hist(efit$p.value[samples$cgc_case_age_at_diagnosis])

Figure 3: Histogram of the p values in the age variable.

It can be seen that there is a significant difference between the genes in the age at diagnosis variable.

volcanoplot(efit, coef = "cgc_case_age_at_diagnosis", highlight = 10, names = efit$genes$symbol)

Figure 4: Volcano plot of the age at diagnosis variable

It can be seen in the volcano plot that the variable is less significance in as most of the points are headed toward 0 on the y axis. Additionally there is less difference between genes in the age variable. The most significantly differentiated genes is labeled in blue

age <- topTable(efit, coef = "cgc_case_age_at_diagnosis", adjust.method = "fdr", p.value = 0.05,
    number = Inf)
useCoef <- which(colnames(efit) == "cgc_case_age_at_diagnosis")
plotMD(efit, column = useCoef, status = age_fdr[, useCoef], main = colnames(efit)[useCoef],
    )

Figure 5: Mean difference plot of the age at diagnosis variable

The mean difference plot visualizes the non significant, up regulated and down regulated genes. It is notable that there is a high number of non significant genes.

load(url("http://bioinf.wehi.edu.au/software/MSigDB/human_c2_v5p2.rdata"))
symbol <- unlist(lapply(v$genes$symbol,function(x)x[1])) 
ENTREZID <- mapIds(Homo.sapiens, keys=symbol, column=c("ENTREZID"),keytype="SYMBOL", multiVals = "first")

## 'select()' returned 1:many mapping between keys and columns

v$genes$ENTREZID <-ENTREZID
idx <- ids2indices(Hs.c2,id=v$genes$ENTREZID)
cam<- camera(v,idx,design,contrast="cgc_case_age_at_diagnosis",trend.var = TRUE)
head(cam,10)

##                                             NGenes Direction       PValue
## SMID_BREAST_CANCER_NORMAL_LIKE_UP              452      Down 3.981207e-15
## PICCALUGA_ANGIOIMMUNOBLASTIC_LYMPHOMA_UP       198      Down 4.182985e-13
## SCHUETZ_BREAST_CANCER_DUCTAL_INVASIVE_UP       344      Down 1.966308e-12
## BOQUEST_STEM_CELL_UP                           260      Down 9.218588e-12
## GINESTIER_BREAST_CANCER_ZNF217_AMPLIFIED_DN    322        Up 1.124005e-11
## MCLACHLAN_DENTAL_CARIES_UP                     235      Down 3.052341e-11
## NAKAYAMA_SOFT_TISSUE_TUMORS_PCA2_DN             75      Down 3.513699e-10
## VECCHI_GASTRIC_CANCER_ADVANCED_VS_EARLY_UP     168      Down 1.583817e-09
## NIKOLSKY_BREAST_CANCER_8Q12_Q22_AMPLICON       122      Down 6.562718e-09
## RUIZ_TNC_TARGETS_UP                            151      Down 1.452923e-08
##                                                      FDR
## SMID_BREAST_CANCER_NORMAL_LIKE_UP           1.882713e-11
## PICCALUGA_ANGIOIMMUNOBLASTIC_LYMPHOMA_UP    9.890667e-10
## SCHUETZ_BREAST_CANCER_DUCTAL_INVASIVE_UP    3.099556e-09
## BOQUEST_STEM_CELL_UP                        1.063084e-08
## GINESTIER_BREAST_CANCER_ZNF217_AMPLIFIED_DN 1.063084e-08
## MCLACHLAN_DENTAL_CARIES_UP                  2.405753e-08
## NAKAYAMA_SOFT_TISSUE_TUMORS_PCA2_DN         2.373755e-07
## VECCHI_GASTRIC_CANCER_ADVANCED_VS_EARLY_UP  9.362336e-07
## NIKOLSKY_BREAST_CANCER_8Q12_Q22_AMPLICON    3.448344e-06
## RUIZ_TNC_TARGETS_UP                         6.672495e-06

Table containing the top 10 most significant genes where it can be seen that Breast Cancer, Lymphoma and invasive Breast Cancer is the most sifnificant and are upregulated genes.

library(gplots)

## Warning: package 'gplots' was built under R version 4.3.2

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:IRanges':
## 
##     space

## The following object is masked from 'package:S4Vectors':
## 
##     space

## The following object is masked from 'package:stats':
## 
##     lowess

age.topgenes <- age$symbol[1:5]

i <- which(v$genes$symbol %in% age.topgenes)
coolmap(lcpm[i, ], margin = c(8, 6), cexRow = 0.7, cexCol = 0.7)

$Figure 6: Heat map of genes frpm age at diagnosos variable.$

Figure 6: Heat map of genes frpm age at diagnosos variable.

It can be seen that majority of the genes are not significant, indicated by white. However, it is notable that there is lowly expressed genes to the left of the map for multiple genes.

strictage <- topTable(efit, coef = "cgc_case_age_at_diagnosis", adjust.method = "fdr", p.value = 0.01, number = Inf)

nrow(strictage)

## [1] 686

sortedage <- strictage[order(-strictage$logFC), ]
head(strictage)

##                               gene_id bp_length  symbol       logFC    AveExpr
## ENSG00000168306.12 ENSG00000168306.12      3859   ACOX2 -0.03901059  0.1318327
## ENSG00000168672.3   ENSG00000168672.3      5404  FAM84B -0.03837188  4.0270064
## ENSG00000283383.1   ENSG00000283383.1       526      NA -0.07131446 -6.5907938
## ENSG00000164056.10 ENSG00000164056.10      3963   SPRY1 -0.02769988  4.8152212
## ENSG00000164237.8   ENSG00000164237.8      3324    CMBL -0.04480958  1.9469306
## ENSG00000271079.1   ENSG00000271079.1      2587 CTAGE15 -0.03580224 -2.3382112
##                            t      P.Value    adj.P.Val        B
## ENSG00000168306.12 -7.048727 6.397633e-12 1.989368e-07 15.56639
## ENSG00000168672.3  -6.952842 1.189706e-11 1.989368e-07 14.58284
## ENSG00000283383.1  -6.724399 5.078148e-11 5.660950e-07 14.33622
## ENSG00000164056.10 -6.674685 6.929084e-11 5.793233e-07 12.79098
## ENSG00000164237.8  -6.495015 2.098230e-10 1.403422e-06 11.95849
## ENSG00000271079.1  -6.403542 3.654429e-10 2.036918e-06 11.90323

MSRG <- read.csv("https://wimr-genomics.vip.sydney.edu.au/AMED3002/data/MSRG.txt",
    sep = "")

It can be seen that gene ENSG00000168306.12 is the most significant gene compared to other genes in the data set.

load(url("http://bioinf.wehi.edu.au/software/MSigDB/human_c2_v5p2.rdata"))

# We need to make indicies to match these gene sets to the genes in our
# expression data (v)
idx <- ids2indices(Hs.c2, id = v$genes$ENTREZID)
cam.age <- camera(v, idx, design, contrast = "cgc_case_age_at_diagnosis")
head(cam.age, 10)

##                                             NGenes Direction       PValue
## SMID_BREAST_CANCER_NORMAL_LIKE_UP              452      Down 3.980203e-15
## PICCALUGA_ANGIOIMMUNOBLASTIC_LYMPHOMA_UP       198      Down 4.175663e-13
## SCHUETZ_BREAST_CANCER_DUCTAL_INVASIVE_UP       344      Down 1.964203e-12
## BOQUEST_STEM_CELL_UP                           260      Down 9.215426e-12
## GINESTIER_BREAST_CANCER_ZNF217_AMPLIFIED_DN    322        Up 1.114458e-11
## MCLACHLAN_DENTAL_CARIES_UP                     235      Down 3.049996e-11
## NAKAYAMA_SOFT_TISSUE_TUMORS_PCA2_DN             75      Down 3.514456e-10
## VECCHI_GASTRIC_CANCER_ADVANCED_VS_EARLY_UP     168      Down 1.582042e-09
## NIKOLSKY_BREAST_CANCER_8Q12_Q22_AMPLICON       122      Down 6.552082e-09
## RUIZ_TNC_TARGETS_UP                            151      Down 1.450476e-08
##                                                      FDR
## SMID_BREAST_CANCER_NORMAL_LIKE_UP           1.882238e-11
## PICCALUGA_ANGIOIMMUNOBLASTIC_LYMPHOMA_UP    9.873355e-10
## SCHUETZ_BREAST_CANCER_DUCTAL_INVASIVE_UP    3.096239e-09
## BOQUEST_STEM_CELL_UP                        1.054054e-08
## GINESTIER_BREAST_CANCER_ZNF217_AMPLIFIED_DN 1.054054e-08
## MCLACHLAN_DENTAL_CARIES_UP                  2.403905e-08
## NAKAYAMA_SOFT_TISSUE_TUMORS_PCA2_DN         2.374266e-07
## VECCHI_GASTRIC_CANCER_ADVANCED_VS_EARLY_UP  9.351844e-07
## NIKOLSKY_BREAST_CANCER_8Q12_Q22_AMPLICON    3.442755e-06
## RUIZ_TNC_TARGETS_UP                         6.656539e-06

The top 10 most significant cancers were graphed in a table. The most significant cancer that is downregulated by age is normal like Breast Cancer with a very low p value suggesting that the observations were not due to chance. The only cancer that is upregulated is amplified breast cancer.

cam.age$pathway <- row.names(cam.age)
trimmedCam_age <- dplyr::filter(cam.age, grepl("KEGG|REACTOME|BIOCARTA", pathway))
head(trimmedCam_age)

##                                                  NGenes Direction       PValue
## REACTOME_POST_CHAPERONIN_TUBULIN_FOLDING_PATHWAY     18        Up 1.173329e-07
## REACTOME_REGULATION_OF_COMPLEMENT_CASCADE            12      Down 1.621063e-06
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION         223      Down 4.265086e-06
## KEGG_COMPLEMENT_AND_COAGULATION_CASCADES             59      Down 5.817660e-06
## KEGG_HEMATOPOIETIC_CELL_LINEAGE                      79      Down 6.216634e-06
## KEGG_OXIDATIVE_PHOSPHORYLATION                      114        Up 8.374448e-06
##                                                           FDR
## REACTOME_POST_CHAPERONIN_TUBULIN_FOLDING_PATHWAY 2.016347e-05
## REACTOME_REGULATION_OF_COMPLEMENT_CASCADE        1.437312e-04
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION      2.964811e-04
## KEGG_COMPLEMENT_AND_COAGULATION_CASCADES         3.668229e-04
## KEGG_HEMATOPOIETIC_CELL_LINEAGE                  3.777592e-04
## KEGG_OXIDATIVE_PHOSPHORYLATION                   4.710939e-04
##                                                                                           pathway
## REACTOME_POST_CHAPERONIN_TUBULIN_FOLDING_PATHWAY REACTOME_POST_CHAPERONIN_TUBULIN_FOLDING_PATHWAY
## REACTOME_REGULATION_OF_COMPLEMENT_CASCADE               REACTOME_REGULATION_OF_COMPLEMENT_CASCADE
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION           KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION
## KEGG_COMPLEMENT_AND_COAGULATION_CASCADES                 KEGG_COMPLEMENT_AND_COAGULATION_CASCADES
## KEGG_HEMATOPOIETIC_CELL_LINEAGE                                   KEGG_HEMATOPOIETIC_CELL_LINEAGE
## KEGG_OXIDATIVE_PHOSPHORYLATION                                     KEGG_OXIDATIVE_PHOSPHORYLATION

The top 6 most significant pathways were in this table, it can be seen that reactome post chaperonin tubulin folding pathway was the most significant according to the false discovery rate value. The age variable upregulates this pathway that contains 18 genes.

barcodeplot(efit$t[, "cgc_case_age_at_diagnosis"], index = idx$REACTOME_POST_CHAPERONIN_TUBULIN_FOLDING_PATHWAY)

Figure 7: Barcode plot of the reactome post chperonin tubulin folding pathway.

The barcode plot plots 18 genes from the reactome post chaperonin tubulin folding pathway. It shows that there is significant enrichment in upregulation, as the running enrichment increases and goes over the enrichment threshold.

AMED3002 Lab Report 3: Does the age of Diagnosis

2024-04-27

1 Introduction

2 Results

3 Conclusion