Background
Intratumoral hypoxia (reduced O2 availability) is a common finding in human cancer
Hypoxia leads to increased activity of hypoxia-inducible factors (HIFs), which regulate the expression of genes that contribute to angiogenesis, metabolic reprogramming, extracellular matrix remodeling, epithelial–mesenchymal transition, motility, invasion, metastasis, cancer stem cell maintenance, immune evasion, and resistance to chemotherapy and radiation therapy.
Conventional anticancer therapies target well-oxygenated and proliferating cancer cells, whereas there are no approved therapies that target hypoxic cancer cells, despite growing clinical and experimental evidence indicating that intratumoral hypoxia is a critical microenvironmental factor driving cancer progression
Calreticulin is an endoplasmic reticulum (ER) resident protein critical for maintaining Ca2+ homeostasis and glycoprotein folding in the ER. The protein has also been identified on the cell surface of apoptotic and necrotic cells and implicated to play a role in immunogenic cell death and other extracellular functions. Under cell stress conditions (environmental, drug induced, hypoxia), calreticulin may be upregulated as it attempts to regulate cell survival, death or repair. The initial signaling mechanisms involved in these processes may be regulated by the unfolded protein response (UPR) and genome damage response (GDR) pathways.
Major Findings from the paper
CALR is overexpressed in BC compared with normal tissue, and its expression is correlated with patient mortality and stemness indices
CALR expression was increased in mammosphere cultures, CD24−CD44+ cells, and aldehyde dehydrogenase–expressing cells, which are enriched for breast cancer stem cells (BCSCs)
CALR knockdown led to BCSC depletion, which impaired tumor initiation and metastasis and enhanced chemosensitivity in vivo
Chromatin immunoprecipitation and reporter assays revealed that hypoxia-inducible factor 1 (HIF-1) directly activated CALR transcription in hypoxic BC cells
CALR expression was correlated with Wnt/β-catenin pathway activation
Conclusion: CALR facilitates BC progression by promoting the BCSC phenotype through Wnt/β-catenin signaling in an HIF-1–dependent manner and suggest that CALR may represent a target for BC therapy.
# Load library for DESeq2
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Warning: package 'GenomicRanges' was built under R version 4.1.2
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.1.2
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
# Load library for RColorBrewer
library(RColorBrewer)
# Load library for pheatmap
library(pheatmap)
# Load library for tidyverse
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0.9000
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::collapse() masks IRanges::collapse()
## x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count() masks matrixStats::count()
## x dplyr::desc() masks IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x dplyr::lag() masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename() masks S4Vectors::rename()
## x dplyr::slice() masks IRanges::slice()
library(dplyr)
library(remotes)
## Warning: package 'remotes' was built under R version 4.1.2
#Reading Raw Dataset
#Read dataset
GSE181431_expressed_gene_reads <- read.delim("D:/MScDataScience/dataHPC/finalExams/newFinal/GSE181431_expressed_gene_reads.txt", stringsAsFactors=TRUE)
rawdata <- GSE181431_expressed_gene_reads
head(rawdata)
## Gene CALR_KD1 CALR_KD2 CALR_KD3 Ctrl_1 Ctrl_2 Ctrl_3
## 1 ENSG00000237851.1 0 2 0 0 1 1
## 2 ENSG00000164054.15 12431 8441 15423 10959 11509 12096
## 3 ENSG00000257527.1 12 63 19 20 18 22
## 4 ENSG00000140830.8 852 1082 1177 959 1687 1773
## 5 ENSG00000180613.10 6 7 8 15 16 15
## 6 ENSG00000204444.10 24 7 16 16 12 14
## Symbol Chr GeneType Description
## 1 RP1-67K17.4 chr6 lincRNA --
## 2 SHISA5 chr3 protein_coding shisa family member 5
## 3 MIR3179-3 chr16 lincRNA microRNA 3179-3
## 4 TXNL4B chr16 protein_coding thioredoxin-like 4B
## 5 GSX2 chr4 protein_coding GS homeobox 2
## 6 APOM chr6 protein_coding apolipoprotein M
dim(rawdata)
## [1] 31582 11
#filter to obtain datasets for only protein_coding
rawdatafiltered <- rawdata %>% filter(GeneType=="protein_coding")
dim(rawdatafiltered)
## [1] 17231 11
numraw <- rawdatafiltered %>% select(2:7) #select only the reads
rawcolsum <- colSums(numraw)
framecolsum <- data.frame(rawcolsum)
names(framecolsum) <- c("count")
framecolsum$condition <- rownames(framecolsum)
rownames(framecolsum) <- NULL
framecolsum
## count condition
## 1 36459467 CALR_KD1
## 2 37397554 CALR_KD2
## 3 41206322 CALR_KD3
## 4 35774946 Ctrl_1
## 5 33412247 Ctrl_2
## 6 39214898 Ctrl_3
#plotting the total read counts per library
p.barplot <- ggplot(data=framecolsum, aes(x=condition, y=count)) + geom_bar(stat="identity", fill="steelblue")
p.barplot + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + geom_text(aes(label=count), vjust=-0.3, size=3.5)
#Distribution of raw data
rawdatafiltered %>% select(2:7) %>% boxplot()
p <- ggplot(rawdatafiltered, aes(x=Ctrl_1)) +
geom_density()
p
#Reshaping the raw dataset in preparation for Deseq object
head(rawdatafiltered)
## Gene CALR_KD1 CALR_KD2 CALR_KD3 Ctrl_1 Ctrl_2 Ctrl_3 Symbol
## 1 ENSG00000164054.15 12431 8441 15423 10959 11509 12096 SHISA5
## 2 ENSG00000140830.8 852 1082 1177 959 1687 1773 TXNL4B
## 3 ENSG00000180613.10 6 7 8 15 16 15 GSX2
## 4 ENSG00000204444.10 24 7 16 16 12 14 APOM
## 5 ENSG00000163249.9 1069 1422 1224 1140 1121 1426 CCNYL1
## 6 ENSG00000265808.3 7012 12528 9632 8643 8670 9827 SEC22B
## Chr GeneType
## 1 chr3 protein_coding
## 2 chr16 protein_coding
## 3 chr4 protein_coding
## 4 chr6 protein_coding
## 5 chr2 protein_coding
## 6 chr1 protein_coding
## Description
## 1 shisa family member 5
## 2 thioredoxin-like 4B
## 3 GS homeobox 2
## 4 apolipoprotein M
## 5 cyclin Y-like 1
## 6 SEC22 vesicle trafficking protein homolog B (S. cerevisiae) (gene/pseudogene)
counts_rawdatafiltered <- rawdatafiltered %>% select(1:7) %>% column_to_rownames(var = "Gene") #Convert the gene column to rownames
head(counts_rawdatafiltered, 2)
## CALR_KD1 CALR_KD2 CALR_KD3 Ctrl_1 Ctrl_2 Ctrl_3
## ENSG00000164054.15 12431 8441 15423 10959 11509 12096
## ENSG00000140830.8 852 1082 1177 959 1687 1773
condition <- c("knockdown", "knockdown" , "knockdown", "control", "control", "control")
sample <- c("CALR_KD1", "CALR_KD2", "CALR_KD3", "Ctrl_1", "Ctrl_2", "Ctrl_3")
combinedata <- data.frame(sample, condition)
metadata <- column_to_rownames(combinedata, var = "sample")
metadata
## condition
## CALR_KD1 knockdown
## CALR_KD2 knockdown
## CALR_KD3 knockdown
## Ctrl_1 control
## Ctrl_2 control
## Ctrl_3 control
#Preparing the Data for DESeq
#check if the rownames and column names fo the two data sets match using the below code.
all(rownames(metadata)== colnames(counts_rawdatafiltered))
## [1] TRUE
# Create matrix
dds <- DESeqDataSetFromMatrix(countData= counts_rawdatafiltered,
colData = metadata,
design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
#Prefiltering
#For each gene, we count the total number of reads for that gene in all samples
#and remove those that don't have at least 10 read.
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
nrow(dds)
## [1] 14837
vsd_dds <- vst(dds, blind = FALSE)
plotPCA(vsd_dds, intgroup="condition")
vsd_mat_dds <- assay(vsd_dds)
# Compute the correlation values between samples
vsd_cor_dds <- cor(vsd_mat_dds)
# Plot the heatmap and correlation matrix
pheatmap(vsd_cor_dds, annotation= select(metadata, condition))
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
corrplot(vsd_cor_dds, order = 'hclust',
addrect = 2, addCoef.col = 'white',
number.cex = 0.7)
library(vsn)
meanSdPlot(assay(vsd_dds))
plot(assay(vsd_dds)[,1],assay(vsd_dds)[,2])
#Plot the distribution of vst transformed data
boxplot(assay(vsd_dds))
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Extract results
dds_res <- results(dds,
contrast = c("condition", "knockdown", "control"),
alpha = 0.05)
dds_res
## log2 fold change (MLE): condition knockdown vs control
## Wald test p-value: condition knockdown vs control
## DataFrame with 14837 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000164054.15 11792.8951 0.1418566 0.258044 0.549738 0.5824992
## ENSG00000140830.8 1238.2222 -0.4406657 0.264576 -1.665553 0.0958026
## ENSG00000180613.10 11.0412 -1.0740692 0.735226 -1.460869 0.1440515
## ENSG00000204444.10 14.9628 0.2522090 0.729693 0.345637 0.7296155
## ENSG00000163249.9 1221.7654 0.0776569 0.174581 0.444820 0.6564499
## ... ... ... ... ... ...
## ENSG00000175155.8 206.8833 -1.0560348 0.388279 -2.7197820 0.0065325
## ENSG00000144152.12 40.3088 0.7705785 0.660688 1.1663268 0.2434824
## ENSG00000126003.6 2275.0201 -0.2231200 0.188321 -1.1847834 0.2361031
## ENSG00000114948.12 33.0136 1.0950184 0.552377 1.9823736 0.0474374
## ENSG00000108387.14 35.2671 0.0379507 0.585160 0.0648551 0.9482894
## padj
## <numeric>
## ENSG00000164054.15 0.810884
## ENSG00000140830.8 0.374877
## ENSG00000180613.10 0.445800
## ENSG00000204444.10 0.886638
## ENSG00000163249.9 0.851705
## ... ...
## ENSG00000175155.8 0.0957145
## ENSG00000144152.12 0.5549530
## ENSG00000126003.6 0.5478135
## ENSG00000114948.12 0.2703644
## ENSG00000108387.14 0.9823831
#Consider all genes with an adjusted p-value below 5% as significant. The number of significant genes are…
alpha = 0.05
sum(dds_res$padj < alpha, na.rm = TRUE)
## [1] 540
#Select the signicant genes (alpha = 0.05) and subset the dds_res table to these genes. Sort it by the log2-fold-change estimate to get the signicant genes that are downregulated or upregulated.
sigUpReg = dds_res[ which(dds_res$padj < 0.05 & dds_res$log2FoldChange >= 1), ]
sigDownReg = dds_res[ which(dds_res$padj < 0.05 & dds_res$log2FoldChange < 0), ]
nrow(sigUpReg)
## [1] 89
nrow(sigDownReg)
## [1] 239
#Save resulting genes
#write.csv(sigDownReg, file = "Downreg.csv")
#write.csv(sigUpReg, file = "Upreg.csv")
#MA plot
DESeq2::plotMA(dds_res, alpha = 0.05)
#Volcano plot
library(EnhancedVolcano)
## Loading required package: ggrepel
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
p1 <- EnhancedVolcano(dds_res,
lab = rownames(dds_res),
x = 'log2FoldChange',
y = 'pvalue')
p1
#Gene clustering of most highly differentially expressed
#Transform the normalized counts using the varianceStabilizingTransformation() function and create a variable vsnd from it.
vsnd1 = varianceStabilizingTransformation(dds, blind = FALSE)
normData1 <- data.frame(assay(vsnd1))
normData <- rownames_to_column(normData1, var = "EnsemblId")
head(normData)
## EnsemblId CALR_KD1 CALR_KD2 CALR_KD3 Ctrl_1 Ctrl_2 Ctrl_3
## 1 ENSG00000164054.15 13.73161 13.013617 13.921986 13.445073 13.561657 13.381866
## 2 ENSG00000140830.8 9.98936 10.153466 10.308779 10.047707 10.853488 10.681577
## 3 ENSG00000180613.10 6.07140 6.081917 6.128672 6.347180 6.386531 6.292350
## 4 ENSG00000204444.10 6.59199 6.081917 6.369578 6.373158 6.274825 6.267157
## 5 ENSG00000163249.9 10.29084 10.520358 10.361358 10.277747 10.298755 10.386425
## 6 ENSG00000265808.3 12.91310 13.578031 13.247957 13.105753 13.156597 13.085032
nrow(normData)
## [1] 14837
topUpReg = dds_res[ which(dds_res$padj < 0.05 & dds_res$log2FoldChange >= 1.5), ]
topDownReg = dds_res[ which(dds_res$padj < 0.05 & dds_res$log2FoldChange <= -2), ]
head(topUpReg)
## log2 fold change (MLE): condition knockdown vs control
## Wald test p-value: condition knockdown vs control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000106538.9 653.5974 1.55374 0.296835 5.23437 1.65551e-07
## ENSG00000100628.11 65.8188 1.75108 0.453552 3.86080 1.13016e-04
## ENSG00000151651.15 6005.1314 1.70057 0.364422 4.66647 3.06413e-06
## ENSG00000085117.11 4299.8378 1.98091 0.299199 6.62072 3.57454e-11
## ENSG00000166920.10 9667.5072 1.92461 0.297004 6.48007 9.16792e-11
## ENSG00000198074.9 129.3626 2.14366 0.492074 4.35637 1.32237e-05
## padj
## <numeric>
## ENSG00000106538.9 1.07561e-04
## ENSG00000100628.11 7.68956e-03
## ENSG00000151651.15 6.34925e-04
## ENSG00000085117.11 9.77707e-08
## ENSG00000166920.10 1.79115e-07
## ENSG00000198074.9 1.86441e-03
frametopup1 <- data.frame(topUpReg)
frametopup <- rownames_to_column(frametopup1, var = "EnsemblId")
frametopdown1 <- data.frame(topDownReg)
frametopdown <- rownames_to_column(frametopdown1, var = "EnsemblId")
#Gene clustering for top20 downregulated genes
dp <- frametopdown %>%
arrange(log2FoldChange) %>% #arrange in descending order by log2foldchange
slice_head(n=20) %>% #select the top20
inner_join(normData)
## Joining, by = "EnsemblId"
dp
## EnsemblId baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000165685.8 7.232556 -6.270170 1.9180432 -3.269045 1.079110e-03
## 2 ENSG00000072041.16 15.491299 -5.469519 1.3629868 -4.012892 5.997926e-05
## 3 ENSG00000180447.6 28.453859 -5.367528 1.2458117 -4.308459 1.643963e-05
## 4 ENSG00000120149.8 6.681811 -5.158917 1.5897694 -3.245072 1.174207e-03
## 5 ENSG00000173391.8 65.269700 -5.112772 0.8918281 -5.732912 9.872068e-09
## 6 ENSG00000176399.3 6.445924 -5.103618 1.5621360 -3.267076 1.086644e-03
## 7 ENSG00000158104.11 13.023045 -4.611436 1.3301593 -3.466830 5.266355e-04
## 8 ENSG00000175426.10 10.009066 -4.232778 1.2769620 -3.314725 9.173322e-04
## 9 ENSG00000171246.5 18.358238 -4.105922 1.2121321 -3.387355 7.056990e-04
## 10 ENSG00000123364.4 21.422815 -3.903923 1.1698553 -3.337099 8.465769e-04
## 11 ENSG00000095777.14 7.660633 -3.790100 1.2148252 -3.119873 1.809291e-03
## 12 ENSG00000135111.14 32.062158 -3.649475 0.8905502 -4.098001 4.167336e-05
## 13 ENSG00000086300.15 71.667656 -3.534323 0.7354434 -4.805704 1.542077e-06
## 14 ENSG00000168453.14 28.743978 -3.456161 0.8371243 -4.128611 3.649606e-05
## 15 ENSG00000106823.12 19.985250 -3.452797 1.0379208 -3.326648 8.789727e-04
## 16 ENSG00000179388.8 17.695323 -3.417640 1.0820188 -3.158577 1.585412e-03
## 17 ENSG00000138685.12 30.857582 -3.397349 1.0169054 -3.340870 8.351619e-04
## 18 ENSG00000198947.14 15.315147 -3.374653 0.9459212 -3.567584 3.602879e-04
## 19 ENSG00000203995.9 22.814742 -3.361731 0.8222150 -4.088627 4.339334e-05
## 20 ENSG00000099810.18 75.827742 -3.329684 0.8182871 -4.069090 4.719715e-05
## padj CALR_KD1 CALR_KD2 CALR_KD3 Ctrl_1 Ctrl_2 Ctrl_3
## 1 3.448109e-02 5.533139 5.533139 5.533139 6.804519 5.964475 5.731280
## 2 5.126727e-03 5.753944 5.533139 5.744999 7.162315 6.360161 6.185251
## 3 2.121022e-03 5.533139 5.533139 5.955728 7.782181 6.332826 6.340176
## 4 3.629421e-02 5.533139 5.533139 5.744999 6.534360 6.060445 6.185251
## 5 1.350104e-05 5.533139 5.949040 6.090696 8.548878 6.915554 6.934566
## 6 3.464088e-02 5.533139 5.533139 5.744999 6.422599 6.274825 6.090535
## 7 2.273594e-02 5.753944 5.533139 5.832487 7.092132 6.177193 6.054935
## 8 3.128538e-02 5.533139 5.533139 5.899437 6.705439 6.274825 6.267157
## 9 2.668839e-02 5.533139 5.533139 6.049790 7.249795 6.483931 6.185251
## 10 3.022920e-02 5.533139 5.533139 6.128672 7.360533 6.360161 6.467712
## 11 4.767760e-02 5.845102 5.533139 5.744999 6.398273 6.274825 6.292350
## 12 3.957811e-03 5.845102 5.533139 6.259993 7.470954 6.705850 6.958390
## 13 4.055664e-04 5.845102 5.949040 6.628693 8.170030 7.367327 7.561201
## 14 3.539859e-03 6.071400 5.827733 6.049790 7.556093 6.741081 6.385065
## 15 3.084195e-02 5.533139 5.741629 6.164247 7.162315 6.332826 6.660498
## 16 4.415905e-02 5.753944 5.741629 6.090696 5.957575 6.986237 6.859748
## 17 2.989967e-02 5.533139 5.741629 6.343705 7.514238 6.870802 6.560579
## 18 1.753487e-02 5.753944 5.827733 6.005190 7.004400 6.360161 6.362956
## 19 4.031139e-03 5.845102 5.893630 6.090696 7.291242 6.483931 6.595118
## 20 4.246502e-03 5.845102 5.949040 6.751151 8.403372 7.367327 7.274991
setwd("D:/MScDataScience/dataHPC/finalExams/newFinal")
dp2 <- dp[, c(1, 8:13)]
dp3 <- column_to_rownames(dp2, var = "EnsemblId")
dp4 <- as.matrix(dp3)
#This was saved in excel and the ensemblid was converted to geneid using online tool
top20downregGI <- read.csv("D:/MScDataScience/dataHPC/finalExams/newFinal/top20downregGI.csv", row.names=1)
pheatmap(top20downregGI, main = "top20 downregulated genes")
#top20 upregulated genes
xp <- frametopup %>%
arrange(desc(log2FoldChange)) %>% #arrange in descending order by log2foldchange
slice_head(n=20) %>% #select the top20
inner_join(normData)
## Joining, by = "EnsemblId"
xp
## EnsemblId baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000118785.13 232.96335 4.899105 0.6262672 7.822708 5.169909e-15
## 2 ENSG00000172543.7 141.29906 2.990233 0.6902887 4.331858 1.478565e-05
## 3 ENSG00000162490.6 55.51583 2.702972 0.7009670 3.856061 1.152287e-04
## 4 ENSG00000155465.18 118.70450 2.517608 0.6767117 3.720355 1.989431e-04
## 5 ENSG00000174792.10 63.40655 2.452229 0.7074316 3.466383 5.275109e-04
## 6 ENSG00000182578.13 26.35348 2.346967 0.7347107 3.194410 1.401170e-03
## 7 ENSG00000198074.9 129.36256 2.143655 0.4920737 4.356370 1.322373e-05
## 8 ENSG00000198768.10 480.29818 2.120320 0.2594254 8.173139 3.004681e-16
## 9 ENSG00000085117.11 4299.83783 1.980912 0.2991989 6.620720 3.574536e-11
## 10 ENSG00000164220.6 61.15556 1.965413 0.6297118 3.121131 1.801577e-03
## 11 ENSG00000166920.10 9667.50724 1.924606 0.2970039 6.480071 9.167916e-11
## 12 ENSG00000119714.10 556.00807 1.872595 0.3752716 4.989972 6.038811e-07
## 13 ENSG00000162745.10 157.29404 1.847918 0.4376771 4.222104 2.420325e-05
## 14 ENSG00000100628.11 65.81877 1.751076 0.4535523 3.860802 1.130156e-04
## 15 ENSG00000151651.15 6005.13143 1.700567 0.3644223 4.666474 3.064129e-06
## 16 ENSG00000140835.9 130.36162 1.692689 0.4025525 4.204890 2.612096e-05
## 17 ENSG00000156966.6 71.05413 1.664583 0.5017078 3.317834 9.071842e-04
## 18 ENSG00000100292.16 2233.66497 1.641862 0.3082622 5.326187 1.002960e-07
## 19 ENSG00000172137.18 2993.95742 1.566777 0.3568387 4.390714 1.129792e-05
## 20 ENSG00000106538.9 653.59741 1.553743 0.2968350 5.234367 1.655513e-07
## padj CALR_KD1 CALR_KD2 CALR_KD3 Ctrl_1 Ctrl_2 Ctrl_3
## 1 2.356789e-11 9.964412 8.753401 7.498185 6.398273 6.243886 6.385065
## 2 2.002065e-03 9.147288 7.544766 7.984197 7.195882 6.386531 6.185251
## 3 7.801326e-03 8.149491 6.795622 6.998457 6.575254 6.211419 6.155376
## 4 1.159004e-02 9.061225 6.810735 7.673541 6.880289 6.650467 6.721632
## 5 2.273594e-02 6.633279 7.039244 8.340253 6.575254 6.436711 6.340176
## 6 4.042702e-02 6.897005 6.976887 6.783492 6.422599 6.060445 5.813134
## 7 1.864409e-03 7.296693 8.665532 8.331130 6.850640 6.824062 7.122431
## 8 2.054601e-12 9.442899 9.889773 9.919975 7.816340 8.106530 8.184009
## 9 9.777070e-08 13.033749 12.314834 12.849444 11.303215 10.623812 10.434697
## 10 4.767760e-02 6.881396 8.137804 6.998457 6.347180 6.631259 6.691551
## 11 1.791149e-07 14.088431 13.162200 14.258728 12.235831 11.938663 11.822210
## 12 1.964214e-04 9.890177 9.007595 10.478403 8.463408 8.333711 8.219176
## 13 2.669384e-03 8.966453 7.969740 7.930050 7.426128 7.065486 7.048841
## 14 7.689557e-03 7.859695 7.259386 7.324317 6.850640 6.506568 6.611903
## 15 6.349246e-04 13.861064 12.403414 12.879060 11.880983 11.328672 11.234311
## 16 2.815164e-03 8.608598 7.673661 8.001773 7.270697 7.078168 6.981705
## 17 3.117249e-02 7.987009 7.552716 6.945224 6.865569 6.650467 6.676151
## 18 8.068521e-05 11.098336 12.018644 11.995648 9.770352 10.404757 10.339407
## 19 1.661402e-03 11.146185 12.638362 12.306105 10.318278 10.833832 10.735913
## 20 1.075612e-04 10.319383 9.764876 10.034726 9.111072 8.473944 8.450417
colnames(xp)
## [1] "EnsemblId" "baseMean" "log2FoldChange" "lfcSE"
## [5] "stat" "pvalue" "padj" "CALR_KD1"
## [9] "CALR_KD2" "CALR_KD3" "Ctrl_1" "Ctrl_2"
## [13] "Ctrl_3"
xp2 <- xp[, c(1, 8:13)]
xp3 <- column_to_rownames(xp2, var = "EnsemblId")
xp4 <- as.matrix(xp3) #This variable was saved into an Excel file, ensembl gene id was converted to geneid using an online tool
top20upregGI <- read.csv("D:/MScDataScience/dataHPC/finalExams/newFinal/top20upregGI.csv", row.names=1)
pheatmap(top20upregGI, main = "top20 upregulated gene")
#Gene Ontology for downregulated genes
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
GOdowngenesx <- read_excel("GOdowngenesx.xlsx")
ggplot(head(GOdowngenesx, 15), aes(x=reorder(Term, -PValue), y=Count, fill=-PValue)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_continuous(low="blue", high="red") +
labs(x = "", y = "", fill = "PValue") +
theme(axis.text=element_text(size=11)) + ylab("Count") + ggtitle("GO downregulated Genes")
Pathwaydown <- read_excel("Pathwaydown.xlsx")
ggplot(Pathwaydown, aes(x=reorder(Term, -PValue), y=Count, fill=-PValue)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_continuous(low="blue", high="red") +
labs(x = "", y = "", fill = "PValue") +
theme(axis.text=element_text(size=11)) + ylab("Count") + ggtitle("Pathway of downregulated Genes")
#Gene Ontology upregulated genes
upregGO <- read_excel("upregGO.xlsx")
ggplot(head(upregGO, 15), aes(x=reorder(Term, -PValue), y=Count, fill=-PValue)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_continuous(low="blue", high="red") +
labs(x = "", y = "", fill = "PValue") +
theme(axis.text=element_text(size=11)) + ylab("Count") + ggtitle("GO upregulated Genes")
#Pathway analysis for upregulated genes
upregpath <- read_excel("upregpath.xlsx")
ggplot(upregpath, aes(x=reorder(Term, -PValue), y=Count, fill=-PValue)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_continuous(low="blue", high="red") +
labs(x = "", y = "", fill = "PValue") +
theme(axis.text=element_text(size=11)) + ylab("Count") + ggtitle("Pathway analysis for upregulated Genes")
```