covid19_sum_ob <- readRDS("F:/BAI HC/Research 2/Covid19/covid19_sum_ob.RDS")
library(SummarizedExperiment)
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
#The count table in the covid19_sum_ob was data frame class,
#so I have to create the SummarizedExperiment again with
# count table in matrix class
count_matrix <- as.matrix(assay(covid19_sum_ob))
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::collapse() masks IRanges::collapse()
## x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count() masks matrixStats::count()
## x dplyr::desc() masks IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x dplyr::lag() masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename() masks S4Vectors::rename()
## x dplyr::slice() masks IRanges::slice()
patient_info <- read_csv("F:/BAI HC/Research 2/Covid19/SraRunTable.txt")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## AvgSpotLen = col_double(),
## Bases = col_double(),
## Bytes = col_double(),
## ReleaseDate = col_datetime(format = "")
## )
## i Use `spec()` for the full column specifications.
rownames(patient_info) <- patient_info$Run
## Warning: Setting row names on a tibble is deprecated.
# Rename column name
patient_info <- patient_info %>%
rename(library_name = `Library Name`,
sample_name = `Sample Name`)
library_name <- patient_info$library_name
# Search for index of sample groups
case_index <- grep("case", library_name)
control_index <- grep("control", library_name)
case_colon_index <-grep("case_colon", library_name)
control_colon_index<- grep("control_colon", library_name)
case_index
## [1] 1 2 7 8 11 12 13 14 15 22 23 24 25 26 27 28 29 30
control_index
## [1] 3 4 5 6 9 10 16 17 18 19 20 21 31 32 33 34 35 36 37 38
case_colon_index
## [1] 7 8 24 25 26 27 28 29 30
control_colon_index
## [1] 6 9 31 32 33 34 35 36 37 38
# Recode library name into groups
patient_info$condition <- c(rep("", 38))
patient_info$condition[case_index] <- c("covid19_case")
patient_info$condition[control_index] <- c("covid19_control")
patient_info$condition[case_colon_index] <- c("colon_case")
patient_info$condition[control_colon_index] <- c("colon_control")
# Make the SummarizedExperiment object again
covid19_sum_again <-SummarizedExperiment(count_matrix, colData = patient_info )
covid19_sum_again
## class: SummarizedExperiment
## dim: 60683 38
## metadata(0):
## assays(1): ''
## rownames(60683): ENSG00000223972 ENSG00000227232 ... ENSG00000277475
## ENSG00000268674
## rowData names(0):
## colnames(38): SRR12816720 SRR12816722 ... SRR12951228 SRR12951229
## colData names(30): Run AGE ... Tissue condition
library(DESeq2)
# Create a dds object
covid19_dds <-DESeqDataSetFromMatrix(countData = count_matrix,
colData = patient_info,
design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# Check row number of the count matrix
nrow(covid19_dds)
## [1] 60683
# Remove rows with no information in the count matrix
# Extract index of rows having information
index <- rowSums(counts(covid19_dds)) >1
covid19_dds <- covid19_dds[index,]
nrow(covid19_dds)
## [1] 49434
# VST transformation for exploring data
# Use blind=TRUE to transform the counts in a unbiased manner
vsd <- varianceStabilizingTransformation(covid19_dds, blind = TRUE,
fitType = "mean")
vsd_covid19<-assay(vsd)
head(vsd_covid19)
## SRR12816720 SRR12816722 SRR12816727 SRR12816730 SRR12816733
## ENSG00000227232 -4.443747 -4.44374665 -4.4437467 -4.443747 -4.443747
## ENSG00000268020 -4.443747 -4.44374665 -4.4437467 -4.443747 -4.443747
## ENSG00000240361 -4.443747 -4.44374665 -4.4437467 -4.443747 -4.443747
## ENSG00000186092 -4.443747 -4.44374665 -0.4255445 -4.443747 -4.443747
## ENSG00000238009 -4.443747 -0.02329052 -4.4437467 -4.443747 -4.443747
## ENSG00000233750 -4.443747 -4.44374665 2.4156054 -4.443747 -4.443747
## SRR12951211 SRR12951215 SRR12951216 SRR12951224 SRR12816718
## ENSG00000227232 -4.443747 -4.443747 -4.443747 -0.2521745 -4.44374665
## ENSG00000268020 -4.443747 -4.443747 -4.443747 -4.4437467 2.31922385
## ENSG00000240361 -4.443747 -4.443747 -4.443747 -4.4437467 2.31922385
## ENSG00000186092 -4.443747 -4.443747 -4.443747 -4.4437467 -4.44374665
## ENSG00000238009 1.931892 -4.443747 -4.443747 -0.2521745 0.09728767
## ENSG00000233750 -4.443747 -4.443747 -4.443747 -4.4437467 -4.44374665
## SRR12816719 SRR12816721 SRR12816723 SRR12816724 SRR12816725
## ENSG00000227232 -0.2728182 -4.443747 -4.443747 -4.4437467 -4.443747
## ENSG00000268020 -4.4437467 -4.443747 -4.443747 -4.4437467 -4.443747
## ENSG00000240361 -4.4437467 -4.443747 -4.443747 -4.4437467 -4.443747
## ENSG00000186092 -4.4437467 -4.443747 -4.443747 -4.4437467 -4.443747
## ENSG00000238009 -4.4437467 -4.443747 -4.443747 -0.2362527 -4.443747
## ENSG00000233750 -4.4437467 -4.443747 -4.443747 -4.4437467 -4.443747
## SRR12816726 SRR12816728 SRR12816729 SRR12816731 SRR12816732
## ENSG00000227232 -4.443747 -4.4437467 -4.443747 -4.443747 -4.4437467
## ENSG00000268020 -4.443747 -4.4437467 -4.443747 -4.443747 -4.4437467
## ENSG00000240361 -4.443747 -4.4437467 -4.443747 -4.443747 -4.4437467
## ENSG00000186092 -4.443747 -4.4437467 -4.443747 -4.443747 -4.4437467
## ENSG00000238009 -4.443747 0.1668512 -4.443747 -4.443747 0.6039478
## ENSG00000233750 -4.443747 0.1668512 -4.443747 -4.443747 -4.4437467
## SRR12816734 SRR12816735 SRR12816736 SRR12951212 SRR12951213
## ENSG00000227232 -4.4437467 -4.443747 -4.443747 -4.443747 -4.443747
## ENSG00000268020 -4.4437467 -4.443747 -4.443747 -4.443747 -4.443747
## ENSG00000240361 -4.4437467 -4.443747 -4.443747 -4.443747 -4.443747
## ENSG00000186092 -4.4437467 -4.443747 -4.443747 -4.443747 -4.443747
## ENSG00000238009 0.4922409 -4.443747 -4.443747 -4.443747 -4.443747
## ENSG00000233750 -4.4437467 -4.443747 -4.443747 -4.443747 -4.443747
## SRR12951214 SRR12951217 SRR12951218 SRR12951219 SRR12951220
## ENSG00000227232 -4.443747 -4.443747 -4.4437467 -4.443747 -4.4437467
## ENSG00000268020 -4.443747 -4.443747 -4.4437467 -4.443747 -4.4437467
## ENSG00000240361 -4.443747 -4.443747 -4.4437467 -4.443747 -4.4437467
## ENSG00000186092 -4.443747 -4.443747 -4.4437467 -4.443747 -4.4437467
## ENSG00000238009 -4.443747 -4.443747 0.7351513 -4.443747 -0.1800744
## ENSG00000233750 -4.443747 -4.443747 -4.4437467 -4.443747 -0.1800744
## SRR12951221 SRR12951222 SRR12951223 SRR12951225 SRR12951226
## ENSG00000227232 -4.443747 -4.443747 -4.443747 -4.443747 -4.4437467
## ENSG00000268020 -4.443747 -4.443747 1.662822 -4.443747 -4.4437467
## ENSG00000240361 -4.443747 -4.443747 1.098281 -4.443747 0.1670276
## ENSG00000186092 -4.443747 -4.443747 1.662822 -4.443747 -4.4437467
## ENSG00000238009 -4.443747 -4.443747 1.662822 -4.443747 1.1086015
## ENSG00000233750 -4.443747 -4.443747 -4.443747 -4.443747 -4.4437467
## SRR12951227 SRR12951228 SRR12951229
## ENSG00000227232 -4.443747 -4.4437467 0.88346276
## ENSG00000268020 1.443775 -4.4437467 -4.44374665
## ENSG00000240361 -4.443747 -4.4437467 -4.44374665
## ENSG00000186092 -4.443747 -4.4437467 -4.44374665
## ENSG00000238009 2.419509 -4.4437467 -0.04880244
## ENSG00000233750 -4.443747 0.5045294 -4.44374665
library(vsn)
meanSdPlot(vsd_covid19)
All the dots do not distribute evenly, indicating no homoscadacity in the data.Therefore, I filtered further to keep only genes with equal or greater than 10 counts for at least 9 samples.
# Filter more
keep <- rowSums(counts(covid19_dds) >= 10) >= 9
covid19_dds_2 <- covid19_dds[keep,]
nrow(covid19_dds_2)
## [1] 25671
# vst transformation again
vsd_2 <- varianceStabilizingTransformation(covid19_dds_2, blind = TRUE,
fitType = "mean")
vsd_covid19_2<-assay(vsd_2)
meanSdPlot(vsd_covid19_2)
The plot looks better but homoscadacity of the counts is still not perfect. Still I want to see if the curreny transformation can capture variance of the sample group yet?
plotPCA(vsd_2, intgroup = c("condition"))
The PCA plot explained 67% variance of all the sample which is okay. PC1 accounting for 54% variance separates the sample groups quite good. Still we see that samples in the two control group (covid19_control and colon_control) clusteringvery cloesly to one anpther while samples in the two cases group quite vary within their group even though they are separated from their control group.
# Sample similarity
sample_dist <- dist(t(assay(vsd_2)))
library("pheatmap")
library("RColorBrewer")
# Distance heat map plot
sampleDistMatrix <- as.matrix( sample_dist )
rownames(sampleDistMatrix) <- paste(vsd_2$condition)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sample_dist,
clustering_distance_cols = sample_dist,
col = colors)
The distance heatmap shows that covid19_case sample is so varying. They formed three groups with covid19_case group that eitehr close to colon_control or covid19_control.