library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
count_matrix <-readRDS("F:/BAI HC/Research 2/Covid19/count_matrix_file.RDS")
patient_info <- readRDS("F:/BAI HC/Research 2/Covid19/patient_info_0404.RDS")
# Extract index of only case samples
case_index <- grep("case", patient_info$condition)
patient_info_short <- patient_info[case_index,]
# Extract name of only cases samples
case<- patient_info_short$Run
# Filtered only the case samples in the count matrix
case_count_matrix <- count_matrix[, case]
# Make dds object with only the case samples
case_dds <- DESeqDataSetFromMatrix(countData = case_count_matrix,
colData = patient_info_short,
design = ~ Tissue + sex)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# Relevel of the condition column
case_dds$condition <-factor(case_dds$condition, levels = c("female", "male"))
# Filter for gene having at least 10 counts in at least 4 samples
# because we have 4 female samples in each group
keep2 <- rowSums(counts(case_dds) >= 10) >= 4
case_dds_2 <- case_dds[keep2,]
nrow(case_dds_2)
## [1] 23977
# Run differential expression analysis on the case samples
case_dds_2 <- DESeq(case_dds_2)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
male_female_res_new <- results(case_dds_2)
male_female_res_new
## log2 fold change (MLE): sex male vs female
## Wald test p-value: sex male vs female
## DataFrame with 23977 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000237094 17.3999 -1.3848871 0.795109 -1.7417579 0.0815508
## ENSG00000225972 901.7937 -0.6595375 0.563477 -1.1704785 0.2418085
## ENSG00000225630 40105.9498 -0.0162836 0.453199 -0.0359303 0.9713380
## ENSG00000237973 1901.3343 0.1634245 0.536129 0.3048229 0.7605011
## ENSG00000229344 594.6672 1.2589064 0.550746 2.2858209 0.0222647
## ... ... ... ... ... ...
## ENSG00000277196 3.57492 -1.307132 1.967531 -0.664351 0.506465
## ENSG00000278384 37.44880 -1.068724 1.093659 -0.977200 0.328470
## ENSG00000277856 9.00393 0.788079 1.649389 0.477801 0.632792
## ENSG00000275063 21.10331 -1.636931 1.210471 -1.352310 0.176276
## ENSG00000271254 34.68263 0.566705 0.736242 0.769727 0.441462
## padj
## <numeric>
## ENSG00000237094 0.666527
## ENSG00000225972 0.816172
## ENSG00000225630 0.996563
## ENSG00000237973 0.977743
## ENSG00000229344 0.460710
## ... ...
## ENSG00000277196 NA
## ENSG00000278384 0.862340
## ENSG00000277856 0.953323
## ENSG00000275063 0.768724
## ENSG00000271254 0.906800
summary(male_female_res_new)
##
## out of 23977 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 50, 0.21%
## LFC < 0 (down) : 66, 0.28%
## outliers [1] : 1150, 4.8%
## low counts [2] : 1822, 7.6%
## (mean count < 8)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:matrixStats':
##
## count
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Sort the DE gene according to their log fold changes
male_female_sig_sorted <- data.frame(gene= rownames(male_female_res_new),
male_female_res_new,
stringsAsFactors = F) %>%
na.omit() %>%
filter(padj <0.1) %>%
arrange(desc(abs(log2FoldChange)))
head(male_female_sig_sorted, 3)
## gene baseMean log2FoldChange lfcSE stat
## ENSG00000229807 ENSG00000229807 9983.99617 -11.657163 0.7760656 -15.020848
## ENSG00000115386 ENSG00000115386 62.81085 -9.050916 1.4413598 -6.279429
## ENSG00000228411 ENSG00000228411 51.48506 9.042568 1.1582743 7.806931
## pvalue padj
## ENSG00000229807 5.361702e-51 1.126225e-46
## ENSG00000115386 3.398187e-10 5.490687e-07
## ENSG00000228411 5.859740e-15 1.538548e-11
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
keytypes(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG" "UNIGENE"
## [26] "UNIPROT"
# Extract the top 20 DE genes
top_20genes <- male_female_sig_sorted$gene[1:20]
# Search the gene name in Ensembl
top20_table <-AnnotationDbi::select(org.Hs.eg.db, keys = top_20genes, keytype = "ENSEMBL",
columns = c("ENSEMBL", "SYMBOL"))
## 'select()' returned 1:1 mapping between keys and columns
top20_table$log2FoldChange <-male_female_sig_sorted$log2FoldChange[1:20]
| ENSEMBL | SYMBOL | log2FoldChange |
|---|---|---|
| ENSG00000229807 | XIST | -11.657163 |
| ENSG00000115386 | REG1A | -9.050916 |
| ENSG00000228411 | NA | 9.042568 |
| ENSG00000260197 | NA | 9.041199 |
| ENSG00000176728 | TTTY14 | 8.696705 |
| ENSG00000168878 | SFTPB | -8.565388 |
| ENSG00000067646 | ZFY | 8.481059 |
| ENSG00000154620 | TMSB4Y | 8.344616 |
| ENSG00000288049 | LOC105377224 | 8.330834 |
| ENSG00000288057 | NA | 7.884468 |
| ENSG00000122852 | SFTPA1 | -7.099688 |
| ENSG00000252633 | RN7SKP282 | 6.936400 |
| ENSG00000185303 | SFTPA2 | -6.905568 |
| ENSG00000267793 | NA | 6.844151 |
| ENSG00000108342 | CSF3 | -6.822419 |
| ENSG00000129824 | RPS4Y1 | 6.820551 |
| ENSG00000215580 | BCORP1 | 6.411762 |
| ENSG00000099725 | PRKY | 6.403907 |
| ENSG00000186471 | AKAP14 | 6.336585 |
| ENSG00000125740 | FOSB | -6.201905 |
Some genes do not have annotation in org.HS.eg database, so I filled them by hand.
top20_table$SYMBOL[3]<- c("CDY4P")
top20_table$SYMBOL[4]<- c("AC010889.1")
top20_table$SYMBOL[10]<- c("AC010086.3")
top20_table$SYMBOL[14]<- c("AC009977.1")
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.6.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
vsd<-varianceStabilizingTransformation(case_dds_2)
case_vsd <- assay(vsd)
# Extract the vst transformation count for only top 20 de genes
top20_index <-match(top20_table$ENSEMBL, rownames(case_vsd))
top20_matrix_vsd <- case_vsd[top20_index,]
Gender <- patient_info_short$sex
Tissue <- patient_info_short$Tissue
gender_hm_name <- HeatmapAnnotation(df=data.frame(Tissue, Gender),
col = list(Tissue = c("lung" = "purple","colon" = "light blue"),
Gender= c("male" = "pink", "female" = "green")))
rownames(top20_matrix_vsd) <- top20_table$SYMBOL
# Check if column names of top20_matrix is the same as
#row names of patient_info_short
colnames(top20_matrix_vsd) == rownames(patient_info_short)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE
# Then we can annotate column name of top20_matrix_vsd by their real sample name
colnames(top20_matrix_vsd)<- patient_info_short$sample_name
# Scale the transformed count for heatmap
top20_mat_scale<-t(apply(top20_matrix_vsd,1,scale))
colnames(top20_mat_scale) <- colnames(top20_matrix_vsd)
pheatmap(top20_mat_scale,
cellwidth = 8, cellheight = 8,
cluster_cols = TRUE,
annotation_names_col = T,
show_colnames = T,
top_annotation=gender_hm_name)
# gender dds object for the whole data set
covid19_gender_dds <- DESeqDataSetFromMatrix(countData = count_matrix,
colData = patient_info,
design = ~ Tissue + sex)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
# Filter the dataset to make it less heavy
keep2_all <- rowSums(counts(covid19_gender_dds) >= 10) >= 4
covid19_gender_dds_2 <- covid19_gender_dds[keep2_all,]
nrow(covid19_gender_dds_2)
## [1] 29824
gender_vsd<-varianceStabilizingTransformation(covid19_gender_dds_2)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
gender_vsd_mat <- assay(gender_vsd)
# Extract the vst transformation count for only top 20 de genes
top20_index_2 <-match(top20_table$ENSEMBL, rownames(gender_vsd_mat))
top20_matrix_vsd_2 <- gender_vsd_mat[top20_index_2,]
library(ComplexHeatmap)
Gender_whole <- patient_info$sex
Tissue_whole <- patient_info$Tissue
gender_hm_whole <- HeatmapAnnotation(df=data.frame(Tissue_whole, Gender_whole),
col = list(Tissue = c("lung" = "pink","colon" = "light blue"),
Gender= c("male" = "black", "female" = "brown",
"not collected" = "green")))
rownames(top20_matrix_vsd_2) <- top20_table$SYMBOL
# Check if column names of top20_matrix is the same as
#row names of patient_info
colnames(top20_matrix_vsd_2) == rownames(patient_info)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Then we can annotate column name of top20_matrix_vsd_2 by their real sample name
colnames(top20_matrix_vsd_2)<- patient_info$sample_name
whole_top20_mat_scale<-t(apply(top20_matrix_vsd_2,1,scale))
colnames(whole_top20_mat_scale) <- colnames(top20_matrix_vsd_2)
pheatmap(whole_top20_mat_scale,
cellwidth = 8, cellheight = 8,
cluster_cols = TRUE,
annotation_names_col = T,
show_colnames = T,
clustering_distance_cols = "correlation",
top_annotation=gender_hm_whole)
I am not sure why the heatmap show gender of the control samples different from your heatmap. I cannot fix it yet.
sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] ComplexHeatmap_2.6.2 knitr_1.33
## [3] org.Hs.eg.db_3.12.0 AnnotationDbi_1.52.0
## [5] dplyr_1.0.5 DESeq2_1.30.1
## [7] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [9] MatrixGenerics_1.2.1 matrixStats_0.58.0
## [11] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
## [13] IRanges_2.24.1 S4Vectors_0.28.1
## [15] BiocGenerics_0.36.1
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.2 sass_0.3.1 bit64_4.0.5
## [4] jsonlite_1.7.2 splines_4.0.5 bslib_0.2.4
## [7] assertthat_0.2.1 highr_0.9 blob_1.2.1
## [10] GenomeInfoDbData_1.2.4 yaml_2.2.1 pillar_1.6.0
## [13] RSQLite_2.2.7 lattice_0.20-41 glue_1.4.2
## [16] digest_0.6.27 RColorBrewer_1.1-2 XVector_0.30.0
## [19] colorspace_2.0-0 htmltools_0.5.1.1 Matrix_1.2-18
## [22] XML_3.99-0.6 pkgconfig_2.0.3 GetoptLong_1.0.5
## [25] genefilter_1.72.1 zlibbioc_1.36.0 purrr_0.3.4
## [28] xtable_1.8-4 scales_1.1.1 BiocParallel_1.24.1
## [31] tibble_3.1.1 annotate_1.68.0 generics_0.1.0
## [34] ggplot2_3.3.3 ellipsis_0.3.1 cachem_1.0.4
## [37] survival_3.2-11 magrittr_2.0.1 crayon_1.4.1
## [40] memoise_2.0.0 evaluate_0.14 fansi_0.4.2
## [43] Cairo_1.5-12.2 tools_4.0.5 GlobalOptions_0.1.2
## [46] lifecycle_1.0.0 stringr_1.4.0 munsell_0.5.0
## [49] locfit_1.5-9.4 cluster_2.1.2 DelayedArray_0.16.3
## [52] compiler_4.0.5 jquerylib_0.1.4 rlang_0.4.10
## [55] RCurl_1.98-1.3 circlize_0.4.12 rjson_0.2.20
## [58] bitops_1.0-7 rmarkdown_2.7 gtable_0.3.0
## [61] DBI_1.1.1 R6_2.5.0 fastmap_1.1.0
## [64] bit_4.0.4 utf8_1.2.1 clue_0.3-59
## [67] shape_1.4.5 stringi_1.5.3 Rcpp_1.0.6
## [70] png_0.1-7 vctrs_0.3.7 geneplotter_1.68.0
## [73] tidyselect_1.1.1 xfun_0.22