Untitled

library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

count_matrix <-readRDS("F:/BAI HC/Research 2/Covid19/count_matrix_file.RDS")

patient_info <- readRDS("F:/BAI HC/Research 2/Covid19/patient_info_0404.RDS")

# Extract index of only case samples

case_index <- grep("case", patient_info$condition)

patient_info_short <- patient_info[case_index,]

# Extract name of only cases samples

case<- patient_info_short$Run

# Filtered only the case samples in the count matrix

case_count_matrix <- count_matrix[, case]

# Make dds object with only the case samples

case_dds <- DESeqDataSetFromMatrix(countData = case_count_matrix,
                                   colData = patient_info_short,
                                   design = ~ Tissue + sex)

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

# Relevel of the condition column
case_dds$condition <-factor(case_dds$condition, levels = c("female", "male"))

# Filter for gene having at least 10 counts in at least 4 samples 
# because we have 4 female samples in each group

keep2 <- rowSums(counts(case_dds) >= 10) >= 4
case_dds_2 <- case_dds[keep2,]
nrow(case_dds_2)

## [1] 23977

# Run differential expression analysis on the case samples
case_dds_2 <- DESeq(case_dds_2)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

male_female_res_new <- results(case_dds_2)
male_female_res_new

## log2 fold change (MLE): sex male vs female 
## Wald test p-value: sex male vs female 
## DataFrame with 23977 rows and 6 columns
##                   baseMean log2FoldChange     lfcSE       stat    pvalue
##                  <numeric>      <numeric> <numeric>  <numeric> <numeric>
## ENSG00000237094    17.3999     -1.3848871  0.795109 -1.7417579 0.0815508
## ENSG00000225972   901.7937     -0.6595375  0.563477 -1.1704785 0.2418085
## ENSG00000225630 40105.9498     -0.0162836  0.453199 -0.0359303 0.9713380
## ENSG00000237973  1901.3343      0.1634245  0.536129  0.3048229 0.7605011
## ENSG00000229344   594.6672      1.2589064  0.550746  2.2858209 0.0222647
## ...                    ...            ...       ...        ...       ...
## ENSG00000277196    3.57492      -1.307132  1.967531  -0.664351  0.506465
## ENSG00000278384   37.44880      -1.068724  1.093659  -0.977200  0.328470
## ENSG00000277856    9.00393       0.788079  1.649389   0.477801  0.632792
## ENSG00000275063   21.10331      -1.636931  1.210471  -1.352310  0.176276
## ENSG00000271254   34.68263       0.566705  0.736242   0.769727  0.441462
##                      padj
##                 <numeric>
## ENSG00000237094  0.666527
## ENSG00000225972  0.816172
## ENSG00000225630  0.996563
## ENSG00000237973  0.977743
## ENSG00000229344  0.460710
## ...                   ...
## ENSG00000277196        NA
## ENSG00000278384  0.862340
## ENSG00000277856  0.953323
## ENSG00000275063  0.768724
## ENSG00000271254  0.906800

summary(male_female_res_new)

## 
## out of 23977 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 50, 0.21%
## LFC < 0 (down)     : 66, 0.28%
## outliers [1]       : 1150, 4.8%
## low counts [2]     : 1822, 7.6%
## (mean count < 8)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:Biobase':
## 
##     combine

## The following object is masked from 'package:matrixStats':
## 
##     count

## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union

## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect

## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union

## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Sort the DE gene according to their log fold changes
male_female_sig_sorted <- data.frame(gene= rownames(male_female_res_new),
                                     male_female_res_new,
                                     stringsAsFactors = F) %>%
                                     na.omit() %>%
                                     filter(padj <0.1) %>%
                                     arrange(desc(abs(log2FoldChange)))

head(male_female_sig_sorted, 3)

##                            gene   baseMean log2FoldChange     lfcSE       stat
## ENSG00000229807 ENSG00000229807 9983.99617     -11.657163 0.7760656 -15.020848
## ENSG00000115386 ENSG00000115386   62.81085      -9.050916 1.4413598  -6.279429
## ENSG00000228411 ENSG00000228411   51.48506       9.042568 1.1582743   7.806931
##                       pvalue         padj
## ENSG00000229807 5.361702e-51 1.126225e-46
## ENSG00000115386 3.398187e-10 5.490687e-07
## ENSG00000228411 5.859740e-15 1.538548e-11

library(org.Hs.eg.db)

## Loading required package: AnnotationDbi

## 
## Attaching package: 'AnnotationDbi'

## The following object is masked from 'package:dplyr':
## 
##     select

##

keytypes(org.Hs.eg.db)

##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"       "UNIGENE"     
## [26] "UNIPROT"

# Extract the top 20 DE genes
top_20genes <- male_female_sig_sorted$gene[1:20]

# Search the gene name in Ensembl
top20_table <-AnnotationDbi::select(org.Hs.eg.db, keys = top_20genes, keytype = "ENSEMBL",
                      columns = c("ENSEMBL", "SYMBOL"))

## 'select()' returned 1:1 mapping between keys and columns

top20_table$log2FoldChange <-male_female_sig_sorted$log2FoldChange[1:20]

The top 20 DE genes
ENSEMBL	SYMBOL	log2FoldChange
ENSG00000229807	XIST	-11.657163
ENSG00000115386	REG1A	-9.050916
ENSG00000228411	NA	9.042568
ENSG00000260197	NA	9.041199
ENSG00000176728	TTTY14	8.696705
ENSG00000168878	SFTPB	-8.565388
ENSG00000067646	ZFY	8.481059
ENSG00000154620	TMSB4Y	8.344616
ENSG00000288049	LOC105377224	8.330834
ENSG00000288057	NA	7.884468
ENSG00000122852	SFTPA1	-7.099688
ENSG00000252633	RN7SKP282	6.936400
ENSG00000185303	SFTPA2	-6.905568
ENSG00000267793	NA	6.844151
ENSG00000108342	CSF3	-6.822419
ENSG00000129824	RPS4Y1	6.820551
ENSG00000215580	BCORP1	6.411762
ENSG00000099725	PRKY	6.403907
ENSG00000186471	AKAP14	6.336585
ENSG00000125740	FOSB	-6.201905

Some genes do not have annotation in org.HS.eg database, so I filled them by hand.

top20_table$SYMBOL[3]<- c("CDY4P")
top20_table$SYMBOL[4]<- c("AC010889.1")
top20_table$SYMBOL[10]<- c("AC010086.3")
top20_table$SYMBOL[14]<- c("AC009977.1")

library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.6.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

vsd<-varianceStabilizingTransformation(case_dds_2)
case_vsd <- assay(vsd)

# Extract the vst transformation count for only top 20 de genes
top20_index <-match(top20_table$ENSEMBL, rownames(case_vsd))
top20_matrix_vsd <- case_vsd[top20_index,]

Gender <- patient_info_short$sex
Tissue <- patient_info_short$Tissue

gender_hm_name <- HeatmapAnnotation(df=data.frame(Tissue, Gender),
                  col = list(Tissue = c("lung" = "purple","colon" = "light blue"), 
                             Gender= c("male" = "pink", "female" = "green"))) 
                                                      
rownames(top20_matrix_vsd) <- top20_table$SYMBOL

# Check if  column names of top20_matrix is the same as 
#row names of patient_info_short
colnames(top20_matrix_vsd) == rownames(patient_info_short)

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE

# Then we can annotate column name of top20_matrix_vsd by their real sample name

colnames(top20_matrix_vsd)<- patient_info_short$sample_name

# Scale the transformed count for heatmap
top20_mat_scale<-t(apply(top20_matrix_vsd,1,scale))
colnames(top20_mat_scale) <- colnames(top20_matrix_vsd)

pheatmap(top20_mat_scale,
         cellwidth = 8, cellheight = 8,
         cluster_cols = TRUE,
         annotation_names_col = T,
         show_colnames = T,
         top_annotation=gender_hm_name)

# gender dds object for the whole data set
covid19_gender_dds <- DESeqDataSetFromMatrix(countData = count_matrix,
                                   colData = patient_info,
                                   design = ~ Tissue + sex)

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

# Filter the dataset to make it less heavy
keep2_all <- rowSums(counts(covid19_gender_dds) >= 10) >= 4
covid19_gender_dds_2 <- covid19_gender_dds[keep2_all,]
nrow(covid19_gender_dds_2)

## [1] 29824

gender_vsd<-varianceStabilizingTransformation(covid19_gender_dds_2)

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

gender_vsd_mat <- assay(gender_vsd)

# Extract the vst transformation count for only top 20 de genes
top20_index_2 <-match(top20_table$ENSEMBL, rownames(gender_vsd_mat))
top20_matrix_vsd_2 <- gender_vsd_mat[top20_index_2,]

library(ComplexHeatmap)

Gender_whole <- patient_info$sex
Tissue_whole <- patient_info$Tissue

gender_hm_whole <- HeatmapAnnotation(df=data.frame(Tissue_whole, Gender_whole),
                  col = list(Tissue = c("lung" = "pink","colon" = "light blue"), 
                             Gender= c("male" = "black", "female" = "brown", 
                                       "not collected" = "green")))

rownames(top20_matrix_vsd_2) <- top20_table$SYMBOL

# Check if  column names of top20_matrix is the same as 
#row names of patient_info
colnames(top20_matrix_vsd_2) == rownames(patient_info)

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

# Then we can annotate column name of top20_matrix_vsd_2 by their real sample name

colnames(top20_matrix_vsd_2)<- patient_info$sample_name

whole_top20_mat_scale<-t(apply(top20_matrix_vsd_2,1,scale))
colnames(whole_top20_mat_scale) <- colnames(top20_matrix_vsd_2)

pheatmap(whole_top20_mat_scale,
         cellwidth = 8, cellheight = 8,
         cluster_cols = TRUE,
         annotation_names_col = T,
         show_colnames = T,
         clustering_distance_cols = "correlation",
         top_annotation=gender_hm_whole)

I am not sure why the heatmap show gender of the control samples different from your heatmap. I cannot fix it yet.

sessionInfo()

## R version 4.0.5 (2021-03-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] ComplexHeatmap_2.6.2        knitr_1.33                 
##  [3] org.Hs.eg.db_3.12.0         AnnotationDbi_1.52.0       
##  [5] dplyr_1.0.5                 DESeq2_1.30.1              
##  [7] SummarizedExperiment_1.20.0 Biobase_2.50.0             
##  [9] MatrixGenerics_1.2.1        matrixStats_0.58.0         
## [11] GenomicRanges_1.42.0        GenomeInfoDb_1.26.7        
## [13] IRanges_2.24.1              S4Vectors_0.28.1           
## [15] BiocGenerics_0.36.1        
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2             sass_0.3.1             bit64_4.0.5           
##  [4] jsonlite_1.7.2         splines_4.0.5          bslib_0.2.4           
##  [7] assertthat_0.2.1       highr_0.9              blob_1.2.1            
## [10] GenomeInfoDbData_1.2.4 yaml_2.2.1             pillar_1.6.0          
## [13] RSQLite_2.2.7          lattice_0.20-41        glue_1.4.2            
## [16] digest_0.6.27          RColorBrewer_1.1-2     XVector_0.30.0        
## [19] colorspace_2.0-0       htmltools_0.5.1.1      Matrix_1.2-18         
## [22] XML_3.99-0.6           pkgconfig_2.0.3        GetoptLong_1.0.5      
## [25] genefilter_1.72.1      zlibbioc_1.36.0        purrr_0.3.4           
## [28] xtable_1.8-4           scales_1.1.1           BiocParallel_1.24.1   
## [31] tibble_3.1.1           annotate_1.68.0        generics_0.1.0        
## [34] ggplot2_3.3.3          ellipsis_0.3.1         cachem_1.0.4          
## [37] survival_3.2-11        magrittr_2.0.1         crayon_1.4.1          
## [40] memoise_2.0.0          evaluate_0.14          fansi_0.4.2           
## [43] Cairo_1.5-12.2         tools_4.0.5            GlobalOptions_0.1.2   
## [46] lifecycle_1.0.0        stringr_1.4.0          munsell_0.5.0         
## [49] locfit_1.5-9.4         cluster_2.1.2          DelayedArray_0.16.3   
## [52] compiler_4.0.5         jquerylib_0.1.4        rlang_0.4.10          
## [55] RCurl_1.98-1.3         circlize_0.4.12        rjson_0.2.20          
## [58] bitops_1.0-7           rmarkdown_2.7          gtable_0.3.0          
## [61] DBI_1.1.1              R6_2.5.0               fastmap_1.1.0         
## [64] bit_4.0.4              utf8_1.2.1             clue_0.3-59           
## [67] shape_1.4.5            stringi_1.5.3          Rcpp_1.0.6            
## [70] png_0.1-7              vctrs_0.3.7            geneplotter_1.68.0    
## [73] tidyselect_1.1.1       xfun_0.22

Untitled

Phuc

5/10/2021