library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
# Load the count data already filtered
vst_count <-readRDS("F:/BAI HC/Research 2/Covid19/vst_count.RDS")
pca_data <-assay(vst_count)

pca_data <- as.data.frame(pca_data)
pca_data$geneID <-rownames(pca_data)
gene_var_sorted <- readRDS("F:/BAI HC/Research 2/Covid19/gene_var.RDS")
# Filter 500 genes having larger variance
gene_500 <- gene_var_sorted[1:500,]

gene_500_id <-gene_500$gene
# filter the 500 gene count
the_500_count <- pca_data[match(gene_500_id,pca_data$geneID),]
the_500_count <- the_500_count[, 1:38]
#PCA analysis with thr 500 most variable genes
pca500 <- prcomp(t(the_500_count))
# Extract the loading score contributed by each genes in PC2
loading_scores <- pca500$rotation[, 2]

# Sort the genes by their absolute loading scores
gene_sorted <- sort(abs(loading_scores), decreasing = TRUE)

# The 10 genes with the largest loading scores
top_100_genes <- names(gene_sorted[1:100])
# Load the annotation for Human genes
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## 
keytypes(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"       "UNIGENE"     
## [26] "UNIPROT"
# Name of the top 100 genes
top100_table<- AnnotationDbi::select(org.Hs.eg.db,keys=top_100_genes, keytype="ENSEMBL",
columns=c("ENSEMBL","SYMBOL") )
## 'select()' returned 1:many mapping between keys and columns

From the list of 100 genes contributing the most to PC2. There are five genes related to female and male phenotype which are XIST, SPAG6, NKX2-1, SLC6A14 and HSD11B2 .

gender_genes <- c("XIST", "SPAG6", "NKX2-1", "SLC6A14", "HSD11B2")

gender_geneid <- top100_table[match (gender_genes, top100_table$SYMBOL), 1]

pca5_data <-the_500_count[match(gender_geneid, rownames(the_500_count)),]
# PCA analysis with only five genes related to female and male phenotype
pca5 <- prcomp(t(pca5_data))
summary(pca5)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5
## Standard deviation     8.5972 6.3857 3.7330 2.22253 1.47078
## Proportion of Variance 0.5446 0.3004 0.1027 0.03639 0.01594
## Cumulative Proportion  0.5446 0.8450 0.9477 0.98406 1.00000
patient_info <- readRDS("F:/BAI HC/Research 2/Covid19/patient_info_0404.RDS")
condition <-patient_info$condition
pca5_dt<- data.frame(sample=rownames(pca5$x),
                       X = pca5$x[, 1],
                       Y = pca5$x[, 2], condition )
head(pca5_dt, 2)
##                  sample        X         Y    condition
## SRR12816720 SRR12816720 4.271815  4.731178 covid19_case
## SRR12816722 SRR12816722 4.027949 11.187179 covid19_case
# Plot the pca results
library(ggplot2)

# Calculate PCA proportion of Variance
pca5_var <- pca5$sdev
pca5_var2 <-pca5_var^2

var_explained <- round(pca5_var2/sum(pca5_var2)*100, 2)

# PCA plot with label
ggplot(data = pca5_dt, aes(x=X, y =Y, label=sample, color=condition))+
  geom_text()+
  xlab(paste("PC1", var_explained[1], "%"))+
  ylab(paste("PC2", var_explained[2], "%"))+
  theme(legend.position = "top")

Figure 1. PCA analysis 38 samples by five gender related genes with label

# PCA plot without lable
ggplot(data = pca5_dt, aes(x=X, y =Y, color=condition))+
  geom_point()+
  xlab(paste("PC1", var_explained[1], "%"))+
  ylab(paste("PC2", var_explained[2], "%"))+
  theme(legend.position = "top")

Figure 2. PCA analysis 38 samples by five gender related genes without label

The gender genes do not separate the control samples from the cases samples for lung tissue and colon tissue, but the male samples are much more closer to one another for covid19 cases and colon cases. Male colon cases formed a cluster at left-most of plot and male covid19 case samples formed a cluster at the upper right of the plot. The female case samples are mingling with control samples. The colon control samples clearly form two different groups. There is the same pattern for the covid19 control samples though less obvious. It looks like we have 5 female and 5 male samples for colon controls, 4 male and 6 female for the covid19 control. The gender of the covid19 control is not balanced and we also have more male codi19 cases than female covid19 cases.

Therefore the gender genes certaintly have some affect the differential expression anlysis.

library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.6.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
# Sample annotation for the heatmap
condition <- patient_info$condition

hm_name <- HeatmapAnnotation(df=data.frame(condition),
                             col = list(condition = c("covid19_case" = "red",
                                                      "covid19_control" = "green", 
                                                      "colon_case" = "black", 
                                                      "colon_control" = "yellow")))
# Make the gene symbol as rownames of pca5_data

Ensembl_symbol <- top100_table[match(gender_genes, top100_table$SYMBOL),]
rownames(pca5_data) <- Ensembl_symbol$SYMBOL
# The heatmap for 100 gene with larger variance

Heatmap(pca5_data,
clustering_method_rows = "average",
clustering_method_columns = "average",
clustering_distance_rows = "euclidean",
cluster_rows = T,
cluster_columns = T,
show_row_dend = F,
gap=unit(7,"mm"),
show_row_names = T,
show_column_names = T,
top_annotation = hm_name)
## Warning: The input is a data frame, convert it to the matrix.

Figure 3. The heatmap of 38 samples with five gender related genes

The heatmap for 5 gender genes tells the same story as the PCA plot with text. Only XIST shows the clear difference of two gender groups with each groups. The other genes show the difference between colon tissue and lung tissue. There is definitely a unbalance between female and male in covid19 control samples.

Since the five gender related genes was chosen from the list of 100 genes having the most influence on PC2. It may be useful to look at the heatmap of 100 genes contributed the most to PC2 too (Figure 4). Those samples that were in the sample cluster with the control groups not always next to one another in this heatmap, indicating the other 95 genes must have more influence than teh five gender genes in separating the sample groups

# Filter 100 genes contributing the most to PC2 in the 500 gene list
gene_100_loading <- the_500_count[match(top_100_genes, rownames(the_500_count)), ] 


# Heatmap of the 100 genes having the most influence on PC2
Heatmap(gene_100_loading,
clustering_method_rows = "average",
clustering_method_columns = "average",
clustering_distance_rows = "euclidean",
cluster_rows = T,
cluster_columns = T,
show_row_dend = F,
gap=unit(7,"mm"),
show_row_names = F,
show_column_names = T,
top_annotation = hm_name)
## Warning: The input is a data frame, convert it to the matrix.

Figure 4. The heatmap of 38 samples with 100 genes having the most influence on PC2