library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
vst_count <-readRDS("F:/BAI HC/Research 2/Covid19/vst_count.RDS")
pca_data <-assay(vst_count)
# calculate gene variation
gene_var<- rowVars(pca_data)
gene_var2 <- as.data.frame(gene_var)
rownames(gene_var2)<- rownames(pca_data)
# sort the gene variation by decreasing order
gene_var2$gene <- rownames(gene_var2)
gene_var_sorted <- gene_var2[order(gene_var2$gene_var, decreasing = TRUE),]
# Choose the 500 gene having the largest variance in 25671 genes
gene_500 <- gene_var_sorted[1:500,]
# Filter the 500 genes in the vst matrix
gene_id <-gene_500$gene

pca_data <- as.data.frame(pca_data)
pca_data$geneID <-rownames(pca_data)
# filter the 500 gene count
the_500_count <- pca_data[match(gene_id,pca_data$geneID),]
the_500_count <- the_500_count[, 1:38]
#PCA analysis with thr 500 most variable genes
pca500 <- prcomp(t(the_500_count))

summary(pca500)
## Importance of components:
##                            PC1     PC2      PC3      PC4      PC5     PC6
## Standard deviation     72.9533 35.2132 22.13230 19.49839 16.46475 15.7803
## Proportion of Variance  0.5408  0.1260  0.04978  0.03863  0.02755  0.0253
## Cumulative Proportion   0.5408  0.6668  0.71660  0.75524  0.78278  0.8081
##                             PC7      PC8      PC9     PC10     PC11     PC12
## Standard deviation     13.72781 13.16706 12.32751 11.30306 10.26655 10.07981
## Proportion of Variance  0.01915  0.01762  0.01544  0.01298  0.01071  0.01032
## Cumulative Proportion   0.82724  0.84486  0.86030  0.87328  0.88399  0.89432
##                           PC13    PC14    PC15   PC16    PC17    PC18    PC19
## Standard deviation     9.76154 8.80788 8.72259 8.5914 8.17298 7.82330 7.54474
## Proportion of Variance 0.00968 0.00788 0.00773 0.0075 0.00679 0.00622 0.00578
## Cumulative Proportion  0.90400 0.91188 0.91961 0.9271 0.93390 0.94012 0.94591
##                           PC20    PC21    PC22    PC23    PC24    PC25    PC26
## Standard deviation     7.38589 7.14421 7.02537 6.71840 6.46299 6.35948 6.15299
## Proportion of Variance 0.00554 0.00519 0.00502 0.00459 0.00424 0.00411 0.00385
## Cumulative Proportion  0.95145 0.95664 0.96165 0.96624 0.97048 0.97459 0.97844
##                           PC27    PC28   PC29    PC30    PC31    PC32    PC33
## Standard deviation     5.75162 5.50610 5.2535 4.91013 4.59195 4.15461 3.90815
## Proportion of Variance 0.00336 0.00308 0.0028 0.00245 0.00214 0.00175 0.00155
## Cumulative Proportion  0.98180 0.98488 0.9877 0.99014 0.99228 0.99403 0.99558
##                           PC34    PC35    PC36    PC37     PC38
## Standard deviation     3.73693 3.64152 3.11433 2.55438 3.24e-14
## Proportion of Variance 0.00142 0.00135 0.00099 0.00066 0.00e+00
## Cumulative Proportion  0.99700 0.99835 0.99934 1.00000 1.00e+00
# PC1 and PC2 account for the majority of variance
# We focus on PC1 and PC2

pca500_dt<- data.frame(sample=rownames(pca500$x),
                       X = pca500$x[, 1],
                       Y = pca500$x[, 2])
head(pca500_dt, 2)
##                  sample         X         Y
## SRR12816720 SRR12816720 -76.82265 20.721437
## SRR12816722 SRR12816722 -75.40860  7.216925
# See how PC1 and PC2 separate the samples
library(ggplot2)
ggplot(data = pca500_dt, aes(x=X, y =Y, label= sample))+
     geom_text()+
      xlab(paste("PC1 - 54,1%"))+
      ylab(paste("PC2 - 12,6%"))

Figure 1. The regular PCA plot with 500 genes having larger variance

PC2 separates samples into groups, so we focus more on PC2.

# Etxract the loading score contributed by each genes in PC2
loading_scores <- pca500$rotation[, 2]

# Sort the genes by their absolute loading scores
gene_sorted <- sort(abs(loading_scores), decreasing = TRUE)

# The 10 genes with the largest loading scores
top_genes <- names(gene_sorted[1:10])

top_genes
##  [1] "ENSG00000125740" "ENSG00000185303" "ENSG00000136155" "ENSG00000243188"
##  [5] "ENSG00000237654" "ENSG00000285980" "ENSG00000229807" "ENSG00000244137"
##  [9] "ENSG00000114455" "ENSG00000240393"

I used the top 10 genes name to find their index in the 500 genes table and excluded them. Then I did PCA again with 490 genes and plotted the to PC1 and PC1. The plot did not look much different from the PCA plot by 500 genes. Therefore, I tried to exclude the 50 genes having the largest loading scores .

# Name of the 50 top genes
top_genes50 <- names(gene_sorted[1:50])


# Exclude 50 top genes in the 500 genes set

index2 <- match(top_genes50, rownames(the_500_count))

data_2 <- the_500_count[-index2, ]

pca450 <- prcomp(t(data_2))

summary(pca450)
## Importance of components:
##                           PC1      PC2      PC3      PC4      PC5      PC6
## Standard deviation     70.726 29.62242 21.19617 18.29954 15.97236 14.61046
## Proportion of Variance  0.567  0.09946  0.05092  0.03796  0.02892  0.02419
## Cumulative Proportion   0.567  0.66642  0.71734  0.75530  0.78421  0.80841
##                             PC7     PC8     PC9     PC10    PC11    PC12
## Standard deviation     13.28062 12.1397 11.7334 10.86424 9.68368 9.57203
## Proportion of Variance  0.01999  0.0167  0.0156  0.01338 0.01063 0.01038
## Cumulative Proportion   0.82840  0.8451  0.8607  0.87408 0.88471 0.89510
##                           PC13   PC14    PC15    PC16    PC17    PC18   PC19
## Standard deviation     9.07109 8.2432 8.18132 8.10625 7.59440 7.48622 7.2164
## Proportion of Variance 0.00933 0.0077 0.00759 0.00745 0.00654 0.00635 0.0059
## Cumulative Proportion  0.90442 0.9121 0.91971 0.92716 0.93369 0.94005 0.9459
##                           PC20   PC21    PC22    PC23    PC24    PC25    PC26
## Standard deviation     7.01049 6.7072 6.59177 6.42393 6.13875 5.97738 5.69884
## Proportion of Variance 0.00557 0.0051 0.00492 0.00468 0.00427 0.00405 0.00368
## Cumulative Proportion  0.95152 0.9566 0.96154 0.96622 0.97049 0.97454 0.97822
##                           PC27    PC28    PC29    PC30    PC31    PC32    PC33
## Standard deviation     5.54882 5.27544 4.97564 4.56609 4.35406 3.92805 3.71480
## Proportion of Variance 0.00349 0.00315 0.00281 0.00236 0.00215 0.00175 0.00156
## Cumulative Proportion  0.98171 0.98487 0.98767 0.99004 0.99218 0.99393 0.99550
##                           PC34   PC35    PC36    PC37      PC38
## Standard deviation     3.65326 3.3906 3.05347 2.35828 3.141e-14
## Proportion of Variance 0.00151 0.0013 0.00106 0.00063 0.000e+00
## Cumulative Proportion  0.99701 0.9983 0.99937 1.00000 1.000e+00
# plot PC1 and PC2 of the PCA analysis for 450 genes set
pca450_dt<- data.frame(sample=rownames(pca450$x),
                       X = pca450$x[, 1],
                       Y = pca450$x[, 2])

ggplot(data = pca450_dt, aes(x=X, y =Y, label= sample))+
     geom_text()+
      xlab(paste("PC1 - 56,7%"))+
      ylab(paste("PC2 - 9,9%"))

Figure 2. The regular PCA plot with 450 genes having larger variance

I saw a few difference between the PCA plot by 450 genes and the PCA plot by 500 genes earlier. However, representing the samples by names makes it hard to see the difference. Hence, I used the 50 top genes name to filter them out in the original count matrix and plotted the PCA plot again. .

covid19_dds <-readRDS("F:/BAI HC/Research 2/Covid19/covid_dds.RDS")
count_matrix<- readRDS("F:/BAI HC/Research 2/Covid19/count_matrix_file.RDS")
patient_info <-readRDS("F:/BAI HC/Research 2/Covid19/patient_info_final.RDS")
library(SummarizedExperiment)
# Exclude the 50 top genes and do vst transformation again

index_ori <- match(top_genes50, rownames(count_matrix))
count2<- count_matrix[-index_ori, ]

# Make  the  dds object again
 covid19_dds_exc <-DESeqDataSetFromMatrix(countData = count2, 
                                     colData = patient_info,
                                     design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
keep <- rowSums(counts(covid19_dds_exc) >= 10) >= 9
covid19_dds_exc2 <- covid19_dds_exc[keep,]

nrow(covid19_dds_exc2)
## [1] 25621
# vst transformation thew new dds object
vsd_exc <- varianceStabilizingTransformation(covid19_dds_exc2, blind = TRUE,
  fitType = "mean")

# PCA with 450 genes
plotPCA(vsd_exc, intgroup = c("condition"))

Figure 3. The color PCA plot with 450 genes having larger variance

# Load the previous original PCA plot with vst transformation counts
readRDS("F:/BAI HC/Research 2/Covid19/pca_original_vst.RDS")

Figure 4. The original PCA plot with 500 genes having larger variance

The 50 top genes indeed contributed a lot in separating the control and case of covid19 samples because the covid19 caseswere much clustered together with the help of the 50 top gene in the original PCA plot.

library(biomaRt)
# Load the annotation for Human genes
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## 
keytypes(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"       "UNIGENE"     
## [26] "UNIPROT"
# Name of the top 50 genes
top50_table<- AnnotationDbi::select(org.Hs.eg.db,keys=top_genes50, keytype="ENSEMBL",
columns=c("ENSEMBL","SYMBOL") )
## 'select()' returned 1:1 mapping between keys and columns
loading_50<- gene_sorted[1:50]

top50_table$loadingScore<- loading_50
The top 50 genes
ENSEMBL SYMBOL loadingScore
ENSG00000125740 FOSB 0.1007968
ENSG00000185303 SFTPA2 0.0984603
ENSG00000136155 SCEL 0.0909534
ENSG00000243188 HNRNPA1P17 0.0880618
ENSG00000237654 LOC101929653 0.0864994
ENSG00000285980 NA 0.0863279
ENSG00000229807 XIST 0.0843462
ENSG00000244137 NA 0.0842994
ENSG00000114455 HHLA2 0.0835554
ENSG00000240393 LOC100421670 0.0819178
ENSG00000256612 CYP2B7P 0.0815153
ENSG00000149021 SCGB1A1 0.0813709
ENSG00000260555 NA 0.0809108
ENSG00000196188 CTSE 0.0808729
ENSG00000225383 SFTA1P 0.0807924
ENSG00000168484 SFTPC 0.0806798
ENSG00000102683 SGCG 0.0804112
ENSG00000246430 LINC00968 0.0791863
ENSG00000203859 HSD3B2 0.0778316
ENSG00000172752 COL6A5 0.0770623
ENSG00000257542 OR7E47P 0.0761601
ENSG00000100079 LGALS2 0.0760937
ENSG00000257604 NA 0.0760016
ENSG00000158481 CD1C 0.0750077
ENSG00000156284 CLDN8 0.0749846
ENSG00000224796 RPL32P1 0.0745006
ENSG00000179639 FCER1A 0.0744870
ENSG00000151023 ENKUR 0.0736413
ENSG00000009765 IYD 0.0726802
ENSG00000168878 SFTPB 0.0726620
ENSG00000273540 AGBL1 0.0724173
ENSG00000131355 ADGRE3 0.0717369
ENSG00000248801 C8orf34-AS1 0.0716897
ENSG00000169836 TACR3 0.0713534
ENSG00000249238 NA 0.0705262
ENSG00000175267 VWA3A 0.0703452
ENSG00000137976 DNASE2B 0.0702286
ENSG00000127249 ATP13A4 0.0701582
ENSG00000243955 GSTA1 0.0697348
ENSG00000261026 NA 0.0696392
ENSG00000169876 MUC17 0.0695449
ENSG00000137860 SLC28A2 0.0693700
ENSG00000164746 C7orf57 0.0689716
ENSG00000131400 NAPSA 0.0686940
ENSG00000260695 NA 0.0683462
ENSG00000142677 IL22RA1 0.0683061
ENSG00000155511 GRIA1 0.0681772
ENSG00000211642 IGLV10-54 0.0680579
ENSG00000234763 NA 0.0676263
ENSG00000158488 CD1E 0.0674893

There are a few genes not having symbol name in Ensembl database.