library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
vst_count <-readRDS("F:/BAI HC/Research 2/Covid19/vst_count.RDS")
pca_data <-assay(vst_count)
# calculate gene variation
gene_var<- rowVars(pca_data)
gene_var2 <- as.data.frame(gene_var)
rownames(gene_var2)<- rownames(pca_data)
# sort the gene variation by decreasing order
gene_var2$gene <- rownames(gene_var2)
gene_var_sorted <- gene_var2[order(gene_var2$gene_var, decreasing = TRUE),]
# Choose the 500 gene having the largest variance in 25671 genes
gene_500 <- gene_var_sorted[1:500,]
# Filter the 500 genes in the vst matrix
gene_id <-gene_500$gene
pca_data <- as.data.frame(pca_data)
pca_data$geneID <-rownames(pca_data)
# filter the 500 gene count
the_500_count <- pca_data[match(gene_id,pca_data$geneID),]
the_500_count <- the_500_count[, 1:38]
#PCA analysis with thr 500 most variable genes
pca500 <- prcomp(t(the_500_count))
summary(pca500)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 72.9533 35.2132 22.13230 19.49839 16.46475 15.7803
## Proportion of Variance 0.5408 0.1260 0.04978 0.03863 0.02755 0.0253
## Cumulative Proportion 0.5408 0.6668 0.71660 0.75524 0.78278 0.8081
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 13.72781 13.16706 12.32751 11.30306 10.26655 10.07981
## Proportion of Variance 0.01915 0.01762 0.01544 0.01298 0.01071 0.01032
## Cumulative Proportion 0.82724 0.84486 0.86030 0.87328 0.88399 0.89432
## PC13 PC14 PC15 PC16 PC17 PC18 PC19
## Standard deviation 9.76154 8.80788 8.72259 8.5914 8.17298 7.82330 7.54474
## Proportion of Variance 0.00968 0.00788 0.00773 0.0075 0.00679 0.00622 0.00578
## Cumulative Proportion 0.90400 0.91188 0.91961 0.9271 0.93390 0.94012 0.94591
## PC20 PC21 PC22 PC23 PC24 PC25 PC26
## Standard deviation 7.38589 7.14421 7.02537 6.71840 6.46299 6.35948 6.15299
## Proportion of Variance 0.00554 0.00519 0.00502 0.00459 0.00424 0.00411 0.00385
## Cumulative Proportion 0.95145 0.95664 0.96165 0.96624 0.97048 0.97459 0.97844
## PC27 PC28 PC29 PC30 PC31 PC32 PC33
## Standard deviation 5.75162 5.50610 5.2535 4.91013 4.59195 4.15461 3.90815
## Proportion of Variance 0.00336 0.00308 0.0028 0.00245 0.00214 0.00175 0.00155
## Cumulative Proportion 0.98180 0.98488 0.9877 0.99014 0.99228 0.99403 0.99558
## PC34 PC35 PC36 PC37 PC38
## Standard deviation 3.73693 3.64152 3.11433 2.55438 3.24e-14
## Proportion of Variance 0.00142 0.00135 0.00099 0.00066 0.00e+00
## Cumulative Proportion 0.99700 0.99835 0.99934 1.00000 1.00e+00
# PC1 and PC2 account for the majority of variance
# We focus on PC1 and PC2
pca500_dt<- data.frame(sample=rownames(pca500$x),
X = pca500$x[, 1],
Y = pca500$x[, 2])
head(pca500_dt, 2)
## sample X Y
## SRR12816720 SRR12816720 -76.82265 20.721437
## SRR12816722 SRR12816722 -75.40860 7.216925
# See how PC1 and PC2 separate the samples
library(ggplot2)
ggplot(data = pca500_dt, aes(x=X, y =Y, label= sample))+
geom_text()+
xlab(paste("PC1 - 54,1%"))+
ylab(paste("PC2 - 12,6%"))
Figure 1. The regular PCA plot with 500 genes having larger variance
PC2 separates samples into groups, so we focus more on PC2.
# Etxract the loading score contributed by each genes in PC2
loading_scores <- pca500$rotation[, 2]
# Sort the genes by their absolute loading scores
gene_sorted <- sort(abs(loading_scores), decreasing = TRUE)
# The 10 genes with the largest loading scores
top_genes <- names(gene_sorted[1:10])
top_genes
## [1] "ENSG00000125740" "ENSG00000185303" "ENSG00000136155" "ENSG00000243188"
## [5] "ENSG00000237654" "ENSG00000285980" "ENSG00000229807" "ENSG00000244137"
## [9] "ENSG00000114455" "ENSG00000240393"
I used the top 10 genes name to find their index in the 500 genes table and excluded them. Then I did PCA again with 490 genes and plotted the to PC1 and PC1. The plot did not look much different from the PCA plot by 500 genes. Therefore, I tried to exclude the 50 genes having the largest loading scores .
# Name of the 50 top genes
top_genes50 <- names(gene_sorted[1:50])
# Exclude 50 top genes in the 500 genes set
index2 <- match(top_genes50, rownames(the_500_count))
data_2 <- the_500_count[-index2, ]
pca450 <- prcomp(t(data_2))
summary(pca450)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 70.726 29.62242 21.19617 18.29954 15.97236 14.61046
## Proportion of Variance 0.567 0.09946 0.05092 0.03796 0.02892 0.02419
## Cumulative Proportion 0.567 0.66642 0.71734 0.75530 0.78421 0.80841
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 13.28062 12.1397 11.7334 10.86424 9.68368 9.57203
## Proportion of Variance 0.01999 0.0167 0.0156 0.01338 0.01063 0.01038
## Cumulative Proportion 0.82840 0.8451 0.8607 0.87408 0.88471 0.89510
## PC13 PC14 PC15 PC16 PC17 PC18 PC19
## Standard deviation 9.07109 8.2432 8.18132 8.10625 7.59440 7.48622 7.2164
## Proportion of Variance 0.00933 0.0077 0.00759 0.00745 0.00654 0.00635 0.0059
## Cumulative Proportion 0.90442 0.9121 0.91971 0.92716 0.93369 0.94005 0.9459
## PC20 PC21 PC22 PC23 PC24 PC25 PC26
## Standard deviation 7.01049 6.7072 6.59177 6.42393 6.13875 5.97738 5.69884
## Proportion of Variance 0.00557 0.0051 0.00492 0.00468 0.00427 0.00405 0.00368
## Cumulative Proportion 0.95152 0.9566 0.96154 0.96622 0.97049 0.97454 0.97822
## PC27 PC28 PC29 PC30 PC31 PC32 PC33
## Standard deviation 5.54882 5.27544 4.97564 4.56609 4.35406 3.92805 3.71480
## Proportion of Variance 0.00349 0.00315 0.00281 0.00236 0.00215 0.00175 0.00156
## Cumulative Proportion 0.98171 0.98487 0.98767 0.99004 0.99218 0.99393 0.99550
## PC34 PC35 PC36 PC37 PC38
## Standard deviation 3.65326 3.3906 3.05347 2.35828 3.141e-14
## Proportion of Variance 0.00151 0.0013 0.00106 0.00063 0.000e+00
## Cumulative Proportion 0.99701 0.9983 0.99937 1.00000 1.000e+00
# plot PC1 and PC2 of the PCA analysis for 450 genes set
pca450_dt<- data.frame(sample=rownames(pca450$x),
X = pca450$x[, 1],
Y = pca450$x[, 2])
ggplot(data = pca450_dt, aes(x=X, y =Y, label= sample))+
geom_text()+
xlab(paste("PC1 - 56,7%"))+
ylab(paste("PC2 - 9,9%"))
Figure 2. The regular PCA plot with 450 genes having larger variance
I saw a few difference between the PCA plot by 450 genes and the PCA plot by 500 genes earlier. However, representing the samples by names makes it hard to see the difference. Hence, I used the 50 top genes name to filter them out in the original count matrix and plotted the PCA plot again. .
covid19_dds <-readRDS("F:/BAI HC/Research 2/Covid19/covid_dds.RDS")
count_matrix<- readRDS("F:/BAI HC/Research 2/Covid19/count_matrix_file.RDS")
patient_info <-readRDS("F:/BAI HC/Research 2/Covid19/patient_info_final.RDS")
library(SummarizedExperiment)
# Exclude the 50 top genes and do vst transformation again
index_ori <- match(top_genes50, rownames(count_matrix))
count2<- count_matrix[-index_ori, ]
# Make the dds object again
covid19_dds_exc <-DESeqDataSetFromMatrix(countData = count2,
colData = patient_info,
design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
keep <- rowSums(counts(covid19_dds_exc) >= 10) >= 9
covid19_dds_exc2 <- covid19_dds_exc[keep,]
nrow(covid19_dds_exc2)
## [1] 25621
# vst transformation thew new dds object
vsd_exc <- varianceStabilizingTransformation(covid19_dds_exc2, blind = TRUE,
fitType = "mean")
# PCA with 450 genes
plotPCA(vsd_exc, intgroup = c("condition"))
Figure 3. The color PCA plot with 450 genes having larger variance
# Load the previous original PCA plot with vst transformation counts
readRDS("F:/BAI HC/Research 2/Covid19/pca_original_vst.RDS")
Figure 4. The original PCA plot with 500 genes having larger variance
The 50 top genes indeed contributed a lot in separating the control and case of covid19 samples because the covid19 caseswere much clustered together with the help of the 50 top gene in the original PCA plot.
library(biomaRt)
# Load the annotation for Human genes
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
##
keytypes(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG" "UNIGENE"
## [26] "UNIPROT"
# Name of the top 50 genes
top50_table<- AnnotationDbi::select(org.Hs.eg.db,keys=top_genes50, keytype="ENSEMBL",
columns=c("ENSEMBL","SYMBOL") )
## 'select()' returned 1:1 mapping between keys and columns
loading_50<- gene_sorted[1:50]
top50_table$loadingScore<- loading_50
| ENSEMBL | SYMBOL | loadingScore |
|---|---|---|
| ENSG00000125740 | FOSB | 0.1007968 |
| ENSG00000185303 | SFTPA2 | 0.0984603 |
| ENSG00000136155 | SCEL | 0.0909534 |
| ENSG00000243188 | HNRNPA1P17 | 0.0880618 |
| ENSG00000237654 | LOC101929653 | 0.0864994 |
| ENSG00000285980 | NA | 0.0863279 |
| ENSG00000229807 | XIST | 0.0843462 |
| ENSG00000244137 | NA | 0.0842994 |
| ENSG00000114455 | HHLA2 | 0.0835554 |
| ENSG00000240393 | LOC100421670 | 0.0819178 |
| ENSG00000256612 | CYP2B7P | 0.0815153 |
| ENSG00000149021 | SCGB1A1 | 0.0813709 |
| ENSG00000260555 | NA | 0.0809108 |
| ENSG00000196188 | CTSE | 0.0808729 |
| ENSG00000225383 | SFTA1P | 0.0807924 |
| ENSG00000168484 | SFTPC | 0.0806798 |
| ENSG00000102683 | SGCG | 0.0804112 |
| ENSG00000246430 | LINC00968 | 0.0791863 |
| ENSG00000203859 | HSD3B2 | 0.0778316 |
| ENSG00000172752 | COL6A5 | 0.0770623 |
| ENSG00000257542 | OR7E47P | 0.0761601 |
| ENSG00000100079 | LGALS2 | 0.0760937 |
| ENSG00000257604 | NA | 0.0760016 |
| ENSG00000158481 | CD1C | 0.0750077 |
| ENSG00000156284 | CLDN8 | 0.0749846 |
| ENSG00000224796 | RPL32P1 | 0.0745006 |
| ENSG00000179639 | FCER1A | 0.0744870 |
| ENSG00000151023 | ENKUR | 0.0736413 |
| ENSG00000009765 | IYD | 0.0726802 |
| ENSG00000168878 | SFTPB | 0.0726620 |
| ENSG00000273540 | AGBL1 | 0.0724173 |
| ENSG00000131355 | ADGRE3 | 0.0717369 |
| ENSG00000248801 | C8orf34-AS1 | 0.0716897 |
| ENSG00000169836 | TACR3 | 0.0713534 |
| ENSG00000249238 | NA | 0.0705262 |
| ENSG00000175267 | VWA3A | 0.0703452 |
| ENSG00000137976 | DNASE2B | 0.0702286 |
| ENSG00000127249 | ATP13A4 | 0.0701582 |
| ENSG00000243955 | GSTA1 | 0.0697348 |
| ENSG00000261026 | NA | 0.0696392 |
| ENSG00000169876 | MUC17 | 0.0695449 |
| ENSG00000137860 | SLC28A2 | 0.0693700 |
| ENSG00000164746 | C7orf57 | 0.0689716 |
| ENSG00000131400 | NAPSA | 0.0686940 |
| ENSG00000260695 | NA | 0.0683462 |
| ENSG00000142677 | IL22RA1 | 0.0683061 |
| ENSG00000155511 | GRIA1 | 0.0681772 |
| ENSG00000211642 | IGLV10-54 | 0.0680579 |
| ENSG00000234763 | NA | 0.0676263 |
| ENSG00000158488 | CD1E | 0.0674893 |
There are a few genes not having symbol name in Ensembl database.