PCA_plot.R

rm(list = ls())
### loading PACKAGE
library(psych)
#library(reshape2)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

### inout data
data <- read.table("ehbio_salmon.DESeq2.normalized.symbol.txt", header=T, row.names=NULL,sep="\t")
dim(data) #[1] 27186     9

## [1] 27186     9

head(data)

##       id untrt_N61311 untrt_N052611 untrt_N080611 untrt_N061011 trt_N61311
## 1    FN1    245667.66      427435.1     221687.51      371144.2  240187.24
## 2    DCN    212953.14      360796.2     258977.30      408573.1  210002.18
## 3  CEMIP     40996.34      137783.1      53813.92       91066.8   62301.12
## 4 CCDC80    137229.15      232772.2      86258.13      212237.3  136730.76
## 5 IGFBP5     77812.65      288609.2     210628.87      168067.4   96021.74
## 6 COL1A1    146450.41      127367.3     152281.50      140861.1   62358.64
##   trt_N052611 trt_N080611 trt_N061011
## 1   450103.21   280226.19   376518.23
## 2   316009.14   225547.39   393843.74
## 3   223111.85   212724.84   157919.47
## 4   226070.89   124634.56   236237.81
## 5   217439.21   162677.38   168387.36
## 6    53800.47    69160.97    51044.06

summary(data)

##       id             untrt_N61311       untrt_N052611      untrt_N080611     
##  Length:27186       Min.   :     0.00   Min.   :     0.0   Min.   :     0.0  
##  Class :character   1st Qu.:     2.75   1st Qu.:     2.7   1st Qu.:     2.6  
##  Mode  :character   Median :    30.51   Median :    32.6   Median :    29.2  
##                     Mean   :   768.17   Mean   :   814.2   Mean   :   790.7  
##                     3rd Qu.:   461.78   3rd Qu.:   451.9   3rd Qu.:   455.5  
##                     Max.   :283255.42   Max.   :427435.1   Max.   :341607.3  
##  untrt_N061011        trt_N61311         trt_N052611        trt_N080611       
##  Min.   :     0.0   Min.   :     0.00   Min.   :     0.0   Min.   :     0.00  
##  1st Qu.:     3.0   1st Qu.:     2.29   1st Qu.:     3.0   1st Qu.:     2.82  
##  Median :    32.5   Median :    29.53   Median :    30.4   Median :    29.07  
##  Mean   :   787.6   Mean   :   797.35   Mean   :   857.6   Mean   :   843.36  
##  3rd Qu.:   452.2   3rd Qu.:   464.23   3rd Qu.:   456.8   3rd Qu.:   462.32  
##  Max.   :408573.1   Max.   :274728.86   Max.   :450103.2   Max.   :292252.08  
##   trt_N061011      
##  Min.   :     0.0  
##  1st Qu.:     2.1  
##  Median :    29.8  
##  Mean   :   852.9  
##  3rd Qu.:   463.8  
##  Max.   :393843.7

dim(data) #[1] 27186     9

## [1] 27186     9

### Deal with duplicate names
length(unique(data$id)) #[1] 27007

## [1] 27007

rownames_data <- make.names(data[,1],unique=T)
data <- data[,-1,drop=F]
dim(data) #[1] 27186     8

## [1] 27186     8

length(unique(rownames_data))

## [1] 27186

rownames(data) <- rownames_data #[1] 27186
data <- data[rowSums(data)>0,]
dim(data) #[1] 27186     8

## [1] 27186     8

head(data)

##        untrt_N61311 untrt_N052611 untrt_N080611 untrt_N061011 trt_N61311
## FN1       245667.66      427435.1     221687.51      371144.2  240187.24
## DCN       212953.14      360796.2     258977.30      408573.1  210002.18
## CEMIP      40996.34      137783.1      53813.92       91066.8   62301.12
## CCDC80    137229.15      232772.2      86258.13      212237.3  136730.76
## IGFBP5     77812.65      288609.2     210628.87      168067.4   96021.74
## COL1A1    146450.41      127367.3     152281.50      140861.1   62358.64
##        trt_N052611 trt_N080611 trt_N061011
## FN1      450103.21   280226.19   376518.23
## DCN      316009.14   225547.39   393843.74
## CEMIP    223111.85   212724.84   157919.47
## CCDC80   226070.89   124634.56   236237.81
## IGFBP5   217439.21   162677.38   168387.36
## COL1A1    53800.47    69160.97    51044.06

### Remove rows with 0 variance
data <- data[apply(data, 1, var)!=0,]
dim(data) #[1] 27186     8

## [1] 27186     8

#MAD, median absolute deviation
# mad(c(1,2,3)) #[1] 1.4826
# mad(c(1,10,100)) #[1] 13.3434
mads <- apply(data, 1, mad)
head(mads)

##       FN1       DCN     CEMIP    CCDC80    IGFBP5    COL1A1 
## 122696.65 109596.08  83570.35  75194.76  67912.94  64538.02

data <- data[rev(order(mads)),]
#you can keep top 50, 500, 5000 genes
#data_1 <- data[order(mads),][1:20,]
#head(data_1)
dim(data) #[1] 27186     8

## [1] 27186     8

###PCA analysis
# Pay attention to the format of PCA input 
# Rows are samples and columns are variables
data_t <- t(data)
variableL <- ncol(data_t) #[1] 27186
dim(data_t) #[1]     8 27186

## [1]     8 27186

class(data_t) #[1] "matrix" "array"

## [1] "matrix" "array"

class(data) #[1] "data.frame"

## [1] "data.frame"

data_t[, 1:10]

##                    FN1      DCN     CEMIP    CCDC80    IGFBP5    COL1A1
## untrt_N61311  245667.7 212953.1  40996.34 137229.15  77812.65 146450.41
## untrt_N052611 427435.1 360796.2 137783.10 232772.17 288609.20 127367.25
## untrt_N080611 221687.5 258977.3  53813.92  86258.13 210628.87 152281.50
## untrt_N061011 371144.2 408573.1  91066.80 212237.32 168067.42 140861.07
## trt_N61311    240187.2 210002.2  62301.12 136730.76  96021.74  62358.64
## trt_N052611   450103.2 316009.1 223111.85 226070.89 217439.21  53800.47
## trt_N080611   280226.2 225547.4 212724.84 124634.56 162677.38  69160.97
## trt_N061011   376518.2 393843.7 157919.47 236237.81 168387.36  51044.06
##                   GREM1   MT.RNR2      FTL     THBS1
## untrt_N61311  124246.41  63352.88 234852.9  37003.71
## untrt_N052611 137527.21 116052.90 197585.1  51260.17
## untrt_N080611 217280.29 177452.36 287309.9  34506.82
## untrt_N061011 112502.63  77960.27 180266.1  36896.26
## trt_N61311     70740.84  69491.99 157839.9  73328.63
## trt_N052611    85790.55 124660.17 143825.3 182993.79
## trt_N080611   255895.40 146696.67 161717.2 171753.21
## trt_N061011    63291.69  81818.43 120886.5 170443.87

# By default, prcomp will centralized the data using mean.
# Normalize data for PCA by dividing each data by column standard deviation.
# Often, we would normalize data.
# Only when we care about the real number changes other than the trends,
# `scale` can be set to TRUE. 
# We will show the differences of scaling and un-scaling effects.
variableL #[1] 27186

## [1] 27186

dim(data_t) #[1]     8 27186

## [1]     8 27186

pca <- prcomp(data_t[,1:ncol(data_t)], scale=T)
# sdev: standard deviation of the principle components.
# Square to get variance
# percentVar <- pca$sdev^2 / sum( pca$sdev^2)
class(pca)

## [1] "prcomp"

head(pca$rotation)

##                 PC1          PC2         PC3           PC4          PC5
## FN1     0.003375674  0.001603088 0.013032407  0.0033047633  0.005622747
## DCN    -0.000474038  0.002351453 0.013059399 -0.0054618432  0.002763769
## CEMIP   0.007794261 -0.006817366 0.006911881  0.0019985133  0.005560399
## CCDC80  0.003618220  0.005394765 0.011899741 -0.0002095483  0.006838037
## IGFBP5 -0.001894905 -0.006399577 0.009984163  0.0051404839  0.005190594
## COL1A1 -0.012379328  0.001006008 0.000173108  0.0007415088 -0.002957843
##                  PC6           PC7         PC8
## FN1    -0.0018193462  0.0008660537 -0.53680622
## DCN     0.0002170529  0.0046613219 -0.67176512
## CEMIP  -0.0041415418 -0.0036421551 -0.27829064
## CCDC80 -0.0004472703  0.0009623146 -0.36750832
## IGFBP5  0.0038033410  0.0067317282 -0.17525389
## COL1A1  0.0008065496 -0.0010377938  0.05358061

pca$x

##                     PC1         PC2        PC3        PC4        PC5       PC6
## untrt_N61311  -64.94613   76.257162  -53.30529  17.575301  -9.587639  15.79482
## untrt_N052611 -66.85147    2.349272   49.20816  59.893193  92.725828  26.90325
## untrt_N080611 -82.62322  -83.840966  -22.21228   1.180698 -70.198448  53.87055
## untrt_N061011 -73.20773   44.449296   64.21915 -60.753305 -23.918322 -80.40440
## trt_N61311     66.20075   65.743079 -109.31326  15.182844   1.038331 -13.26029
## trt_N052611   101.20186   -6.609691   75.76232  90.605029 -46.959427 -16.05356
## trt_N080611    33.84331 -118.934263  -50.24068 -24.837638  41.994287 -52.44947
## trt_N061011    86.38263   20.586112   45.88189 -98.846123  14.905389  65.59911
##                      PC7           PC8
## untrt_N61311  -92.724984  7.112107e-14
## untrt_N052611  30.884873 -3.571848e-13
## untrt_N080611  34.035003  2.046873e-13
## untrt_N061011  24.256303  3.451179e-13
## trt_N61311     67.389135  3.730197e-13
## trt_N052611   -17.820980 -1.199263e-13
## trt_N080611   -37.076579 -4.631087e-14
## trt_N061011    -8.942772 -4.315068e-13

dim(pca$rotation) #[1] 27186     8

## [1] 27186     8

summary(pca)

## Importance of components:
##                            PC1     PC2     PC3     PC4      PC5      PC6
## Standard deviation     79.3768 69.4475 67.8222 61.4302 51.24433 50.46626
## Proportion of Variance  0.2318  0.1774  0.1692  0.1388  0.09659  0.09368
## Cumulative Proportion   0.2318  0.4092  0.5784  0.7172  0.81377  0.90745
##                             PC7       PC8
## Standard deviation     50.15985 4.644e-13
## Proportion of Variance  0.09255 0.000e+00
## Cumulative Proportion   1.00000 1.000e+00

get_eig(pca)

##         eigenvalue variance.percent cumulative.variance.percent
## Dim.1 6.300681e+03     2.317620e+01                    23.17620
## Dim.2 4.822958e+03     1.774060e+01                    40.91679
## Dim.3 4.599853e+03     1.691993e+01                    57.83673
## Dim.4 3.773671e+03     1.388094e+01                    71.71766
## Dim.5 2.625981e+03     9.659314e+00                    81.37698
## Dim.6 2.546844e+03     9.368218e+00                    90.74520
## Dim.7 2.516011e+03     9.254804e+00                   100.00000
## Dim.8 2.156897e-25     7.933853e-28                   100.00000

class(summary(pca))

## [1] "summary.prcomp"

screeplot(pca,type = "line")

# To check what's in pca
print(str(pca))

## List of 5
##  $ sdev    : num [1:8] 79.4 69.4 67.8 61.4 51.2 ...
##  $ rotation: num [1:27186, 1:8] 0.003376 -0.000474 0.007794 0.003618 -0.001895 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:27186] "FN1" "DCN" "CEMIP" "CCDC80" ...
##   .. ..$ : chr [1:8] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:27186] 326621 298338 122465 174021 173705 ...
##   ..- attr(*, "names")= chr [1:27186] "FN1" "DCN" "CEMIP" "CCDC80" ...
##  $ scale   : Named num [1:27186] 90294 82313 71438 59013 67497 ...
##   ..- attr(*, "names")= chr [1:27186] "FN1" "DCN" "CEMIP" "CCDC80" ...
##  $ x       : num [1:8, 1:8] -64.9 -66.9 -82.6 -73.2 66.2 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:8] "untrt_N61311" "untrt_N052611" "untrt_N080611" "untrt_N061011" ...
##   .. ..$ : chr [1:8] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
## NULL

#PCA display results
#pdf("1.pdf")
fviz_eig(pca, addlabels = TRUE)

#dev.off()
# PCA sample cluster info
fviz_pca_ind(pca, repel=T)

#add color
data_t <- as.data.frame(data_t)
data_t$conditions <- c(rep("untrt",4),rep("trt",4))
fviz_pca_ind(pca,  col.ind=data_t$conditions,mean.point=F, addEllipses = T, legend.title="Groups")

# add 95% Confidence interval
fviz_pca_ind(pca, col.ind=data_t$conditions, mean.point=F, addEllipses = T, legend.title="Groups", ellipse.type="confidence", ellipse.level=0.95)

#Show the most contributing variables (genes)
#Show variables whose correlation with the main axis is greater than 0.99 (the specific numbers are adjusted by yourself)
# Visualize variable with cos2 >= 0.99
fviz_pca_var(pca, select.var = list(cos2 = 0.99), repel=T, col.var = "cos2", geom.var = c("arrow", "text") )

#Show the 10 variables most relevant to the main axis
# Top 10 active variables with the highest cos2
fviz_pca_var(pca, select.var= list(cos2 = 10), repel=T, col.var = "contrib")

#Display the correlation distribution of variables (genes) with the main axis
# Select by names
# top 10 gene with maximum mad
name <- list(name = c("FN1", "DCN", "CEMIP","CCDC80","IGFBP5","COL1A1","GREM1"))
fviz_pca_var(pca, select.var = name)

# top 5 contributing individuals and variable
#6. biPLot
fviz_pca_biplot(pca,
                fill.ind=data_t$conditions,
                palette="joo",
                addEllipses = T, 
                ellipse.type="confidence",
                ellipse.level=0.95,
                mean.point=F,
                col.var="contrib",
                gradient.cols = "RdYlBu",
                select.var = list(contrib = 10),
                ggtheme = theme_minimal())

#################################2d plot
scores = as.data.frame(pca$x) 
data_plot <- scores
data_plot$source <- row.names(data_plot)
data_plot$source <- gsub("_.*.", "", data_plot$source ) 
variance.percent_1 <- round(get_eig(pca)$variance.percent,digits = 2)
ggplot(data_plot,aes(x=PC1,y=PC2,color=source)) + 
  geom_point(aes(alpha=0.5)) +
  theme(panel.background = element_blank(),
        legend.position = "",
        panel.border=element_rect(fill=NA),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background=element_blank(),
        axis.text.x=element_text(colour="black"),
        axis.text.y=element_text(colour="black"),
        axis.ticks=element_line(colour="black"),
        plot.margin=unit(c(1,1,1,1),"line")) +
  xlab(paste0("PC1"," (",variance.percent_1[1],"%)")) +
  ylab(paste0("PC1"," (",variance.percent_1[2],"%)")) +
  scale_alpha_continuous(guide=FALSE) +
  stat_ellipse( level = 0.95)

ggsave(filename = paste0(Sys.Date(),"-","pca-2d",".tif"), 
       plot = last_plot(), device = "tiff",
       width =12, height = 10, units = "cm",
       dpi = 300, limitsize = TRUE, compression = "lzw")
#########################3d plot
library("scatterplot3d")
scores = as.data.frame(pca$x) 
row.names(scores)

## [1] "untrt_N61311"  "untrt_N052611" "untrt_N080611" "untrt_N061011"
## [5] "trt_N61311"    "trt_N052611"   "trt_N080611"   "trt_N061011"

scores$source <- gsub("_.*.", "", row.names(scores))
scores$source <- as.factor(scores$source)
colors <- c("#999999", "#E69F00")
colors <- colors[as.numeric(scores$source)]
variance.percent <- round(get_eig(pca)$variance.percent,digits = 2)
s3d <- scatterplot3d(scores[,c(1,3,2)], pch = 16, box =F,grid = T,type="h",
                     main="PCA",color=colors,
                     angle = 25, scale.y= 1,highlight.3d=F,
                     xlab= paste0("PC1"," (",variance.percent[1],"%)"),
                     ylab= paste0("PC3"," (",variance.percent[3],"%)"),
                     zlab= paste0("PC2"," (",variance.percent[2],"%)"))

text(s3d$xyz.convert(scores[,c(1,3,2)]), labels = rownames(scores),
     cex= 0.7, col = "steelblue")

legend("top", legend = levels(scores$source),
       col =  c("#999999", "#E69F00"), pch = 16,inset = -0.15, xpd = T, horiz = T)

PCA_plot.R

liyix

2020-09-13