rm(list = ls())
### loading PACKAGE
library(psych)
#library(reshape2)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
### inout data
data <- read.table("ehbio_salmon.DESeq2.normalized.symbol.txt", header=T, row.names=NULL,sep="\t")
dim(data) #[1] 27186 9
## [1] 27186 9
head(data)
## id untrt_N61311 untrt_N052611 untrt_N080611 untrt_N061011 trt_N61311
## 1 FN1 245667.66 427435.1 221687.51 371144.2 240187.24
## 2 DCN 212953.14 360796.2 258977.30 408573.1 210002.18
## 3 CEMIP 40996.34 137783.1 53813.92 91066.8 62301.12
## 4 CCDC80 137229.15 232772.2 86258.13 212237.3 136730.76
## 5 IGFBP5 77812.65 288609.2 210628.87 168067.4 96021.74
## 6 COL1A1 146450.41 127367.3 152281.50 140861.1 62358.64
## trt_N052611 trt_N080611 trt_N061011
## 1 450103.21 280226.19 376518.23
## 2 316009.14 225547.39 393843.74
## 3 223111.85 212724.84 157919.47
## 4 226070.89 124634.56 236237.81
## 5 217439.21 162677.38 168387.36
## 6 53800.47 69160.97 51044.06
summary(data)
## id untrt_N61311 untrt_N052611 untrt_N080611
## Length:27186 Min. : 0.00 Min. : 0.0 Min. : 0.0
## Class :character 1st Qu.: 2.75 1st Qu.: 2.7 1st Qu.: 2.6
## Mode :character Median : 30.51 Median : 32.6 Median : 29.2
## Mean : 768.17 Mean : 814.2 Mean : 790.7
## 3rd Qu.: 461.78 3rd Qu.: 451.9 3rd Qu.: 455.5
## Max. :283255.42 Max. :427435.1 Max. :341607.3
## untrt_N061011 trt_N61311 trt_N052611 trt_N080611
## Min. : 0.0 Min. : 0.00 Min. : 0.0 Min. : 0.00
## 1st Qu.: 3.0 1st Qu.: 2.29 1st Qu.: 3.0 1st Qu.: 2.82
## Median : 32.5 Median : 29.53 Median : 30.4 Median : 29.07
## Mean : 787.6 Mean : 797.35 Mean : 857.6 Mean : 843.36
## 3rd Qu.: 452.2 3rd Qu.: 464.23 3rd Qu.: 456.8 3rd Qu.: 462.32
## Max. :408573.1 Max. :274728.86 Max. :450103.2 Max. :292252.08
## trt_N061011
## Min. : 0.0
## 1st Qu.: 2.1
## Median : 29.8
## Mean : 852.9
## 3rd Qu.: 463.8
## Max. :393843.7
dim(data) #[1] 27186 9
## [1] 27186 9
### Deal with duplicate names
length(unique(data$id)) #[1] 27007
## [1] 27007
rownames_data <- make.names(data[,1],unique=T)
data <- data[,-1,drop=F]
dim(data) #[1] 27186 8
## [1] 27186 8
length(unique(rownames_data))
## [1] 27186
rownames(data) <- rownames_data #[1] 27186
data <- data[rowSums(data)>0,]
dim(data) #[1] 27186 8
## [1] 27186 8
head(data)
## untrt_N61311 untrt_N052611 untrt_N080611 untrt_N061011 trt_N61311
## FN1 245667.66 427435.1 221687.51 371144.2 240187.24
## DCN 212953.14 360796.2 258977.30 408573.1 210002.18
## CEMIP 40996.34 137783.1 53813.92 91066.8 62301.12
## CCDC80 137229.15 232772.2 86258.13 212237.3 136730.76
## IGFBP5 77812.65 288609.2 210628.87 168067.4 96021.74
## COL1A1 146450.41 127367.3 152281.50 140861.1 62358.64
## trt_N052611 trt_N080611 trt_N061011
## FN1 450103.21 280226.19 376518.23
## DCN 316009.14 225547.39 393843.74
## CEMIP 223111.85 212724.84 157919.47
## CCDC80 226070.89 124634.56 236237.81
## IGFBP5 217439.21 162677.38 168387.36
## COL1A1 53800.47 69160.97 51044.06
### Remove rows with 0 variance
data <- data[apply(data, 1, var)!=0,]
dim(data) #[1] 27186 8
## [1] 27186 8
#MAD, median absolute deviation
# mad(c(1,2,3)) #[1] 1.4826
# mad(c(1,10,100)) #[1] 13.3434
mads <- apply(data, 1, mad)
head(mads)
## FN1 DCN CEMIP CCDC80 IGFBP5 COL1A1
## 122696.65 109596.08 83570.35 75194.76 67912.94 64538.02
data <- data[rev(order(mads)),]
#you can keep top 50, 500, 5000 genes
#data_1 <- data[order(mads),][1:20,]
#head(data_1)
dim(data) #[1] 27186 8
## [1] 27186 8
###PCA analysis
# Pay attention to the format of PCA input
# Rows are samples and columns are variables
data_t <- t(data)
variableL <- ncol(data_t) #[1] 27186
dim(data_t) #[1] 8 27186
## [1] 8 27186
class(data_t) #[1] "matrix" "array"
## [1] "matrix" "array"
class(data) #[1] "data.frame"
## [1] "data.frame"
data_t[, 1:10]
## FN1 DCN CEMIP CCDC80 IGFBP5 COL1A1
## untrt_N61311 245667.7 212953.1 40996.34 137229.15 77812.65 146450.41
## untrt_N052611 427435.1 360796.2 137783.10 232772.17 288609.20 127367.25
## untrt_N080611 221687.5 258977.3 53813.92 86258.13 210628.87 152281.50
## untrt_N061011 371144.2 408573.1 91066.80 212237.32 168067.42 140861.07
## trt_N61311 240187.2 210002.2 62301.12 136730.76 96021.74 62358.64
## trt_N052611 450103.2 316009.1 223111.85 226070.89 217439.21 53800.47
## trt_N080611 280226.2 225547.4 212724.84 124634.56 162677.38 69160.97
## trt_N061011 376518.2 393843.7 157919.47 236237.81 168387.36 51044.06
## GREM1 MT.RNR2 FTL THBS1
## untrt_N61311 124246.41 63352.88 234852.9 37003.71
## untrt_N052611 137527.21 116052.90 197585.1 51260.17
## untrt_N080611 217280.29 177452.36 287309.9 34506.82
## untrt_N061011 112502.63 77960.27 180266.1 36896.26
## trt_N61311 70740.84 69491.99 157839.9 73328.63
## trt_N052611 85790.55 124660.17 143825.3 182993.79
## trt_N080611 255895.40 146696.67 161717.2 171753.21
## trt_N061011 63291.69 81818.43 120886.5 170443.87
# By default, prcomp will centralized the data using mean.
# Normalize data for PCA by dividing each data by column standard deviation.
# Often, we would normalize data.
# Only when we care about the real number changes other than the trends,
# `scale` can be set to TRUE.
# We will show the differences of scaling and un-scaling effects.
variableL #[1] 27186
## [1] 27186
dim(data_t) #[1] 8 27186
## [1] 8 27186
pca <- prcomp(data_t[,1:ncol(data_t)], scale=T)
# sdev: standard deviation of the principle components.
# Square to get variance
# percentVar <- pca$sdev^2 / sum( pca$sdev^2)
class(pca)
## [1] "prcomp"
head(pca$rotation)
## PC1 PC2 PC3 PC4 PC5
## FN1 0.003375674 0.001603088 0.013032407 0.0033047633 0.005622747
## DCN -0.000474038 0.002351453 0.013059399 -0.0054618432 0.002763769
## CEMIP 0.007794261 -0.006817366 0.006911881 0.0019985133 0.005560399
## CCDC80 0.003618220 0.005394765 0.011899741 -0.0002095483 0.006838037
## IGFBP5 -0.001894905 -0.006399577 0.009984163 0.0051404839 0.005190594
## COL1A1 -0.012379328 0.001006008 0.000173108 0.0007415088 -0.002957843
## PC6 PC7 PC8
## FN1 -0.0018193462 0.0008660537 -0.53680622
## DCN 0.0002170529 0.0046613219 -0.67176512
## CEMIP -0.0041415418 -0.0036421551 -0.27829064
## CCDC80 -0.0004472703 0.0009623146 -0.36750832
## IGFBP5 0.0038033410 0.0067317282 -0.17525389
## COL1A1 0.0008065496 -0.0010377938 0.05358061
pca$x
## PC1 PC2 PC3 PC4 PC5 PC6
## untrt_N61311 -64.94613 76.257162 -53.30529 17.575301 -9.587639 15.79482
## untrt_N052611 -66.85147 2.349272 49.20816 59.893193 92.725828 26.90325
## untrt_N080611 -82.62322 -83.840966 -22.21228 1.180698 -70.198448 53.87055
## untrt_N061011 -73.20773 44.449296 64.21915 -60.753305 -23.918322 -80.40440
## trt_N61311 66.20075 65.743079 -109.31326 15.182844 1.038331 -13.26029
## trt_N052611 101.20186 -6.609691 75.76232 90.605029 -46.959427 -16.05356
## trt_N080611 33.84331 -118.934263 -50.24068 -24.837638 41.994287 -52.44947
## trt_N061011 86.38263 20.586112 45.88189 -98.846123 14.905389 65.59911
## PC7 PC8
## untrt_N61311 -92.724984 7.112107e-14
## untrt_N052611 30.884873 -3.571848e-13
## untrt_N080611 34.035003 2.046873e-13
## untrt_N061011 24.256303 3.451179e-13
## trt_N61311 67.389135 3.730197e-13
## trt_N052611 -17.820980 -1.199263e-13
## trt_N080611 -37.076579 -4.631087e-14
## trt_N061011 -8.942772 -4.315068e-13
dim(pca$rotation) #[1] 27186 8
## [1] 27186 8
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 79.3768 69.4475 67.8222 61.4302 51.24433 50.46626
## Proportion of Variance 0.2318 0.1774 0.1692 0.1388 0.09659 0.09368
## Cumulative Proportion 0.2318 0.4092 0.5784 0.7172 0.81377 0.90745
## PC7 PC8
## Standard deviation 50.15985 4.644e-13
## Proportion of Variance 0.09255 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
get_eig(pca)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 6.300681e+03 2.317620e+01 23.17620
## Dim.2 4.822958e+03 1.774060e+01 40.91679
## Dim.3 4.599853e+03 1.691993e+01 57.83673
## Dim.4 3.773671e+03 1.388094e+01 71.71766
## Dim.5 2.625981e+03 9.659314e+00 81.37698
## Dim.6 2.546844e+03 9.368218e+00 90.74520
## Dim.7 2.516011e+03 9.254804e+00 100.00000
## Dim.8 2.156897e-25 7.933853e-28 100.00000
class(summary(pca))
## [1] "summary.prcomp"
screeplot(pca,type = "line")

# To check what's in pca
print(str(pca))
## List of 5
## $ sdev : num [1:8] 79.4 69.4 67.8 61.4 51.2 ...
## $ rotation: num [1:27186, 1:8] 0.003376 -0.000474 0.007794 0.003618 -0.001895 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:27186] "FN1" "DCN" "CEMIP" "CCDC80" ...
## .. ..$ : chr [1:8] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:27186] 326621 298338 122465 174021 173705 ...
## ..- attr(*, "names")= chr [1:27186] "FN1" "DCN" "CEMIP" "CCDC80" ...
## $ scale : Named num [1:27186] 90294 82313 71438 59013 67497 ...
## ..- attr(*, "names")= chr [1:27186] "FN1" "DCN" "CEMIP" "CCDC80" ...
## $ x : num [1:8, 1:8] -64.9 -66.9 -82.6 -73.2 66.2 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:8] "untrt_N61311" "untrt_N052611" "untrt_N080611" "untrt_N061011" ...
## .. ..$ : chr [1:8] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
## NULL
#PCA display results
#pdf("1.pdf")
fviz_eig(pca, addlabels = TRUE)

#dev.off()
# PCA sample cluster info
fviz_pca_ind(pca, repel=T)

#add color
data_t <- as.data.frame(data_t)
data_t$conditions <- c(rep("untrt",4),rep("trt",4))
fviz_pca_ind(pca, col.ind=data_t$conditions,mean.point=F, addEllipses = T, legend.title="Groups")

# add 95% Confidence interval
fviz_pca_ind(pca, col.ind=data_t$conditions, mean.point=F, addEllipses = T, legend.title="Groups", ellipse.type="confidence", ellipse.level=0.95)

#Show the most contributing variables (genes)
#Show variables whose correlation with the main axis is greater than 0.99 (the specific numbers are adjusted by yourself)
# Visualize variable with cos2 >= 0.99
fviz_pca_var(pca, select.var = list(cos2 = 0.99), repel=T, col.var = "cos2", geom.var = c("arrow", "text") )

#Show the 10 variables most relevant to the main axis
# Top 10 active variables with the highest cos2
fviz_pca_var(pca, select.var= list(cos2 = 10), repel=T, col.var = "contrib")

#Display the correlation distribution of variables (genes) with the main axis
# Select by names
# top 10 gene with maximum mad
name <- list(name = c("FN1", "DCN", "CEMIP","CCDC80","IGFBP5","COL1A1","GREM1"))
fviz_pca_var(pca, select.var = name)

# top 5 contributing individuals and variable
#6. biPLot
fviz_pca_biplot(pca,
fill.ind=data_t$conditions,
palette="joo",
addEllipses = T,
ellipse.type="confidence",
ellipse.level=0.95,
mean.point=F,
col.var="contrib",
gradient.cols = "RdYlBu",
select.var = list(contrib = 10),
ggtheme = theme_minimal())

#################################2d plot
scores = as.data.frame(pca$x)
data_plot <- scores
data_plot$source <- row.names(data_plot)
data_plot$source <- gsub("_.*.", "", data_plot$source )
variance.percent_1 <- round(get_eig(pca)$variance.percent,digits = 2)
ggplot(data_plot,aes(x=PC1,y=PC2,color=source)) +
geom_point(aes(alpha=0.5)) +
theme(panel.background = element_blank(),
legend.position = "",
panel.border=element_rect(fill=NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background=element_blank(),
axis.text.x=element_text(colour="black"),
axis.text.y=element_text(colour="black"),
axis.ticks=element_line(colour="black"),
plot.margin=unit(c(1,1,1,1),"line")) +
xlab(paste0("PC1"," (",variance.percent_1[1],"%)")) +
ylab(paste0("PC1"," (",variance.percent_1[2],"%)")) +
scale_alpha_continuous(guide=FALSE) +
stat_ellipse( level = 0.95)

ggsave(filename = paste0(Sys.Date(),"-","pca-2d",".tif"),
plot = last_plot(), device = "tiff",
width =12, height = 10, units = "cm",
dpi = 300, limitsize = TRUE, compression = "lzw")
#########################3d plot
library("scatterplot3d")
scores = as.data.frame(pca$x)
row.names(scores)
## [1] "untrt_N61311" "untrt_N052611" "untrt_N080611" "untrt_N061011"
## [5] "trt_N61311" "trt_N052611" "trt_N080611" "trt_N061011"
scores$source <- gsub("_.*.", "", row.names(scores))
scores$source <- as.factor(scores$source)
colors <- c("#999999", "#E69F00")
colors <- colors[as.numeric(scores$source)]
variance.percent <- round(get_eig(pca)$variance.percent,digits = 2)
s3d <- scatterplot3d(scores[,c(1,3,2)], pch = 16, box =F,grid = T,type="h",
main="PCA",color=colors,
angle = 25, scale.y= 1,highlight.3d=F,
xlab= paste0("PC1"," (",variance.percent[1],"%)"),
ylab= paste0("PC3"," (",variance.percent[3],"%)"),
zlab= paste0("PC2"," (",variance.percent[2],"%)"))
text(s3d$xyz.convert(scores[,c(1,3,2)]), labels = rownames(scores),
cex= 0.7, col = "steelblue")
legend("top", legend = levels(scores$source),
col = c("#999999", "#E69F00"), pch = 16,inset = -0.15, xpd = T, horiz = T)
