0.Import files

##Read Expression Files

ExpressionFile <- "C:/Users/astellas/Desktop/2206_test/Tpm_matrix.txt"
ExpList <- read.table(file=ExpressionFile, header=TRUE, sep="\t", quote="")
ExpList[is.na(ExpList)] <- 0

##Read Label Files

Label_matrix <- "C:/Users/astellas/Desktop/2206_test/Label_matrix.txt"
Label_matrix <- read.table(file=Label_matrix, header=TRUE, sep="\t", quote="")
group <- data.frame(con = Label_matrix$Group.ID)
genotypes = Label_matrix$Group.ID

##only protein-coding

ExpList2 <- ExpList[(ExpList$Biotype=="protein-coding"),]

##log2(TPM+1) transformed

ExpList3 <- ExpList2[,c(5:ncol(ExpList2))]
rownames(ExpList3) <- as.character(ExpList2$GeneName)
ExpListLog <- log2(ExpList3+1)

##filter out max>0 #※要相談

ExpListLog$max <- apply(ExpListLog,1,max)  
ExpListLog2 <- ExpListLog
ExpListLog2 <- ExpListLog2[ExpListLog2$max>0,-as.numeric(ncol(ExpListLog2))]

##draw box plot（Quality Check）

par(cex.axis=0.5)
boxplot(ExpListLog2[,c(1:54)])

1.階層的クラスタリング

##必要なライブラリの読み込み

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(proxy))
suppressPackageStartupMessages(library(ggdendro))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(stats))

##距離行列の生成

data.dist<- dist(t(ExpListLog2[,c(1:ncol(ExpListLog2))]),method = "Euclidean")

##階層的クラスタリングの実行

hcOut <- hclust(data.dist, method="ward.D2")

##階層的クラスタリングの可視化

plot(hcOut, hang=-.1,cex=.8)

2.ヒートマップ

##ヒートマップの作成と出力（最長距離法）

heatmapdata<-ExpListLog2[1:15,]
heatmap(as.matrix(heatmapdata), margin=c(4,8), main="Title", cexRow = 1, cexCol = .4)

so_color =  colorRampPalette(c("gray","yellow","orange","red","red4"))(n=199)

##ヒートマップの作成と出力（Ward法）

suppressPackageStartupMessages(library(amap))
heatmapdata<-ExpListLog2[1:15,]
d1<-Dist(heatmapdata, method="pearson")
d2<-Dist(t(heatmapdata), method="pearson")
c1<-hclust(d1, method="ward.D")
c2<-hclust(d2, method="ward.D")
Colour <- colorRampPalette(c("gray","yellow","orange","red","red4"))(n=199)
heatmap(as.matrix(heatmapdata),
  dendrogram = "both",
  Colv=as.dendrogram(c2),
  Rowv=as.dendrogram(c1),
  scale="none",
  key = T,
  col=Colour,
  density.info="none",
  main="Heatmap (Ward algorythm)",
  trace = "none",
  cexRow = 1, cexCol = .4)

## Warning in plot.window(...): "dendrogram" はグラフィックスパラメータではありませ
## ん

## Warning in plot.window(...): "key" はグラフィックスパラメータではありません

## Warning in plot.window(...): "density.info" はグラフィックスパラメータではありま
## せん

## Warning in plot.window(...): "trace" はグラフィックスパラメータではありません

## Warning in plot.xy(xy, type, ...): "dendrogram" はグラフィックスパラメータではあ
## りません

## Warning in plot.xy(xy, type, ...): "key" はグラフィックスパラメータではありませ
## ん

## Warning in plot.xy(xy, type, ...): "density.info" はグラフィックスパラメータでは
## ありません

## Warning in plot.xy(xy, type, ...): "trace" はグラフィックスパラメータではありま
## せん

## Warning in title(...): "dendrogram" はグラフィックスパラメータではありません

## Warning in title(...): "key" はグラフィックスパラメータではありません

## Warning in title(...): "density.info" はグラフィックスパラメータではありません

## Warning in title(...): "trace" はグラフィックスパラメータではありません

3.主成分分析（PCA、UMAP）

##必要なライブラリの読み込み

suppressPackageStartupMessages(library(umap))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(ggplot2))

PCA

##PCAの定義

pcs = prcomp(t(ExpListLog2), center = TRUE)
percentVar = round(((pcs$sdev) ^ 2 / sum((pcs$sdev) ^ 2)* 100), 2)

##PCAの寄与率

summary(pcs)$importance

##                             PC1      PC2      PC3      PC4      PC5      PC6
## Standard deviation     32.43453 19.68110 12.25057 9.344967 6.678653 4.792006
## Proportion of Variance  0.50386  0.18552  0.07188 0.041830 0.021360 0.011000
## Cumulative Proportion   0.50386  0.68938  0.76126 0.803090 0.824450 0.835450
##                             PC7      PC8      PC9    PC10     PC11     PC12
## Standard deviation     4.560889 4.049757 3.915877 3.63174 3.416281 3.267087
## Proportion of Variance 0.009960 0.007860 0.007340 0.00632 0.005590 0.005110
## Cumulative Proportion  0.845410 0.853270 0.860610 0.86693 0.872520 0.877630
##                            PC13     PC14     PC15     PC16    PC17     PC18
## Standard deviation     3.128997 3.102384 3.091893 3.015076 3.00025 2.945122
## Proportion of Variance 0.004690 0.004610 0.004580 0.004350 0.00431 0.004150
## Cumulative Proportion  0.882320 0.886930 0.891510 0.895860 0.90018 0.904330
##                            PC19     PC20    PC21     PC22     PC23     PC24
## Standard deviation     2.885202 2.871433 2.82279 2.767247 2.735249 2.710498
## Proportion of Variance 0.003990 0.003950 0.00382 0.003670 0.003580 0.003520
## Cumulative Proportion  0.908320 0.912270 0.91608 0.919750 0.923330 0.926850
##                            PC25     PC26    PC27     PC28     PC29     PC30
## Standard deviation     2.651883 2.590401 2.58981 2.545676 2.528982 2.512041
## Proportion of Variance 0.003370 0.003210 0.00321 0.003100 0.003060 0.003020
## Cumulative Proportion  0.930220 0.933430 0.93665 0.939750 0.942810 0.945840
##                            PC31     PC32     PC33     PC34     PC35     PC36
## Standard deviation     2.462259 2.451445 2.430107 2.407363 2.383432 2.348663
## Proportion of Variance 0.002900 0.002880 0.002830 0.002780 0.002720 0.002640
## Cumulative Proportion  0.948740 0.951620 0.954450 0.957220 0.959940 0.962590
##                           PC37     PC38     PC39     PC40     PC41     PC42
## Standard deviation     2.33343 2.304268 2.287721 2.267466 2.263839 2.220279
## Proportion of Variance 0.00261 0.002540 0.002510 0.002460 0.002450 0.002360
## Cumulative Proportion  0.96519 0.967740 0.970240 0.972710 0.975160 0.977520
##                            PC43     PC44     PC45     PC46     PC47     PC48
## Standard deviation     2.192552 2.160124 2.152073 2.115604 2.104637 2.073349
## Proportion of Variance 0.002300 0.002230 0.002220 0.002140 0.002120 0.002060
## Cumulative Proportion  0.979820 0.982060 0.984280 0.986420 0.988540 0.990600
##                            PC49     PC50     PC51     PC52     PC53
## Standard deviation     2.053052 2.035781 1.978957 1.950353 1.882348
## Proportion of Variance 0.002020 0.001980 0.001880 0.001820 0.001700
## Cumulative Proportion  0.992620 0.994610 0.996480 0.998300 1.000000
##                                PC54
## Standard deviation     4.028429e-14
## Proportion of Variance 0.000000e+00
## Cumulative Proportion  1.000000e+00

##PCAの可視化

ggplot(as.data.frame(pcs$x), aes(PC1,PC2), environment = environment()) + 
      geom_point(size = 2, aes(colour = genotypes)) +  
      theme(legend.text = element_text(size = 16, face = "bold"),
            legend.title = element_text(size = 16, colour = "black", face = "bold"),
            plot.title = element_text(size = 0, face ="bold"),
            axis.title = element_text(size = 18, face = "bold"),
            axis.text.x = element_text(size = 16, face = "bold", color = "black"),
            axis.text.y = element_text(size = 16, face = "bold", color = "black"),
            plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm"))

UMAP

custom.config = umap.defaults
custom.config$random_state = 1636

UMAP_data <- pcs$x[,1:30]

UMAP_res <- umap(UMAP_data, config=custom.config)
  
df_umap <- data.frame(x = UMAP_res$layout[,1],
                 y = UMAP_res$layout[,2],
                 Celltype=Label_matrix$Group,
                 Day=Label_matrix$Day)

plot1<- ggplot(df_umap, aes(x, y,color=as.factor(Day))) + 
  geom_point(size=3) + theme_classic() + labs(color='Day') +
  scale_color_manual(values = c("black", "blue","green","red","yellow","chocolate")) +
          theme(legend.text  = element_text(size=18),
                legend.title = element_text(size=18),
              axis.title.x = element_text(size = 16),
              axis.title.y = element_text(size = 16),
              axis.text.x = element_text(size = 16),
              axis.text.y = element_text(size = 16))

plot2<- ggplot(df_umap, aes(x, y,color=Celltype))+ 
  geom_point(size=3) + theme_classic() + 
            theme(legend.text  = element_text(size=12),
                legend.title = element_blank(),
              axis.title.x = element_text(size = 16),
              axis.title.y = element_text(size = 16),
              axis.text.x = element_text(size = 16),
              axis.text.y = element_text(size = 16)) +
  scale_color_manual(values = c("palevioletred1",
                                "chocolate",
                                "violet",
                                "navy",
                                "blue",
                                "cyan",
                                "green",
                                "darkgreen",
                                "yellowgreen",
                                "lightseagreen",
                                "yellow",
                                "orange",
                                "gold4",
                                "red"))

plot_grid(plot1, plot2)

##UMAPの保存

save_plot(filename = "TPM_to_UMAP.png",
          plot_grid(plot1, plot2),
          base_height = 4,
          base_width = 10)

ggsave(filename = "TPM_to_UMAP_day.png",plot = plot1,
       width = 8,
       height = 6)

ggsave(filename = "TPM_to_UMAP_label.png",plot = plot2,
       width = 8,
       height = 5.5)

4.発現差異解析 MA-plot,GO analysis

##必要なライブラリの読み込み

suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(tictoc))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(VennDiagram))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(stats))

##カウントマトリクスの読み込み（Raw data）

Raw_count_matrix <- "C:/Users/astellas/Desktop/2206_test/Raw_count_matrix.txt"
data_raw <- read.table(file=Raw_count_matrix, header=TRUE, sep="\t", quote="")
data_raw[is.na(data_raw)] <- 0

##only protein-coding

data_raw2 <- data_raw[(data_raw$Biotype=="protein-coding"),]

##数値のみデータに変更

rownames(data_raw2) <- data_raw2$GeneID
data <- data_raw2[,c(5:ncol(data_raw2))]

##Read Label Files

Label_matrix2 <- "C:/Users/astellas/Desktop/2206_test/Label_matrix2.txt"
Label_matrix2 <- read.table(file=Label_matrix2, header=TRUE, sep="\t", quote="")
group2 <- data.frame(con = Label_matrix2$Group.ID)
genotypes2 = Label_matrix2$Group.ID

##Datasetの作成（所要時間2分程度）

dds <- DESeqDataSetFromMatrix(countData = data,
                              colData = group2, 
                              design = ~ con)

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

tic()
dds <- DESeq(dds)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

toc()

## 53.79 sec elapsed

save(dds, file = "dds.rda")

##比較サンプルの選択

load("dds.rda")
res1 <- results(dds, 
                alpha=0.1, 
                tidy=TRUE, 
                contrast = c("con","ENG.d10","ENG.d3"))

##DEGの定義

res1 <- tibble::as_tibble(res1)
DEG1 <-dplyr::filter(res1, padj <= 0.1 &(log2FoldChange>= 1|log2FoldChange<= -1))
UP1 <-dplyr::filter(DEG1, log2FoldChange>= 1)
DOWN1 <-dplyr::filter(DEG1, log2FoldChange<= -1)
UP1_ID <- UP1$row
DOWN1_ID <- DOWN1$row
length(UP1_ID)

## [1] 2049

length(DOWN1_ID)

## [1] 771

##MA plot

# define threashold for coloring
res1$col <- res1$padj <= 0.1
# replace NA to FALSE
res1$col <- tidyr::replace_na(res1$col, FALSE) 

g1 <- ggplot(data = res1 ,aes(x=baseMean,y=log2FoldChange))
g1 <- g1 + geom_point(size=1,aes(color = col)) + scale_x_log10(oob = scales::squish_infinite) + 
        scale_colour_manual(values = c("FALSE" = "grey", "TRUE" = "red"))
g1 <- g1 + geom_hline(yintercept = c(1,-1),color="dodgerblue")
g1 <- g1 + theme_bw() + theme(legend.position = "none")
g1 <- g1 + ggtitle("MA-Plot ENG.d1 vs ENG.d3") + 
        theme(title = element_text(size=24),
              axis.title.x = element_text(size = 22),
              axis.title.y = element_text(size = 22),
              axis.text.x = element_text(size = 20),
              axis.text.y = element_text(size = 20))
g1 <- g1 + scale_y_continuous(breaks=seq(-6,6,by=2),
                              limits=c(-6, 6),oob=squish)
g1

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 2788 rows containing missing values (geom_point).

##MA plotの保存

ID_name_biotype <- data_raw2[1:3]
test <- subset(ID_name_biotype, ID_name_biotype$GeneID %in% DEG1$row)
DEG1.2 <- cbind(test, DEG1)
write.table(DEG1.2,
            file = "MA-Plot ENG.d1 vs ENG.d3",
            sep = "\t", 
            row.names = F,
            quote=F)

##GO analysis

UP1_geneList<-as.data.frame(cbind(ExpList[ExpList$GeneID %in% UP1$row,],UP1))
write.table(UP1_geneList, "C:/Users/astellas/Desktop/2206_test/UP1_geneList.txt")
DOWN1_geneList<-as.data.frame(cbind(ExpList[ExpList$GeneID %in% DOWN1$row,],DOWN1))
write.table(DOWN1_geneList, "C:/Users/astellas/Desktop/2206_test/DOWN1_geneList.txt")

→ https://david.ncifcrf.gov/summary.jsp

CHO cell analysis

Saki Saito

2022-06-13

0.Import files

1.階層的クラスタリング

2.ヒートマップ

3.主成分分析（PCA、UMAP）

PCA

UMAP

4.発現差異解析 MA-plot,GO analysis