##Read Expression Files
ExpressionFile <- "C:/Users/astellas/Desktop/2206_test/Tpm_matrix.txt"
ExpList <- read.table(file=ExpressionFile, header=TRUE, sep="\t", quote="")
ExpList[is.na(ExpList)] <- 0
##Read Label Files
Label_matrix <- "C:/Users/astellas/Desktop/2206_test/Label_matrix.txt"
Label_matrix <- read.table(file=Label_matrix, header=TRUE, sep="\t", quote="")
group <- data.frame(con = Label_matrix$Group.ID)
genotypes = Label_matrix$Group.ID
##only protein-coding
ExpList2 <- ExpList[(ExpList$Biotype=="protein-coding"),]
##log2(TPM+1) transformed
ExpList3 <- ExpList2[,c(5:ncol(ExpList2))]
rownames(ExpList3) <- as.character(ExpList2$GeneName)
ExpListLog <- log2(ExpList3+1)
##filter out max>0 #※要相談
ExpListLog$max <- apply(ExpListLog,1,max)
ExpListLog2 <- ExpListLog
ExpListLog2 <- ExpListLog2[ExpListLog2$max>0,-as.numeric(ncol(ExpListLog2))]
##draw box plot(Quality Check)
par(cex.axis=0.5)
boxplot(ExpListLog2[,c(1:54)])
##必要なライブラリの読み込み
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(proxy))
suppressPackageStartupMessages(library(ggdendro))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(stats))
##距離行列の生成
data.dist<- dist(t(ExpListLog2[,c(1:ncol(ExpListLog2))]),method = "Euclidean")
##階層的クラスタリングの実行
hcOut <- hclust(data.dist, method="ward.D2")
##階層的クラスタリングの可視化
plot(hcOut, hang=-.1,cex=.8)
##ヒートマップの作成と出力(最長距離法)
heatmapdata<-ExpListLog2[1:15,]
heatmap(as.matrix(heatmapdata), margin=c(4,8), main="Title", cexRow = 1, cexCol = .4)
so_color = colorRampPalette(c("gray","yellow","orange","red","red4"))(n=199)
##ヒートマップの作成と出力(Ward法)
suppressPackageStartupMessages(library(amap))
heatmapdata<-ExpListLog2[1:15,]
d1<-Dist(heatmapdata, method="pearson")
d2<-Dist(t(heatmapdata), method="pearson")
c1<-hclust(d1, method="ward.D")
c2<-hclust(d2, method="ward.D")
Colour <- colorRampPalette(c("gray","yellow","orange","red","red4"))(n=199)
heatmap(as.matrix(heatmapdata),
dendrogram = "both",
Colv=as.dendrogram(c2),
Rowv=as.dendrogram(c1),
scale="none",
key = T,
col=Colour,
density.info="none",
main="Heatmap (Ward algorythm)",
trace = "none",
cexRow = 1, cexCol = .4)
## Warning in plot.window(...): "dendrogram" はグラフィックスパラメータではありませ
## ん
## Warning in plot.window(...): "key" はグラフィックスパラメータではありません
## Warning in plot.window(...): "density.info" はグラフィックスパラメータではありま
## せん
## Warning in plot.window(...): "trace" はグラフィックスパラメータではありません
## Warning in plot.xy(xy, type, ...): "dendrogram" はグラフィックスパラメータではあ
## りません
## Warning in plot.xy(xy, type, ...): "key" はグラフィックスパラメータではありませ
## ん
## Warning in plot.xy(xy, type, ...): "density.info" はグラフィックスパラメータでは
## ありません
## Warning in plot.xy(xy, type, ...): "trace" はグラフィックスパラメータではありま
## せん
## Warning in title(...): "dendrogram" はグラフィックスパラメータではありません
## Warning in title(...): "key" はグラフィックスパラメータではありません
## Warning in title(...): "density.info" はグラフィックスパラメータではありません
## Warning in title(...): "trace" はグラフィックスパラメータではありません
##必要なライブラリの読み込み
suppressPackageStartupMessages(library(umap))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(ggplot2))
##PCAの定義
pcs = prcomp(t(ExpListLog2), center = TRUE)
percentVar = round(((pcs$sdev) ^ 2 / sum((pcs$sdev) ^ 2)* 100), 2)
##PCAの寄与率
summary(pcs)$importance
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 32.43453 19.68110 12.25057 9.344967 6.678653 4.792006
## Proportion of Variance 0.50386 0.18552 0.07188 0.041830 0.021360 0.011000
## Cumulative Proportion 0.50386 0.68938 0.76126 0.803090 0.824450 0.835450
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 4.560889 4.049757 3.915877 3.63174 3.416281 3.267087
## Proportion of Variance 0.009960 0.007860 0.007340 0.00632 0.005590 0.005110
## Cumulative Proportion 0.845410 0.853270 0.860610 0.86693 0.872520 0.877630
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 3.128997 3.102384 3.091893 3.015076 3.00025 2.945122
## Proportion of Variance 0.004690 0.004610 0.004580 0.004350 0.00431 0.004150
## Cumulative Proportion 0.882320 0.886930 0.891510 0.895860 0.90018 0.904330
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 2.885202 2.871433 2.82279 2.767247 2.735249 2.710498
## Proportion of Variance 0.003990 0.003950 0.00382 0.003670 0.003580 0.003520
## Cumulative Proportion 0.908320 0.912270 0.91608 0.919750 0.923330 0.926850
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 2.651883 2.590401 2.58981 2.545676 2.528982 2.512041
## Proportion of Variance 0.003370 0.003210 0.00321 0.003100 0.003060 0.003020
## Cumulative Proportion 0.930220 0.933430 0.93665 0.939750 0.942810 0.945840
## PC31 PC32 PC33 PC34 PC35 PC36
## Standard deviation 2.462259 2.451445 2.430107 2.407363 2.383432 2.348663
## Proportion of Variance 0.002900 0.002880 0.002830 0.002780 0.002720 0.002640
## Cumulative Proportion 0.948740 0.951620 0.954450 0.957220 0.959940 0.962590
## PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 2.33343 2.304268 2.287721 2.267466 2.263839 2.220279
## Proportion of Variance 0.00261 0.002540 0.002510 0.002460 0.002450 0.002360
## Cumulative Proportion 0.96519 0.967740 0.970240 0.972710 0.975160 0.977520
## PC43 PC44 PC45 PC46 PC47 PC48
## Standard deviation 2.192552 2.160124 2.152073 2.115604 2.104637 2.073349
## Proportion of Variance 0.002300 0.002230 0.002220 0.002140 0.002120 0.002060
## Cumulative Proportion 0.979820 0.982060 0.984280 0.986420 0.988540 0.990600
## PC49 PC50 PC51 PC52 PC53
## Standard deviation 2.053052 2.035781 1.978957 1.950353 1.882348
## Proportion of Variance 0.002020 0.001980 0.001880 0.001820 0.001700
## Cumulative Proportion 0.992620 0.994610 0.996480 0.998300 1.000000
## PC54
## Standard deviation 4.028429e-14
## Proportion of Variance 0.000000e+00
## Cumulative Proportion 1.000000e+00
##PCAの可視化
ggplot(as.data.frame(pcs$x), aes(PC1,PC2), environment = environment()) +
geom_point(size = 2, aes(colour = genotypes)) +
theme(legend.text = element_text(size = 16, face = "bold"),
legend.title = element_text(size = 16, colour = "black", face = "bold"),
plot.title = element_text(size = 0, face ="bold"),
axis.title = element_text(size = 18, face = "bold"),
axis.text.x = element_text(size = 16, face = "bold", color = "black"),
axis.text.y = element_text(size = 16, face = "bold", color = "black"),
plot.margin = unit(c(0.5,0.5,0.5,0.5), "cm"))
custom.config = umap.defaults
custom.config$random_state = 1636
UMAP_data <- pcs$x[,1:30]
UMAP_res <- umap(UMAP_data, config=custom.config)
df_umap <- data.frame(x = UMAP_res$layout[,1],
y = UMAP_res$layout[,2],
Celltype=Label_matrix$Group,
Day=Label_matrix$Day)
plot1<- ggplot(df_umap, aes(x, y,color=as.factor(Day))) +
geom_point(size=3) + theme_classic() + labs(color='Day') +
scale_color_manual(values = c("black", "blue","green","red","yellow","chocolate")) +
theme(legend.text = element_text(size=18),
legend.title = element_text(size=18),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16))
plot2<- ggplot(df_umap, aes(x, y,color=Celltype))+
geom_point(size=3) + theme_classic() +
theme(legend.text = element_text(size=12),
legend.title = element_blank(),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.x = element_text(size = 16),
axis.text.y = element_text(size = 16)) +
scale_color_manual(values = c("palevioletred1",
"chocolate",
"violet",
"navy",
"blue",
"cyan",
"green",
"darkgreen",
"yellowgreen",
"lightseagreen",
"yellow",
"orange",
"gold4",
"red"))
plot_grid(plot1, plot2)
##UMAPの保存
save_plot(filename = "TPM_to_UMAP.png",
plot_grid(plot1, plot2),
base_height = 4,
base_width = 10)
ggsave(filename = "TPM_to_UMAP_day.png",plot = plot1,
width = 8,
height = 6)
ggsave(filename = "TPM_to_UMAP_label.png",plot = plot2,
width = 8,
height = 5.5)
##必要なライブラリの読み込み
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(tictoc))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(VennDiagram))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(stats))
##カウントマトリクスの読み込み(Raw data)
Raw_count_matrix <- "C:/Users/astellas/Desktop/2206_test/Raw_count_matrix.txt"
data_raw <- read.table(file=Raw_count_matrix, header=TRUE, sep="\t", quote="")
data_raw[is.na(data_raw)] <- 0
##only protein-coding
data_raw2 <- data_raw[(data_raw$Biotype=="protein-coding"),]
##数値のみデータに変更
rownames(data_raw2) <- data_raw2$GeneID
data <- data_raw2[,c(5:ncol(data_raw2))]
##Read Label Files
Label_matrix2 <- "C:/Users/astellas/Desktop/2206_test/Label_matrix2.txt"
Label_matrix2 <- read.table(file=Label_matrix2, header=TRUE, sep="\t", quote="")
group2 <- data.frame(con = Label_matrix2$Group.ID)
genotypes2 = Label_matrix2$Group.ID
##Datasetの作成(所要時間2分程度)
dds <- DESeqDataSetFromMatrix(countData = data,
colData = group2,
design = ~ con)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
tic()
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
toc()
## 53.79 sec elapsed
save(dds, file = "dds.rda")
##比較サンプルの選択
load("dds.rda")
res1 <- results(dds,
alpha=0.1,
tidy=TRUE,
contrast = c("con","ENG.d10","ENG.d3"))
##DEGの定義
res1 <- tibble::as_tibble(res1)
DEG1 <-dplyr::filter(res1, padj <= 0.1 &(log2FoldChange>= 1|log2FoldChange<= -1))
UP1 <-dplyr::filter(DEG1, log2FoldChange>= 1)
DOWN1 <-dplyr::filter(DEG1, log2FoldChange<= -1)
UP1_ID <- UP1$row
DOWN1_ID <- DOWN1$row
length(UP1_ID)
## [1] 2049
length(DOWN1_ID)
## [1] 771
##MA plot
# define threashold for coloring
res1$col <- res1$padj <= 0.1
# replace NA to FALSE
res1$col <- tidyr::replace_na(res1$col, FALSE)
g1 <- ggplot(data = res1 ,aes(x=baseMean,y=log2FoldChange))
g1 <- g1 + geom_point(size=1,aes(color = col)) + scale_x_log10(oob = scales::squish_infinite) +
scale_colour_manual(values = c("FALSE" = "grey", "TRUE" = "red"))
g1 <- g1 + geom_hline(yintercept = c(1,-1),color="dodgerblue")
g1 <- g1 + theme_bw() + theme(legend.position = "none")
g1 <- g1 + ggtitle("MA-Plot ENG.d1 vs ENG.d3") +
theme(title = element_text(size=24),
axis.title.x = element_text(size = 22),
axis.title.y = element_text(size = 22),
axis.text.x = element_text(size = 20),
axis.text.y = element_text(size = 20))
g1 <- g1 + scale_y_continuous(breaks=seq(-6,6,by=2),
limits=c(-6, 6),oob=squish)
g1
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 2788 rows containing missing values (geom_point).
##MA plotの保存
ID_name_biotype <- data_raw2[1:3]
test <- subset(ID_name_biotype, ID_name_biotype$GeneID %in% DEG1$row)
DEG1.2 <- cbind(test, DEG1)
write.table(DEG1.2,
file = "MA-Plot ENG.d1 vs ENG.d3",
sep = "\t",
row.names = F,
quote=F)
##GO analysis
UP1_geneList<-as.data.frame(cbind(ExpList[ExpList$GeneID %in% UP1$row,],UP1))
write.table(UP1_geneList, "C:/Users/astellas/Desktop/2206_test/UP1_geneList.txt")
DOWN1_geneList<-as.data.frame(cbind(ExpList[ExpList$GeneID %in% DOWN1$row,],DOWN1))
write.table(DOWN1_geneList, "C:/Users/astellas/Desktop/2206_test/DOWN1_geneList.txt")