EDA using ggplot

Venn graph

library(tidyverse)
library(ggforce)  #add geom_circle
data <- data.frame(x = c(0,1,-1),
                   y = c(-0.5, 1, 1),
                  tx = c(0, 1.5, -1.5),
                  ty = c(-1, 1.3, 1.3),
                  cat = c("Domain Experience", "Math", 
                          "Computer Science"))

data %>% 
  ggplot(aes(x0 = x, y0 = y, r = 1.5, fill = cat))+
  geom_circle(alpha = 0.25, size = 1, color = "black", show.legend = FALSE)+
  geom_text(aes(x = tx, y = ty, label = cat), size = 7)+
  annotate(geom = "text", x = 0, y = 1.5, label = "Machine \n Learning", color = "purple", size = 5)+
  annotate(geom = "text", x = -0.9, y = 0, label = "Traditional \n Software", color = "darkorange", size = 5)+
  annotate(geom = "text", x = 0.9, y = 0, label = "Traditional \n Research", color = "darkgreen", size = 5)+
  annotate(geom = "text", x = 0, y = 0.5, label = "Data \n Science", color = "blue", size = 5)+
  theme_void()

# set plot dispaly size
options(repr.plot.height = 8 , repr.plot.width = 8)

library(VennDiagram)
set1 <- sample(1:10000, 2000)
set2 <- sample(1:10000, 2000)
set3 <- sample(1:10000, 2000)
set4 <- sample(1:10000, 2000)
set5 <- sample(1:10000, 2000)
colors <- c("blue","green","darkred","gray","orange")

venn.diagram(x = list(set1, set2, set3, set4, set5),
             category.names = c("s1","s2","s3","s4","s5"),
             output = TRUE,
             filename = 'datadaft_venn.png',
             imagetype = "png",
             scaled = FALSE,
             col = "black",
             fill = colors,
             cat.col = colors,
             cat.cex = 2,
             margin = 0.15)

## [1] 1

options(repr.plot.height = 12, repr.plot.width = 12)
library("png")
pp <- readPNG("datadaft_venn.png")
plot.new()
rasterImage(pp,0,0,1.1,1.1)

Histogram

library(ggplot2)
library(tidyverse)
file1 <- "C:/Users/yizha/Desktop/Spring 2020/ggplot2/Anr.lib.stat.txt"
dat1 <- read.table(file1,sep = "\t",check.names = F,header = T,comment.char = "")
head(dat1)

##        Species_Name Homologous_Number  Ratio
## 1   Citrus sinensis             32266 0.1802
## 2 Citrus clementina             36792 0.2054
## 3   Theobroma cacao              7004 0.0391
## 4    Vitis vinifera              7936 0.0443
## 5  Ricinus communis              3893 0.0217
## 6   Jatropha curcas              5179 0.0289

# simple hist
## order the data on Homogolous
dat1 <- dat1[order(dat1[,2],decreasing = T),]
head(dat1)

##           Species_Name Homologous_Number  Ratio
## 11               Other             46114 0.2575
## 2    Citrus clementina             36792 0.2054
## 1      Citrus sinensis             32266 0.1802
## 7  Gossypium raimondii             18921 0.1056
## 8  Populus trichocarpa             16782 0.0937
## 4       Vitis vinifera              7936 0.0443

# change the order of 'Species_Name' ( from high to low)
dat1$Species_Name <- factor(dat1$Species_Name, levels = dat1$Species_Name, ordered = T)
dat1 %>% 
  ggplot(aes(x = Species_Name, y = Homologous_Number))+
  geom_bar(stat = "identity", width = 0.8, fill = "lightblue") # if width=1, no space between columns

# put the 'other' to the end of the hist
other_row <- dat1[c(1)，] # extract row "other"
dat1 <- dat1[c(-1),] # remove row "other"
dat1 <- rbind(dat1,other_row) # combind, put "other" to the en
dat1$Species_Name <- factor(dat1$Species_Name, levels = dat1$Species_Name, ordered = T)

dat1 %>% 
  ggplot(aes(x = Species_Name, y = Homologous_Number))+
  geom_bar(stat = "identity", width = 0.8, fill = "darkred", color = "black")

# color: frame color
RColorBrewer::display.brewer.all() # color package

dat1 %>% 
  ggplot(aes(x = Species_Name, y = Homologous_Number, fill = Species_Name))+
  geom_bar(stat = "identity", width = 0.8)+
  scale_fill_brewer(palette = "Paired",direction = 1) # color direction in the package

# add text
dat1 %>% 
  ggplot(aes(x = Species_Name, y = Homologous_Number))+
  geom_bar(stat = "identity", width = 0.8, fill = "darkred") +
  geom_text(aes(label=paste(as.character(Ratio*100),"%")), vjust = -0.5,size=3)+
# add labs()
  labs(x="species",y="Unigenes_Num",title="Result of Nr",fill ="species") +
# add title()
  ggtitle(label = "Nr",subtitle = "Homologous_Number") +
# edit theme(), adjust plot.title or plot.subtitle
  theme(plot.title = element_text(size = 25,face = "bold",vjust = 0.5,hjust = 0.5),
         axis.text.x=element_text(size = 5,face = "bold",vjust =1 ,hjust = 1,angle = 30),
        panel.background = element_rect(fill = "transparent", color = NA))

library(ggplot2)
library(tidyverse)
file2 <- "C:/Users/yizha/Desktop/Spring 2020/ggplot2/nr.lib.stat.txt"
dat2 <- read.table(file2,sep = "\t",check.names = F,header = T,comment.char = "") # load the data
head(dat2)

##        Species_Name Cultivar Homologous_Number  Ratio
## 1   Citrus sinensis        A             32266 0.1802
## 2 Citrus clementina        A             36792 0.2054
## 3   Theobroma cacao        A              7004 0.0391
## 4    Vitis vinifera        A              7936 0.0443
## 5  Ricinus communis        A              3893 0.0217
## 6   Jatropha curcas        A              5179 0.0289

# set the order
dat2 <- dat2[order(dat2$Homologous_Number, decreasing = T),]
head(dat2)

##         Species_Name Cultivar Homologous_Number  Ratio
## 22             Other        B             52179 0.2726
## 11             Other        A             46114 0.2575
## 2  Citrus clementina        A             36792 0.2054
## 13 Citrus clementina        B             34862 0.1822
## 1    Citrus sinensis        A             32266 0.1802
## 12   Citrus sinensis        B             29890 0.1562

dodge

other_row_2 <- dat2[c(1,2),] # extract row "other"
dat2 <- dat2[c(-1,-2),] # remove row "other"
dat2 <- rbind(dat2,other_row_2) # combind, put "other" to the end
level <- unique(dat2$Species_Name)

dat2$Species_Name=factor(dat2$Species_Name,levels = level, order=T) # set the order
dat2$Cultivar=factor(dat2$Cultivar,levels = c("A","B"), order=T)  # default is c("A","B")

dat2 %>% 
  ggplot(aes(x = Species_Name, y = Homologous_Number, fill = Cultivar)) +
  geom_bar(stat="identity", width=0.8, position = "dodge")+
  labs(x="Species",y="Unigenes_Num",fill = "Cultivar")+
  ggtitle(label="Nr",subtitle = "Homologous_Number")+
  scale_fill_manual(values=c("darkred","lightblue"))+ # manually fill the color
                                                      # or  scale_fill_brewer(palette = "Paired",direction = 1)
  geom_text(aes(label=paste(as.character(Ratio*100),"%")),position=position_dodge(width = 0.7),vjust = -0.5,size=2)+   # set position_dodge width in case the number% overlap
  theme(axis.text.x=element_text(size = 10,face = "bold", vjust = 1, hjust = 1,angle = 45),
        axis.text.y=element_text(size = 10,face = "bold", vjust = 0.5, hjust = 0.5),
        axis.title.x = element_text(size = 15,face = "bold", vjust = 0.5, hjust = 0.5),
        axis.title.y = element_text(size = 15,face = "bold", vjust = 0.5, hjust = 0.5),
        panel.background = element_rect(fill = "transparent",color = NA))

stack

dat2 %>% 
  ggplot(aes(x = Species_Name, y = Homologous_Number, fill = Cultivar)) +
  geom_bar(stat="identity", width=0.8, position = position_stack(reverse = T))+
  labs(x="Species",y="Unigenes_Num",fill = "Cultivar")+
  ggtitle(label="Nr",subtitle = "Homologous_Number")+
  scale_fill_manual(values=c("gray","pink"))+ # manually fill the color
                                                      # or  scale_fill_brewer(palette = "Paired",direction = 1)
  geom_text(aes(label=paste(as.character(Ratio*100),"%")),position=position_stack(vjust = 0.5, reverse = T),size=2)+   # set position_dodge width in case the number% overlap
  theme(axis.text.x=element_text(size = 10,face = "bold", vjust = 1, hjust = 1,angle = 45),
        axis.text.y=element_text(size = 10,face = "bold", vjust = 0.5, hjust = 0.5),
        axis.title.x = element_text(size = 15,face = "bold", vjust = 0.5, hjust = 0.5),
        axis.title.y = element_text(size = 15,face = "bold", vjust = 0.5, hjust = 0.5),
        panel.background = element_rect(fill = "transparent",color = NA))

Pie & Line chart

dat1 %>% 
  ggplot(aes(x = "", y = Homologous_Number, fill = Species_Name))+
  geom_bar(stat = "identity", width = 1, postion = "stack")+
  coord_polar(theta = "y")+
  scale_fill_brewer(palette = "Set3", direction = -1)+
  geom_text(aes(label = paste(as.character(Ratio*100),"%")),size = 3, position = position_stack(vjust = 0.5))+
   ggtitle(label="Nr",subtitle = "Homologous_Number")

Line chart

file3 <- "C:/Users/yizha/Desktop/Spring 2020/ggplot2/rice.melt.txt"
dat3 <- read.table(file3,sep = "\t",check.names = F,header = T,comment.char = "") # load the data
head(dat3)

##   Cultivar Day Temperature Weight   SD
## 1     CY-1  30          35  26.07 1.05
## 2     CY-1  35          35  15.90 2.74
## 3     CY-1  40          35  20.97 0.22
## 4     CY-1  45          35  17.62 0.23
## 5     CY-1  50          35  13.28 0.48
## 6   NJ9108  30          35  32.79 0.48

# convert the 'temperature' to factor
dat3$Temperature <- factor(dat3$Temperature, ordered = T)

dat3 %>% 
  ggplot(aes(x = Day, y = Weight, ymin = Weight - SD, ymax = Weight + SD, color = Temperature, shape = Cultivar))+
  geom_line(size = 1)+
  geom_point(size = 3 )+
  geom_errorbar(size = 0.01, width = 0.8)+
  scale_fill_brewer(palette = "Dark2")

Density & Boxplot & Freqploy

Density

library(reshape2)
dat4 <- read.table("C:/Users/yizha/Desktop/Spring 2020/ggplot2/gene_fpkm.xls",sep="\t",check.names=F,header=T)      # load the data
head(dat4,3)

##          ID  CK-WT-1  CK-WT-2
## 1 AT1G01010 3.581590 7.072660
## 2 AT1G01020 5.036775 1.794573
## 3 AT1G01030 5.035380 2.676170

dat4 <- dat4 %>% 
  gather(sample,FPKM,2:3)    # from wide to long

dat4 %>% 
  ggplot(aes(x = log10(FPKM), color = sample, fill = sample))+  # 
  geom_density(alpha = 0.25, size = 0.5)+
  xlim(-3, 5)+
  labs(x = "log10(FPKM)", color = "sample", fill = "sample")+ # 'color' and 'fill' must be same with ggplot(aes)
  ggtitle("Gene expression density")+
  theme_bw()    # theme extract: bw() is one of the themes

Boxplot

dat4 %>% 
  ggplot(aes(x = sample,y = log10(FPKM),fill = sample))+
  geom_boxplot(size=0.5,width=0.8,notch=T)+
  ylim(-3,5)+
  ggtitle("Gene expression distribution")+
  labs(y="log10FPKM",x="",fill="Samples")

Hist ~ facet

count

dat4 %>% 
  ggplot(aes(x = log10(FPKM), fill = sample))+
  geom_histogram(binwidth = 1, alpha = 0.5, size = 1, stat = "bin")+
  xlim(-3,5)+
  facet_grid(.~ sample)  # left&right  #if  facet_grid(sample ~ .)   up&down

2. density

dat4 %>% 
  ggplot(aes(x = log10(FPKM), y = ..density.., fill = sample))+
  geom_histogram(binwidth = 1, alpha = 0.5, size = 1, stat = "bin")+
  xlim(-3,5)+
  facet_grid(sample ~.)+
  geom_density(aes(color = sample), alpha = 0, size = 1)

Freqploy

dat4 %>% 
  ggplot(aes(x = log10(FPKM), fill = sample))+ # freqply y default is ..density..
  geom_histogram(binwidth = 1, alpha = 0.5, size = 1, stat = "bin")+
  xlim(-3,5)+
  geom_freqpoly(binwidth = 1, alpha = 0.5, size = 1, stat = "bin")+
  facet_grid(.~ sample)

Volcano & MAplot

dat5 <- read.table("C:/Users/yizha/Desktop/Spring 2020/ggplot2/CK-WT_vs_T-WT.xls",sep="\t",check.names=F,header=T)      # load the data
head(dat5)

##          ID CK-WT-1_FPKM CK-WT-2_FPKM CK-WT-3_FPKM T-WT-1_FPKM T-WT-2_FPKM
## 1 AT1G01010     3.581590     7.072660    5.5734000    4.665280    5.223600
## 2 AT1G01020     5.036775     1.794573    2.6558100    2.097996    1.597660
## 3 AT1G01030     5.035380     2.676170    2.5668800    8.025280    2.525640
## 4 AT1G01040     2.692716     1.499502    1.8464462    3.303312    2.604233
## 5 AT1G01050   114.336000   135.570000  118.7410000  193.546000  238.989000
## 6 AT1G01060     1.801876    16.201413    0.9205497  196.855310   51.067321
##   T-WT-3_FPKM     pValue         FDR     log2FC
## 1    3.010800 0.36822533 0.545003049 -0.3422960
## 2    2.490461 0.21193571 0.372536538 -0.6102622
## 3    2.142350 0.63578012 0.771219419  0.2404737
## 4    1.850929 0.38333440 0.559036960  0.3004839
## 5  246.084000 0.00004980 0.000436363  0.8311114
## 6   10.836779 0.01677208 0.055346930  1.7569446

line_FC <- 2
line_FDR <- 0.01
col <- c("red","blue","grey")
AFPKM <- c(2:4)
BFPKM <- c(5:7)
# set the range
dat5[dat5[,"FDR"] < line_FDR & dat5[,"log2FC"] >= log2(line_FC),ncol(dat5)+1] = "Up"
dat5[dat5[,"FDR"] < line_FDR & -log2(line_FC)  >= dat5[,"log2FC"],ncol(dat5)] = "Down"
dat5[dat5[,"FDR"] >= line_FDR | log2(line_FC) > abs(dat5[,"log2FC"]),ncol(dat5)] = "Normal"
colnames(dat5)[ncol(dat5)]="Regulate"
head(dat5)

##          ID CK-WT-1_FPKM CK-WT-2_FPKM CK-WT-3_FPKM T-WT-1_FPKM T-WT-2_FPKM
## 1 AT1G01010     3.581590     7.072660    5.5734000    4.665280    5.223600
## 2 AT1G01020     5.036775     1.794573    2.6558100    2.097996    1.597660
## 3 AT1G01030     5.035380     2.676170    2.5668800    8.025280    2.525640
## 4 AT1G01040     2.692716     1.499502    1.8464462    3.303312    2.604233
## 5 AT1G01050   114.336000   135.570000  118.7410000  193.546000  238.989000
## 6 AT1G01060     1.801876    16.201413    0.9205497  196.855310   51.067321
##   T-WT-3_FPKM     pValue         FDR     log2FC Regulate
## 1    3.010800 0.36822533 0.545003049 -0.3422960   Normal
## 2    2.490461 0.21193571 0.372536538 -0.6102622   Normal
## 3    2.142350 0.63578012 0.771219419  0.2404737   Normal
## 4    1.850929 0.38333440 0.559036960  0.3004839   Normal
## 5  246.084000 0.00004980 0.000436363  0.8311114   Normal
## 6   10.836779 0.01677208 0.055346930  1.7569446   Normal

Volcano

volcano <- dat5
# c("red","blue","grey") = c("Up", "Down", "Normal")
volcano$Regulate <- factor(volcano$Regulate, levels = c("Up", "Down", "Normal"), order = T)
# x = log2FC  y = -log10(FDR)
volcano %>% 
  ggplot(aes(x = log2FC, y = -log10(FDR)))+
  geom_point(aes(color = Regulate), alpha = 0.5)+
  scale_color_manual(values  = col)+
  geom_hline(yintercept=c(-log10(line_FDR)),linetype=4)+ # set the range line - horizonal
  geom_vline(xintercept=c(-log2(line_FC),log2(line_FC)),linetype=4) # range line - vertical

MAplot

MA <- dat5[,c("ID","log2FC","Regulate")]
head(MA,2)

##          ID     log2FC Regulate
## 1 AT1G01010 -0.3422960   Normal
## 2 AT1G01020 -0.6102622   Normal

MA[,4]=1/2*log2(rowMeans(dat5[,2:4])*rowMeans(dat5[,5:7]))
colnames(MA)[4]="1/2log2FPKM"
head(MA,3)

##          ID     log2FC Regulate 1/2log2FPKM
## 1 AT1G01010 -0.3422960   Normal    2.269860
## 2 AT1G01020 -0.6102622   Normal    1.352543
## 3 AT1G01030  0.2404737   Normal    1.928807

# set color
MA$Regulate <- factor(MA$Regulate,levels = c("Up","Down","Normal"),order=T)
MA %>% 
  ggplot(aes(x=`1/2log2FPKM`,y=log2FC))+
  geom_point(aes(color=Regulate),alpha=0.5)+
  xlim(-5,15)+
  geom_hline(yintercept=c(-log2(line_FC),log2(line_FC)),linetype=4)

Scatter plot

hist flip

dat6 <- read.table("C:/Users/yizha/Desktop/Spring 2020/ggplot2/GO.enrich.txt",sep="\t",check.names=F,header=T)      # load the data
head(dat6)

##   GO_accession          Term_type                 Description Rich_factor
## 1   GO:0006098 Biological Process     pentose-phosphate shunt    4.348735
## 2   GO:0006364 Biological Process             rRNA processing    3.950522
## 3   GO:0009523 Cellular Component              photosystem II    6.107451
## 4   GO:0009534 Cellular Component       chloroplast thylakoid    4.027377
## 5   GO:0009543 Cellular Component chloroplast thylakoid lumen    4.978900
## 6   GO:0009637 Biological Process      response to blue light    3.819835
##     Pvalue DEGs
## 1 0.00e+00  104
## 2 0.00e+00  125
## 3 0.00e+00   23
## 4 1.80e-13   39
## 5 2.30e-13   45
## 6 2.34e-13   52

# order on pvalue
go <- dat6[order(dat6[dat6$Pvalue < 0.01,"Pvalue"],decreasing = F),]
head(go)

##   GO_accession          Term_type                 Description Rich_factor
## 1   GO:0006098 Biological Process     pentose-phosphate shunt    4.348735
## 2   GO:0006364 Biological Process             rRNA processing    3.950522
## 3   GO:0009523 Cellular Component              photosystem II    6.107451
## 4   GO:0009534 Cellular Component       chloroplast thylakoid    4.027377
## 5   GO:0009543 Cellular Component chloroplast thylakoid lumen    4.978900
## 6   GO:0009637 Biological Process      response to blue light    3.819835
##     Pvalue DEGs
## 1 0.00e+00  104
## 2 0.00e+00  125
## 3 0.00e+00   23
## 4 1.80e-13   39
## 5 2.30e-13   45
## 6 2.34e-13   52

# only get the first 20 of pvalue
if(nrow(go)>=20){
  go=go[1:20,]
}
head(go)

##   GO_accession          Term_type                 Description Rich_factor
## 1   GO:0006098 Biological Process     pentose-phosphate shunt    4.348735
## 2   GO:0006364 Biological Process             rRNA processing    3.950522
## 3   GO:0009523 Cellular Component              photosystem II    6.107451
## 4   GO:0009534 Cellular Component       chloroplast thylakoid    4.027377
## 5   GO:0009543 Cellular Component chloroplast thylakoid lumen    4.978900
## 6   GO:0009637 Biological Process      response to blue light    3.819835
##     Pvalue DEGs
## 1 0.00e+00  104
## 2 0.00e+00  125
## 3 0.00e+00   23
## 4 1.80e-13   39
## 5 2.30e-13   45
## 6 2.34e-13   52

# order on term-type
go <- go[order(go$Term_type,decreasing = F),]
head(go)

##   GO_accession          Term_type                      Description Rich_factor
## 1   GO:0006098 Biological Process          pentose-phosphate shunt    4.348735
## 2   GO:0006364 Biological Process                  rRNA processing    3.950522
## 6   GO:0009637 Biological Process           response to blue light    3.819835
## 7   GO:0009644 Biological Process response to high light intensity    2.408318
## 8   GO:0009657 Biological Process             plastid organization    6.939471
## 9   GO:0009744 Biological Process              response to sucrose    2.400133
##     Pvalue DEGs
## 1 0.00e+00  104
## 2 0.00e+00  125
## 6 2.34e-13   52
## 7 4.50e-13   70
## 8 4.52e-13   40
## 9 4.70e-13   73

# make the levels order
go$Description=factor(go$Description,levels = rev(go$Description),ordered = T)

go %>% 
  ggplot(aes(x = Description, y = -log10(Pvalue), fill = Term_type))+
  geom_bar(stat="identity",width = 0.8)+
  coord_flip()+
  geom_text(aes(label=as.character(DEGs)),position = "stack",vjust=0,hjust=0,size=3)

Scatter Plot

dat7 <- read.table("C:/Users/yizha/Desktop/Spring 2020/ggplot2/KEGG.enrich.txt",sep="\t",check.names=F,header=T)      # load the data
head(dat7)

##     ko_id                                Kegg_pathway Rich_factor       Pvalue
## 1 ko00195                              Photosynthesis    3.543363 0.000000e+00
## 2 ko00196           Photosynthesis - antenna proteins    5.575221 1.500000e-10
## 3 ko03008           Ribosome biogenesis in eukaryotes    2.464696 4.920000e-06
## 4 ko00710 Carbon fixation in photosynthetic organisms    2.567654 4.510000e-05
## 5 ko01200                           Carbon metabolism    1.738592 7.350000e-05
## 6 ko03030                             DNA replication    2.453097 1.772599e-03
##   DEGs
## 1   39
## 2   18
## 3   34
## 4   26
## 5   62
## 6   18

kegg <- dat7[order(dat7[dat7$Pvalue < 0.01,"Pvalue"],decreasing = F),]
head(kegg)

##     ko_id                                Kegg_pathway Rich_factor       Pvalue
## 1 ko00195                              Photosynthesis    3.543363 0.000000e+00
## 2 ko00196           Photosynthesis - antenna proteins    5.575221 1.500000e-10
## 3 ko03008           Ribosome biogenesis in eukaryotes    2.464696 4.920000e-06
## 4 ko00710 Carbon fixation in photosynthetic organisms    2.567654 4.510000e-05
## 5 ko01200                           Carbon metabolism    1.738592 7.350000e-05
## 6 ko03030                             DNA replication    2.453097 1.772599e-03
##   DEGs
## 1   39
## 2   18
## 3   34
## 4   26
## 5   62
## 6   18

if(nrow(kegg)>=20){
  kegg=kegg[1:20,]}

enrich = 0.01
minPvalue = 1e-15
xlab=""
ylab="-log10(Pvalue)"
title=""

kegg$Kegg_pathway=factor(kegg$Kegg_pathway,levels=rev(kegg$Kegg_pathway),ordered = T)

kegg %>% 
  ggplot(aes(x=Kegg_pathway,y=-log10(Pvalue),fill=Kegg_pathway))+
  geom_bar(stat="identity",width = 0.8)+
  coord_flip()+
  geom_text(aes(label=as.character(DEGs)),position = "stack",vjust=0,hjust=0,size=3)+
  labs(x=xlab,y=ylab,titile=title)

kegg %>% 
  ggplot(aes(x=Kegg_pathway,y=Rich_factor))+
  geom_point(aes(color=-log10(Pvalue),size=DEGs),alpha=0.8)+
  coord_flip()+
  scale_color_gradient(low = "green",high = "red")

Pheatmap

library(pheatmap)
heatmap <- read.table("C:/Users/yizha/Desktop/Spring 2020/ggplot2/All.DEG_final_3000.xls",sep="\t",check.names=F,header=T) 

mat <- heatmap[,c(-1)]
head(mat)

##      CK-WT-1     CK-WT-2     CK-WT-3  CK-tdr1-1  CK-tdr1-2  CK-tdr1-3
## 1   3.741490   7.3618000   5.8173400  5.7113100  7.9705400 10.3762000
## 2   5.235280   2.7707000   2.6685900  3.2263200  1.3210500  1.9672600
## 3   2.821317   1.5633947   1.9316282  3.1948090  2.6008540  2.3012776
## 4 118.660000 140.1430000 123.3830000 97.2229000 95.2539000 91.8525000
## 5   1.873769  16.9090246   0.9559375  0.4774184  0.5273923  0.4333881
## 6   1.710346   0.7802436   2.7996091  4.7297117  4.3637146  3.3732144
##       NaWT-1     NaWT-2     NaWT-3  Natdr1-1  Natdr1-2  Natdr1-3   Na-WT-1
## 1   6.299490   5.550620   5.847790 15.588100 14.763600 17.746700  9.849430
## 2   2.402590   3.230770   4.803780  2.043010  2.295240  1.952860  1.833770
## 3   2.044360   2.104095   2.630611  2.289544  2.801336  2.450605  3.419992
## 4 121.808000 125.135000 106.507000 81.699700 84.849500 80.389500 95.484700
## 5  39.106578  36.530250  19.456131 13.222704 10.889778 12.258010 74.498380
## 6   2.278358   2.804400   2.954253  3.170043  4.664306  4.024336  6.885807
##     Na-WT-2   Na-WT-3 Na-tdr1-1 Na-tdr1-2 Na-tdr1-3      SWT-1      SWT-2
## 1  7.979490  6.841060 13.291200 25.370000 19.599400   4.966870   5.533490
## 2  1.707570  0.809564  3.231720  1.804780  2.060440   8.492560   2.655640
## 3  3.509510  2.388678  3.109170  3.559722  2.699009   3.533361   2.779014
## 4 93.470600 78.313200 77.220800 93.267700 81.656100 204.019000 251.292000
## 5 80.831650 22.768773  1.109817  2.482441 34.885769 209.860515  54.164781
## 6  6.565324  4.117812  1.413332  1.311610  1.672657   4.831237   3.658215
##        SWT-3   Stdr1-1    Stdr1-2    Stdr1-3
## 1   3.188160  6.936500  7.8840200 11.2504000
## 2   2.255700  4.352980  4.5214700  7.1135100
## 3   1.973365  5.528830  4.9230430  5.6020740
## 4 257.625000 85.227500 77.5347000 84.9365000
## 5  11.474413  3.502854  0.9064844  0.3295299
## 6   6.806488  2.155406  2.1667223  1.7398119

dim(mat)

## [1] 3000   24

col=c("blue","white","red")
color=colorRampPalette(col)(100) #change the color
pheatmap(mat, scale = "row", show_rownames = F, color = color, cellwidth = 300/ncol(mat), cellheight = 300/nrow(mat)) # normalize the variables    # display_numbers = T

annotation_col <- read.table("C:/Users/yizha/Desktop/Spring 2020/ggplot2/annotation_col.xls",sep="\t",check.names=F,header=T) 

str(annotation_col)

## 'data.frame':    24 obs. of  4 variables:
##  $ Sample   : chr  "CK-WT-1" "CK-WT-2" "CK-WT-3" "CK-tdr1-1" ...
##  $ Group    : chr  "CK" "CK" "CK" "CK" ...
##  $ Genotype : chr  "WT" "WT" "WT" "tdr1" ...
##  $ Treatment: chr  "CK-WT" "CK-WT" "CK-WT" "CK-tdr1" ...

pheatmap(mat,scale="row")

Expolore Data Analysis

Yao Lin

4/28/2020

EDA using ggplot

Venn graph

Histogram

Pie & Line chart

Density & Boxplot & Freqploy

Volcano & MAplot

Scatter plot

Pheatmap