####Run through this example and try to understand what is going on with the data
####So, lets load RMarkdown
library(rmarkdown)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)
mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")
pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)
## load the dataset
## first colomn is the name of gene codes for protein, MCF10A is health breast cell,others are all cancer cell,1,2means from differfent replicate.compare them with MCF10A to find out how different in normal and cancer cell.p<0.05=protein significantly different
BC_cell<-read_csv(file="breast_cancer_cells.csv")
#and then view it
view(BC_cell)
#make a matrix of just raw the data
BC_cellmat<-BC_cell %>% select(MCF10A_1:SKBR3_2) %>% as.matrix() %>% round(.,2)
head(BC_cellmat)
## MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,] 9.54 4.58 5.07 5.42 25.43 27.42 4.56 3.88
## [2,] 14.00 11.58 6.49 6.64 9.80 10.31 6.84 8.75
## [3,] 10.22 8.29 11.55 10.82 12.48 10.11 9.54 8.46
## [4,] 9.00 6.35 13.40 14.82 14.94 11.33 6.87 7.92
## [5,] 8.21 4.44 12.08 9.82 16.51 12.34 15.43 8.50
## [6,] 12.51 15.84 8.05 8.38 6.78 8.46 4.48 7.18
## SKBR3_1 SKBR3_2
## [1,] 8.46 5.64
## [2,] 11.91 13.67
## [3,] 9.10 9.43
## [4,] 8.54 6.82
## [5,] 7.93 4.75
## [6,] 13.27 15.05
# what does that look like?
## make a heatmap from the data
pheatmap(BC_cellmat, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")
##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them?
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#looks like _______ line is different than the other cells lines. We will take a closer look at that in the next section, once we've done some further data manipulations
## first, make new columns that combine the two reps for each variable by averaging them
BC_cell$MCF10A_avg <- rowMeans(BC_cell[, c("MCF10A_1", "MCF10A_2")], na.rm = TRUE)
BC_cell$MCF7_avg <- rowMeans(BC_cell[, c("MCF7_1", "MCF7_2")], na.rm = TRUE)
BC_cell$MDA231_avg <- rowMeans(BC_cell[, c("MDA231_1", "MDA231_2")], na.rm = TRUE)
BC_cell$MDA468_avg <- rowMeans(BC_cell[, c("MDA468_1", "MDA468_2")], na.rm = TRUE)
BC_cell$SKBR3_avg <- rowMeans(BC_cell[, c("SKBR3_1", "SKBR3_2")], na.rm = TRUE)
#did it create your new columns?YES
## then make some new columns that store the log ratios of the variable means you just created, as compared to the control
BC_cell <- BC_cell %>% mutate(
log_MCF7 = log2(MCF7_avg / MCF10A_avg),
log_MDA231 = log2(MDA231_avg / MCF10A_avg),
log_MDA468 = log2(MDA468_avg / MCF10A_avg),
log_SKBR3 = log2(SKBR3_avg / MCF10A_avg))
head(BC_cell)
## # A tibble: 6 × 26
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NES Nestin OS=Homo … 7 9.54 4.58 5.07 5.42 25.4
## 2 ZC3HC1 Isoform 2 of Nu… 2 14 11.6 6.49 6.64 9.8
## 3 CAMSAP1L1 Isoform 2 of Ca… 5 10.2 8.29 11.6 10.8 12.5
## 4 COBRA1 Negative elonga… 3 9 6.35 13.4 14.8 14.9
## 5 HAUS6 Isoform 3 of HA… 5 8.21 4.44 12.1 9.82 16.5
## 6 CHCHD4 Isoform 2 of Mi… 5 12.5 15.8 8.05 8.38 6.78
## # ℹ 18 more variables: MDA231_2 <dbl>, MDA468_1 <dbl>, MDA468_2 <dbl>,
## # SKBR3_1 <dbl>, SKBR3_2 <dbl>, pvalue_MCF7_vs_MCF10A <dbl>,
## # pvalue_MDA231_vs_MCF10A <dbl>, pvalue_MDA468_vs_MCF10A <dbl>,
## # pvalue_SKBR3_vs_MCF10A <dbl>, MCF10A_avg <dbl>, MCF7_avg <dbl>,
## # MDA231_avg <dbl>, MDA468_avg <dbl>, SKBR3_avg <dbl>, log_MCF7 <dbl>,
## # log_MDA231 <dbl>, log_MDA468 <dbl>, log_SKBR3 <dbl>
#did it create your new columns?yes
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's
BC_cell_logmat<-BC_cell %>% select(log_MCF7: log_SKBR3) %>% as.matrix() %>% round(.,2)
pheatmap(BC_cell_logmat, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=FALSE,legend=TRUE,fontsize = 7,scale="column")
##BASIC VOLCANO PLOT
## Add a column that stores the negative log of the pvalue of interest (in this case, MCF7)
BC_cell<-BC_cell %>% mutate(neglog_MCF7=-log10(pvalue_MCF7_vs_MCF10A))
## Use ggplot to plot the log ratio of again MCF7/MCF10A st the -log p-value MCF7/MCF10A
volcano_plot<-BC_cell %>% ggplot(aes(x=log_MCF7,y=neglog_MCF7,description=Gene_Symbol))+
geom_point(alpha=0.6,color="red")
#to view it, type:
volcano_plot
## BETTER VOLCANO PLOT\change until around 3
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
BC_cell<-BC_cell %>% mutate(significance=ifelse((log_MCF7>2 & neglog_MCF7>4.27),"UP", ifelse((log_MCF7<c(-2) & neglog_MCF7>4.27),"DOWN","NOT SIG")))
## Some standard colors
plain_cols3<-c("red","gray","blue")
## volcano plot as before with some added things
better_volcano_plot<-BC_cell %>% ggplot(aes(x=log_MCF7,y=neglog_MCF7,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio of MCF7 proteomic analysis compared to control",y="-log(p-value)")
#to view it, type
better_volcano_plot
#check viewer and / or plots to see it
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot)
##INTERPRETATION##
## Significant up regulated are: VAV2 , C17orf28 ,EPPK1 ,SYT7 ,CORO1A ,DPP3, RHPN2 and AGR3
## Significant down regulated are: ADCK3. CTSZ and APOA1
#Why? How?
# Make a pivot longer table of the C19 data for Control_2h_1:Virus_24h_2
BC_long<-pivot_longer(BC_cell, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))%>%select(-Description,-Peptides)
BC_long$order <- as.character(BC_long$variable)
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-BC_long %>% filter(Gene_Symbol=="CTSZ" | Gene_Symbol=="APOA1" | Gene_Symbol=="ADCK3")
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
Examples_Down %>%
ggplot(aes(x=order,y=value))+
geom_bar(stat="identity",fill="red")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "Black",size=10))+
theme(text = element_text(size=10))+
theme(axis.text.x = element_text(angle=90, hjust=0))+
labs(x="sample",y="relative intensity")
#check viewer and / or plots to see it
Examples_Down
## # A tibble: 30 × 4
## Gene_Symbol variable value order
## <chr> <chr> <dbl> <chr>
## 1 CTSZ MCF10A_1 19.8 MCF10A_1
## 2 CTSZ MCF10A_2 19.8 MCF10A_2
## 3 CTSZ MCF7_1 0.83 MCF7_1
## 4 CTSZ MCF7_2 0.91 MCF7_2
## 5 CTSZ MDA231_1 11.2 MDA231_1
## 6 CTSZ MDA231_2 15.4 MDA231_2
## 7 CTSZ MDA468_1 2.48 MDA468_1
## 8 CTSZ MDA468_2 3.35 MDA468_2
## 9 CTSZ SKBR3_1 11.6 SKBR3_1
## 10 CTSZ SKBR3_2 14.7 SKBR3_2
## # ℹ 20 more rows
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
## Same process for the upregulated ones
Examples_Up<-BC_long %>% filter(Gene_Symbol=="C17orf28" | Gene_Symbol=="VAV2" | Gene_Symbol=="EPPK1")
Examples_Up %>%
ggplot(aes(x=order,y=value))+
geom_bar(stat="identity",fill="Blue")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "Black",size=10))+
theme(text = element_text(size=10))+
theme(axis.text.x = element_text(angle=90, hjust=0))+
labs(x="sample",y="relative intensity")
#check viewer and / or plots to see it
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#interpretation HINT:insert a chunk and create two seprate lines of code that filter for your specific upregulated genes/proteins of interest and selects for only their gene symbols and descriptions. Do this for the downregulated as well. This will generate two list of the descriptors for each gene of interest, helping you understand your figures. Be sure to view it, not just ask for the head of the table generated.
Downregulated<-BC_cell %>% filter(Gene_Symbol=="CTSZ" | Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A" | Gene_Symbol=="ADCK3") %>% select(Gene_Symbol, Description, Peptides)
Downregulated
## # A tibble: 7 × 3
## Gene_Symbol Description Peptides
## <chr> <chr> <dbl>
## 1 HLA-A HLA class I histocompatibility antigen, A-23 alpha chain… 3
## 2 HLA-A HLA class I histocompatibility antigen, A-30 alpha chain… 1
## 3 HLA-A HLA class I histocompatibility antigen, A-2 alpha chain … 21
## 4 CTSZ Cathepsin Z OS=Homo sapiens GN=CTSZ PE=1 SV=1 8
## 5 ADCK3 Isoform 3 of Chaperone activity of bc1 complex-like, mit… 2
## 6 HLA-A HLA class I histocompatibility antigen, A-1 alpha chain … 1
## 7 APOA1 Apolipoprotein A-I OS=Homo sapiens GN=APOA1 PE=1 SV=1 2
Upregulated<-BC_cell %>% filter(Gene_Symbol=="C17orf28" | Gene_Symbol=="VAV2" | Gene_Symbol=="EPPK1" )%>% select(Gene_Symbol, Description, Peptides)
Upregulated
## # A tibble: 4 × 3
## Gene_Symbol Description Peptides
## <chr> <chr> <dbl>
## 1 EPPK1 Uncharacterized protein OS=Homo sapiens GN=EPPK1 PE=4 SV… 4
## 2 EPPK1 Epiplakin OS=Homo sapiens GN=EPPK1 PE=1 SV=2 71
## 3 VAV2 Isoform 2 of Guanine nucleotide exchange factor VAV2 OS=… 4
## 4 C17orf28 Isoform 2 of UPF0663 transmembrane protein C17orf28 OS=H… 3
##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!