####Run through this example and try to understand what is going on with the data
####So, lets load RMarkdown
library(rmarkdown)
set messages to FALSE on everything (prevents certain boring things from being shown in the results)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)
PACKAGES
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)
COLORS
mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")
pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)
LOAD DATA AND MAKE AN OVERALL HEATMAP
## load the dataset
Breast_cancer_data<-read_csv(file="breast_cancer_cells.csv")
#and then view it
Breast_cancer_logs<-Breast_cancer_data %>% mutate (log_MCF7=log2((MCF7_1+MCF7_2)/(MCF10A_1+MCF10A_2)), log_MDA231=log2((MDA231_1+MDA231_2)/(MCF10A_1+MCF10A_2)), log_MDA468=log2((MDA468_1+MDA468_2)/(MCF10A_1+MCF10A_2)), log_SKBR3=log2((SKBR3_2)/(MCF10A_1+MCF10A_2)))
now let’s visualize the dataset and look for initial trends. We can do this by making a matrix so and then a heatmap to visualize the data
#make a matrix of just raw the data
Breast_cancer_mat1<-Breast_cancer_logs %>% select(log_MCF7:log_SKBR3) %>% as.matrix(.) %>% round(.,2)
# what does that look like?
head(Breast_cancer_mat1)
## log_MCF7 log_MDA231 log_MDA468 log_SKBR3
## [1,] -0.43 1.90 -0.74 -1.32
## [2,] -0.96 -0.35 -0.71 -0.90
## [3,] 0.27 0.29 -0.04 -0.97
## [4,] 0.88 0.78 -0.05 -1.17
## [5,] 0.79 1.19 0.92 -1.41
## [6,] -0.79 -0.90 -1.28 -0.91
## make a heatmap from the data
pheatmap(Breast_cancer_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=FALSE,legend=TRUE,fontsize = 7,scale="column")
##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them?
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#looks like 24hrs is different than the other cells lines. We can take a closer look at that in the next section, once we've done some further data manipulations
Data Manipulation
## first, make new columns that combine the two reps for each variable by averaging them
Breast_cancer_data2<-Breast_cancer_data %>% mutate(
mean_MCF10A= ((MCF10A_1 + MCF10A_2)/2),
mean_MCF7= ((MCF7_1 + MCF7_2)/2),
mean_MDA231= ((MDA231_1 + MDA231_2)/2),
mean_MDA468= ((MDA468_1 + MDA468_2)/2),
mean_SKBR3= ((SKBR3_1 + SKBR3_2)/2))
view(Breast_cancer_data2)
#did it create your new columns?
## then make some new columns that store the log ratios of the variable means you just created, as compared to the control
Breast_cancer_data2<-Breast_cancer_data2 %>% mutate(log_MCF7=log2(mean_MCF7/mean_MCF10A),log_MDA231=log2(mean_MDA231/mean_MCF10A), log_MDA468=log2(mean_MDA468/mean_MCF10A), log_SKBR3=log2(mean_SKBR3/mean_MCF10A))
colnames(Breast_cancer_data2)
## [1] "Gene_Symbol" "Description"
## [3] "Peptides" "MCF10A_1"
## [5] "MCF10A_2" "MCF7_1"
## [7] "MCF7_2" "MDA231_1"
## [9] "MDA231_2" "MDA468_1"
## [11] "MDA468_2" "SKBR3_1"
## [13] "SKBR3_2" "pvalue_MCF7_vs_MCF10A"
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A" "mean_MCF10A"
## [19] "mean_MCF7" "mean_MDA231"
## [21] "mean_MDA468" "mean_SKBR3"
## [23] "log_MCF7" "log_MDA231"
## [25] "log_MDA468" "log_SKBR3"
#did it create your new columns?
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's
Breast_cancer_data2<-Breast_cancer_data2 %>% select(log_MCF7:log_SKBR3) %>% as.matrix() %>% round(.,2)
pheatmap(Breast_cancer_data2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")
Diving deeping with VOLCANO PLOTS
##BASIC VOLCANO PLOT
## Add a column that stores the negative log of the pvalue of interest
Breast_cancer_negative_logs<-Breast_cancer_logs %>% mutate(neglog_SKBR3=-log10(pvalue_SKBR3_vs_MCF10A))
## Use ggplot to plot the log ratio at __ hours against the -log p-value at __ hours
volcano_plot<-Breast_cancer_negative_logs %>% ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol))+
geom_point(alpha=0.7,color="royalblue")
#to view it, type:
volcano_plot
## BETTER VOLCANO PLOT
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
Breast_cancer_better_volcano<-Breast_cancer_negative_logs %>% mutate(significance=ifelse((log_SKBR3>2 & neglog_SKBR3>2.99),"UP", ifelse((log_SKBR3<c(-2) & neglog_SKBR3>2.99),"DOWN","NOT SIG")))
## Some standard colors
plain_cols3<-c("red","gray","blue")
## volcano plot as before with some added things
better_volcano_plot<-Breast_cancer_better_volcano %>% ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio at SKBR3 compared to control",y="-log(p-value_SKBR3)")
#to view it, type
better_volcano_plot
#check viewer and / or plots to see it
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot)
##INTERPRETATION## ## Significant up regulated are: , and _____ ## Significant down regulated are: _____. _____ and ____ #Why? How?
Barplots of significant points of interest
EXAMPLES OF A COUPLE PROTEINS or GENES
# Make a pivot longer table of the C19 data for Control_2h_1:Virus_24h_2
breast_cancer_long<-pivot_longer(Breast_cancer_better_volcano, cols = c(MCF7_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))
head(breast_cancer_long)
## # A tibble: 6 × 7
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 variable value
## <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 NES Nestin OS=Homo sapiens … 7 9.54 4.58 MCF7_1 5.07
## 2 NES Nestin OS=Homo sapiens … 7 9.54 4.58 MCF7_2 5.42
## 3 NES Nestin OS=Homo sapiens … 7 9.54 4.58 MDA231_1 25.4
## 4 NES Nestin OS=Homo sapiens … 7 9.54 4.58 MDA231_2 27.4
## 5 NES Nestin OS=Homo sapiens … 7 9.54 4.58 MDA468_1 4.56
## 6 NES Nestin OS=Homo sapiens … 7 9.54 4.58 MDA468_2 3.88
## # A tibble: 6 × 7
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 variable value
## <chr> <chr> <int> <dbl> <dbl> <chr> <dbl>
## 1 NES " Nestin OS=Homo sapien… 7 9.54 4.58 MCF7_1 5.07
## 2 NES " Nestin OS=Homo sapien… 7 9.54 4.58 MCF7_2 5.42
## 3 NES " Nestin OS=Homo sapien… 7 9.54 4.58 MDA231_1 25.4
## 4 NES " Nestin OS=Homo sapien… 7 9.54 4.58 MDA231_2 27.4
## 5 NES " Nestin OS=Homo sapien… 7 9.54 4.58 MDA468_1 4.56
## 6 NES " Nestin OS=Homo sapien… 7 9.54 4.58 MDA468_2 3.88
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-breast_cancer_long %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A" | Gene_Symbol=="MYO1B")
head(Examples_Down)
## # A tibble: 6 × 7
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 variable value
## <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 HLA-A HLA class I histocompat… 3 5.42 6.62 MCF7_1 2.22
## 2 HLA-A HLA class I histocompat… 3 5.42 6.62 MCF7_2 2.69
## 3 HLA-A HLA class I histocompat… 3 5.42 6.62 MDA231_1 4.56
## 4 HLA-A HLA class I histocompat… 3 5.42 6.62 MDA231_2 4.23
## 5 HLA-A HLA class I histocompat… 3 5.42 6.62 MDA468_1 36.3
## 6 HLA-A HLA class I histocompat… 3 5.42 6.62 MDA468_2 30.1
## # A tibble: 6 × 7
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 variable value
## <chr> <chr> <int> <dbl> <dbl> <chr> <dbl>
## 1 HLA-A " HLA class I histocomp… 3 5.42 6.62 MCF7_1 2.22
## 2 HLA-A " HLA class I histocomp… 3 5.42 6.62 MCF7_2 2.69
## 3 HLA-A " HLA class I histocomp… 3 5.42 6.62 MDA231_1 4.56
## 4 HLA-A " HLA class I histocomp… 3 5.42 6.62 MDA231_2 4.23
## 5 HLA-A " HLA class I histocomp… 3 5.42 6.62 MDA468_1 36.3
## 6 HLA-A " HLA class I histocomp… 3 5.42 6.62 MDA468_2 30.1
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
Example_plot_down<-Examples_Down %>%
ggplot(aes(x=variable,y=value))+
geom_bar(stat="identity",fill="red")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=10))+
theme(text = element_text(size=14))+
theme(axis.text.x = element_text(angle=45, hjust=1))+
labs(x="sample",y="relative intensity")
Example_plot_down
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
## Same process for the upregulated ones
Examples_Up<-breast_cancer_long %>% filter(Gene_Symbol=="TDP2" | Gene_Symbol=="KRT23" | Gene_Symbol=="DENND4C")
head(Examples_Up)
## # A tibble: 6 × 7
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 variable value
## <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 TDP2 Isoform 2 of Tyrosyl-DN… 3 2.36 1.72 MCF7_1 5.89
## 2 TDP2 Isoform 2 of Tyrosyl-DN… 3 2.36 1.72 MCF7_2 4.71
## 3 TDP2 Isoform 2 of Tyrosyl-DN… 3 2.36 1.72 MDA231_1 5.94
## 4 TDP2 Isoform 2 of Tyrosyl-DN… 3 2.36 1.72 MDA231_2 7.38
## 5 TDP2 Isoform 2 of Tyrosyl-DN… 3 2.36 1.72 MDA468_1 30.6
## 6 TDP2 Isoform 2 of Tyrosyl-DN… 3 2.36 1.72 MDA468_2 16.0
## # A tibble: 6 × 7
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 variable value
## <chr> <chr> <int> <dbl> <dbl> <chr> <dbl>
## 1 TDP2 " Isoform 2 of Tyrosyl-… 3 2.36 1.72 MCF7_1 5.89
## 2 TDP2 " Isoform 2 of Tyrosyl-… 3 2.36 1.72 MCF7_2 4.71
## 3 TDP2 " Isoform 2 of Tyrosyl-… 3 2.36 1.72 MDA231_1 5.94
## 4 TDP2 " Isoform 2 of Tyrosyl-… 3 2.36 1.72 MDA231_2 7.38
## 5 TDP2 " Isoform 2 of Tyrosyl-… 3 2.36 1.72 MDA468_1 30.6
## 6 TDP2 " Isoform 2 of Tyrosyl-… 3 2.36 1.72 MDA468_2 16.0
Example_plot_up<-Examples_Up %>%
ggplot(aes(x=variable,y=value))+
geom_bar(stat="identity",fill="red")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=10))+
theme(text = element_text(size=14))+
theme(axis.text.x = element_text(angle=45, hjust=1))+
labs(x="sample",y="relative intensity")
Example_plot_up
#check viewer and / or plots to see it
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#interpretation HINT:insert a chunk and create two seprate lines of code that filter for your specific upregulated genes/proteins of interest and selects for only their gene symbols and descriptions. Do this for the downregulated as well. This will generate two list of the descriptors for each gene of interest, helping you understand your figures. Be sure to view it, not just ask for the head of the table generated.
WRAP UP ##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!