####Run through this example and try to understand what is going on with the data
####So, lets load RMarkdown
library(rmarkdown)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)
mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")
pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)
## load the dataset
data <- read.csv("breast_cancer_cells.csv")
#View(data)
#names(data)
#head(data)
#and then view it
#make a matrix of just raw the data
raw_matrix <- as.matrix(data[, c("MCF10A_1","MCF10A_2",
"MCF7_1","MCF7_2",
"MDA231_1","MDA231_2",
"MDA468_1","MDA468_2",
"SKBR3_1","SKBR3_2")])
rownames(raw_matrix) <- data$Gene_Symbol
head(raw_matrix)
## MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## NES 9.54 4.58 5.07 5.42 25.43 27.42 4.56 3.88
## ZC3HC1 14.00 11.58 6.49 6.64 9.80 10.31 6.84 8.75
## CAMSAP1L1 10.22 8.29 11.55 10.82 12.48 10.11 9.54 8.46
## COBRA1 9.00 6.35 13.40 14.82 14.94 11.33 6.87 7.92
## HAUS6 8.21 4.44 12.08 9.82 16.51 12.34 15.43 8.50
## CHCHD4 12.51 15.84 8.05 8.38 6.78 8.46 4.48 7.18
## SKBR3_1 SKBR3_2
## NES 8.46 5.64
## ZC3HC1 11.91 13.67
## CAMSAP1L1 9.10 9.43
## COBRA1 8.54 6.82
## HAUS6 7.93 4.75
## CHCHD4 13.27 15.05
# what does that look like?
## make a heatmap from the data
pheatmap(raw_matrix,
color = pats_cols,
show_rownames = FALSE,
cluster_cols = FALSE,
cluster_rows = TRUE,
main = "Breast Cancer Cell Lines Heatmap",
scale = "column")
##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them?
#Replicates that look similar suggest good consistency
##How does the control compare to the variables? Is this what you mightexpect? Why? What would you look for in the literature to support this idea?
#Some cancer cell lines appear different from the MCF10A control.
#looks like _______ line is different than the other cells lines. We will take a closer look at that in the next section, once we've done some further data manipulations
## first, make new columns that combine the two reps for each variable by averaging them
data <- data %>%
mutate(
MCF10A_mean = (MCF10A_1 + MCF10A_2) / 2,
MCF7_mean = (MCF7_1 + MCF7_2) / 2,
MDA231_mean = (MDA231_1 + MDA231_2) / 2,
MDA468_mean = (MDA468_1 + MDA468_2) / 2,
SKBR3_mean = (SKBR3_1 + SKBR3_2) / 2
)
#View(data)
#did it create your new columns?
## then make some new columns that store the log ratios of the variable means you just created, as compared to the control
#did it create your new columns?
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's
data <- data %>%
mutate(
log2_MCF7_vs_MCF10A = log2(MCF7_mean / MCF10A_mean),
log2_MDA231_vs_MCF10A = log2(MDA231_mean / MCF10A_mean),
log2_MDA468_vs_MCF10A = log2(MDA468_mean / MCF10A_mean),
log2_SKBR3_vs_MCF10A = log2(SKBR3_mean / MCF10A_mean)
)
colnames(data)
## [1] "Gene_Symbol" "Description"
## [3] "Peptides" "MCF10A_1"
## [5] "MCF10A_2" "MCF7_1"
## [7] "MCF7_2" "MDA231_1"
## [9] "MDA231_2" "MDA468_1"
## [11] "MDA468_2" "SKBR3_1"
## [13] "SKBR3_2" "pvalue_MCF7_vs_MCF10A"
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A" "MCF10A_mean"
## [19] "MCF7_mean" "MDA231_mean"
## [21] "MDA468_mean" "SKBR3_mean"
## [23] "log2_MCF7_vs_MCF10A" "log2_MDA231_vs_MCF10A"
## [25] "log2_MDA468_vs_MCF10A" "log2_SKBR3_vs_MCF10A"
# Positive log2 values mean the protein is higher than MCF10A.
# Negative log2 values mean the protein is lower than MCF10A.
##BASIC VOLCANO PLOT
data <- data %>%
mutate(neglog10_p_MCF7 = -log10(pvalue_MCF7_vs_MCF10A))
volcano_plot <- data %>%
ggplot(aes(x = log2_MCF7_vs_MCF10A, y = neglog10_p_MCF7, description = Gene_Symbol)) +
geom_point(alpha = 0.7, color = "blue") +
theme_minimal() +
labs(
title = "Volcano Plot: MCF7 vs MCF10A",
x = "log2 fold change (MCF7 / MCF10A)",
y = "-log10(p-value)"
)
volcano_plot
# Proteins on the right are upregulated in MCF7.
# Proteins on the left are downregulated in MCF7.
# Higher points are more statistically significant.
## BETTER VOLCANO PLOT
data <- data %>%
mutate(significance_MCF7 = ifelse(
log2_MCF7_vs_MCF10A > 1 & neglog10_p_MCF7 > 1.3, "UP",
ifelse(log2_MCF7_vs_MCF10A < -1 & neglog10_p_MCF7 > 1.3, "DOWN", "NOT SIG")
))
plain_cols3 <- c("red", "gray", "blue")
better_volcano_plot <- data %>%
ggplot(aes(x = log2_MCF7_vs_MCF10A,
y = neglog10_p_MCF7,
description = Gene_Symbol,
color = significance_MCF7)) +
geom_point(alpha = 0.7) +
scale_color_manual(values = plain_cols3) +
theme_bw() +
labs(
x = "log2 fold change (MCF7 / MCF10A)",
y = "-log10(p-value)",
title = "Better Volcano Plot: MCF7 vs MCF10A"
)
better_volcano_plot
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
## Some standard colors
## volcano plot as before with some added things
#to view it, type
#check viewer and / or plots to see it
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot)
##INTERPRETATION##
## Significant up regulated are: the blue points on the right side with high -log10(p-value)
## Significant down regulated are: the red points on the left side with high -log10(p-value)
#Why? How?
#The volcano plot compares gene activity between MCF7 and MCF10A cells. Blue dots represent genes that are significantly more active in MCF7 (up-regulated), while red dots represent genes that are significantly less active in MCF7 (down-regulated). Gray dots show genes with no major difference. To be colored blue or red, a gene must have both a large enough difference in activity (fold change) and a high confidence that the difference is real (p-value). On the plot, the farther left or right a dot is, the bigger the expression difference; the higher the dot, the more statistically reliable the result.
# Make a pivot longer table of the C19 data for Control_2h_1:Virus_24h_2
bc_long <- pivot_longer(data,
cols = c(MCF10A_1:SKBR3_2),
names_to = "variable",
values_to = "value") %>%
select(-c(pvalue_MCF7_vs_MCF10A,
pvalue_MDA231_vs_MCF10A,
pvalue_MDA468_vs_MCF10A,
pvalue_SKBR3_vs_MCF10A,
MCF10A_mean,
MCF7_mean,
MDA231_mean,
MDA468_mean,
SKBR3_mean,
log2_MCF7_vs_MCF10A,
log2_MDA231_vs_MCF10A,
log2_MDA468_vs_MCF10A,
log2_SKBR3_vs_MCF10A,
neglog10_p_MCF7,
significance_MCF7))
head(bc_long)
## # A tibble: 6 × 5
## Gene_Symbol Description Peptides variable value
## <chr> <chr> <int> <chr> <dbl>
## 1 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MCF10A_1 9.54
## 2 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MCF10A_2 4.58
## 3 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MCF7_1 5.07
## 4 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MCF7_2 5.42
## 5 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MDA231_1 25.4
## 6 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MDA231_2 27.4
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
down_genes <- c("HLA-B", "PLAU", "SDPR")
Examples_Down <- bc_long %>%
filter(Gene_Symbol %in% down_genes)
head(Examples_Down)
## # A tibble: 6 × 5
## Gene_Symbol Description Peptides variable value
## <chr> <chr> <int> <chr> <dbl>
## 1 HLA-B " HLA class I histocompatibility antigen,… 4 MCF10A_1 12.0
## 2 HLA-B " HLA class I histocompatibility antigen,… 4 MCF10A_2 7.37
## 3 HLA-B " HLA class I histocompatibility antigen,… 4 MCF7_1 2.51
## 4 HLA-B " HLA class I histocompatibility antigen,… 4 MCF7_2 2.47
## 5 HLA-B " HLA class I histocompatibility antigen,… 4 MDA231_1 15.1
## 6 HLA-B " HLA class I histocompatibility antigen,… 4 MDA231_2 12.0
Example_plot_down <- Examples_Down %>%
ggplot(aes(x = factor(variable,
levels = c("MCF10A_1","MCF10A_2",
"MCF7_1","MCF7_2",
"MDA231_1","MDA231_2",
"MDA468_1","MDA468_2",
"SKBR3_1","SKBR3_2")),
y = value)) +
geom_bar(stat = "identity", fill = "red") +
facet_wrap(~ Gene_Symbol) +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10)) +
theme(text = element_text(size = 14)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "sample", y = "relative intensity")
Example_plot_down
# These downregulated genes show lower expression in MCF7 than in MCF10A.
# Replicates that look similar suggest better consistency.
#check viewer and / or plots to see it
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? #These downregulated genes show lower expression in MCF7 than in MCF10A ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea? #Replicates that look similar suggest better consistency.
## Same process for the upregulated ones
up_genes <- c("ARFGEF3", "KRT19", "KMO")
Examples_Up <- bc_long %>%
filter(Gene_Symbol %in% up_genes)
head(Examples_Up)
## # A tibble: 6 × 5
## Gene_Symbol Description Peptides variable value
## <chr> <chr> <int> <chr> <dbl>
## 1 ARFGEF3 " Brefeldin A-inhibited guanine nucleoti… 1 MCF10A_1 0.001
## 2 ARFGEF3 " Brefeldin A-inhibited guanine nucleoti… 1 MCF10A_2 0.001
## 3 ARFGEF3 " Brefeldin A-inhibited guanine nucleoti… 1 MCF7_1 13.8
## 4 ARFGEF3 " Brefeldin A-inhibited guanine nucleoti… 1 MCF7_2 12.3
## 5 ARFGEF3 " Brefeldin A-inhibited guanine nucleoti… 1 MDA231_1 2.62
## 6 ARFGEF3 " Brefeldin A-inhibited guanine nucleoti… 1 MDA231_2 4.54
Example_plot_up <- Examples_Up %>%
ggplot(aes(x = factor(variable,
levels = c("MCF10A_1","MCF10A_2",
"MCF7_1","MCF7_2",
"MDA231_1","MDA231_2",
"MDA468_1","MDA468_2",
"SKBR3_1","SKBR3_2")),
y = value)) +
geom_bar(stat = "identity", fill = "royalblue") +
facet_wrap(~ Gene_Symbol) +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10)) +
theme(text = element_text(size = 14)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "sample", y = "relative intensity")
Example_plot_up
#check viewer and / or plots to see it
##INTERPRETATION## ## What can you see in this figure? are the
repeated measures/reps similar or different? What does this say about
the precision and accuracy of them? # These upregulated genes show
higher expression in MCF7 than in MCF10A ##How does the control compare
to the variables? Is this what you might expect? Why? What would you
look for in the literature to support this idea?
#THe bar plots help confirm what was seen in the volcano plot
#interpretation HINT:insert a chunk and create two seprate lines of code
that filter for your specific upregulated genes/proteins of interest and
selects for only their gene symbols and descriptions. Do this for the
downregulated as well. This will generate two list of the descriptors
for each gene of interest, helping you understand your figures. Be sure
to view it, not just ask for the head of the table generated.
down_info <- data %>%
filter(Gene_Symbol == "HLA-B" | Gene_Symbol == "PLAU" | Gene_Symbol == "SDPR") %>%
select(Gene_Symbol, Description)
#View(down_info)
up_info <- data %>%
filter(Gene_Symbol == "ARFGEF3" | Gene_Symbol == "KRT19" | Gene_Symbol == "KMO") %>%
select(Gene_Symbol, Description)
#View(up_info)
##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!