####Run through this example and try to understand what is going on with the data
####So, lets load RMarkdown
library(rmarkdown)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)
mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")
pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)
## load the dataset
read_csv("breast_cancer_cells.csv")
## # A tibble: 5,144 × 17
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NES Nestin OS=Homo… 7 9.54 4.58 5.07 5.42 25.4
## 2 ZC3HC1 Isoform 2 of N… 2 14 11.6 6.49 6.64 9.8
## 3 CAMSAP1L1 Isoform 2 of C… 5 10.2 8.29 11.6 10.8 12.5
## 4 COBRA1 Negative elong… 3 9 6.35 13.4 14.8 14.9
## 5 HAUS6 Isoform 3 of H… 5 8.21 4.44 12.1 9.82 16.5
## 6 CHCHD4 Isoform 2 of M… 5 12.5 15.8 8.05 8.38 6.78
## 7 DHX30 Isoform 2 of P… 17 12.6 8.18 9.51 9.76 12.7
## 8 SLC12A2 Isoform 2 of S… 15 6.33 4.21 11.3 11.5 3.03
## 9 PTPRJ Receptor-type … 5 9.7 6.2 4.58 4.94 25.7
## 10 ATP6AP2 Renin receptor… 8 9.02 5.69 14.5 15.2 7.25
## # ℹ 5,134 more rows
## # ℹ 9 more variables: MDA231_2 <dbl>, MDA468_1 <dbl>, MDA468_2 <dbl>,
## # SKBR3_1 <dbl>, SKBR3_2 <dbl>, pvalue_MCF7_vs_MCF10A <dbl>,
## # pvalue_MDA231_vs_MCF10A <dbl>, pvalue_MDA468_vs_MCF10A <dbl>,
## # pvalue_SKBR3_vs_MCF10A <dbl>
BC_data <- read.csv(file="breast_cancer_cells.csv")
#and then view it
view(BC_data)
BC_data<-BC_data %>%
mutate(log_MCF10A=log2((MCF10A_1)/(MCF10A_2)),
log_MCF7=log2((MCF7_1)/(MCF7_2)),
log_MDA231=log2((MDA231_1)/(MDA231_2)),
log_MDA468=log2((MDA468_1)/(MDA468_2)),
log_SKBR3=log2((SKBR3_1)/(SKBR3_2)))
view(BC_data)
#make a matrix of just raw the data
BC_mat1<-BC_data %>% select(log_MCF10A:log_SKBR3) %>% as.matrix() %>% round(.,2)
# what does that look like?
head(BC_mat1)
## log_MCF10A log_MCF7 log_MDA231 log_MDA468 log_SKBR3
## [1,] 1.06 -0.10 -0.11 0.23 0.58
## [2,] 0.27 -0.03 -0.07 -0.36 -0.20
## [3,] 0.30 0.09 0.30 0.17 -0.05
## [4,] 0.50 -0.15 0.40 -0.21 0.32
## [5,] 0.89 0.30 0.42 0.86 0.74
## [6,] -0.34 -0.06 -0.32 -0.68 -0.18
## make a heatmap from the data
pheatmap(BC_mat1, color=pats_cols,cellwidth = 30,cellheight = .03, cluster_cols = FALSE,cluster_rows = FALSE,legend = TRUE,fontsize = 7,scale = "column")
##INTERPRETATION##
## What can you see in this figure?
#
## are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them?
# Repeated measures/reps are slightly different. This shows variability within signal strength.
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
# MCF10A looks varied from the others, therefore the control, as MCF10A is non-tumorigenic. This is expected due to non-tumorigenic tendency. Literature comparing cell lines of normal or malignant cells can be used.
#looks like MCF10A line is different than the other cells lines. We will take a closer look at that in the next section, once we've done some further data manipulations
## first, make new columns that combine the two reps for each variable by averaging them
BC_data2<-BC_data %>%
mutate(ave_MCF10A=rowMeans(select(.,MCF10A_1, MCF10A_2)),ave_MCF7=rowMeans(select(.,MCF7_1,MCF7_2)),ave_MDA231=rowMeans(select(.,MDA231_1,MDA231_2)),ave_MDA468=rowMeans(select(.,MDA468_1,MDA468_2)),ave_SKBR3=rowMeans(select(.,SKBR3_1,SKBR3_2)))
view(BC_data2)
## MCF10A is control, nontumorigenic
#did it create your new columns? Yes
## then make some new columns that store the log ratios of the variable means you just created, as compared to the control
BC_data2<-BC_data2 %>% mutate(log2_MCF7 = log2(ave_MCF7 / ave_MCF10A),log2_MDA231 = log2(ave_MDA231 / ave_MCF10A),log2_MDA468 = log2(ave_MDA468 / ave_MCF10A),log2_SKBR3 = log2(ave_SKBR3 / ave_MCF10A))
#did it create your new columns? Yes
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's
BC_mat2<-BC_data2 %>% select(starts_with("log2_")) %>%
as.matrix() %>%
round(., 2)
pheatmap(BC_mat2, color=pats_cols, cellwidth=30, cellheight=0.03,
cluster_cols=FALSE, cluster_rows=FALSE, legend=TRUE,
fontsize=7, scale="column")
##BASIC VOLCANO PLOT
## Add a column that stores the negative log of the pvalue of interest (in this case, MCF7)
BC_data2<-BC_data2 %>% mutate(neg_log10_pval = -log10(pvalue_MDA231_vs_MCF10A))
## Use ggplot to plot the log ratio of MCF7 against the -log p-value MCF7
volcano_plot <-BC_data2 %>% ggplot(aes(x = log2_MDA231, y = neg_log10_pval,description=Gene_Symbol))+
geom_point(alpha=0.7,color="blue")
#to view it, type:
volcano_plot
## BETTER VOLCANO PLOT
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
BC_data2<-BC_data2 %>% mutate(significance=ifelse(log2_MDA231 > 1.8 & neg_log10_pval > 3.0, "UP",
ifelse(log2_MDA231 < -2.0 & neg_log10_pval > 3.0, "DOWN", "NOT SIG")))
## Some standard colors
plain_cols3<-c("UP" = "red","NOT SIG" = "gray","DOWN" = "blue")
## volcano plot as before with some added things
better_volcano_plot<-BC_data2 %>% ggplot(aes(x = log2_MDA231, y = neg_log10_pval, color = significance,text = Gene_Symbol)) +
geom_point(alpha = 0.7) +
scale_color_manual(values = plain_cols3) +
xlim(-6, 6) +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 14),
text = element_text(size = 14)) +
labs(x = "Log2 Fold Change (MDA231 vs MCF10A)",
y = "-log10(p-value)",
title = "Improved Volcano Plot for Breast Cancer Data")
#to view it, type
better_volcano_plot
#check viewer and / or plots to see it
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot)
##INTERPRETATION##
## Significant up regulated are: PMM1, PNPLA2 and ACSS3. also ABHD14B
## Significant down regulated are: F11R. ABCB6 and METTL7A and more
#Why? How?
# classified genes as UP, DOWN, or NOT SIG based on log2 fold change (>1.8 or <-2.0) and p-value significance (-log10(p) > 3.0)
# Make a pivot longer table of the BC data for MCF10A_1:SKBR3_2
BC_long<-pivot_longer(BC_data2, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))
head(BC_long)
## # A tibble: 6 × 5
## Gene_Symbol Description Peptides variable value
## <chr> <chr> <int> <chr> <dbl>
## 1 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MCF10A_1 9.54
## 2 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MCF10A_2 4.58
## 3 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MCF7_1 5.07
## 4 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MCF7_2 5.42
## 5 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MDA231_1 25.4
## 6 NES " Nestin OS=Homo sapiens GN=NES PE=1 SV=2" 7 MDA231_2 27.4
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-BC_long %>% filter(Gene_Symbol=="F11R" | Gene_Symbol=="ABCB6" | Gene_Symbol=="HIST1H1A")
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
Example_plot_down <- Examples_Down %>%
ggplot(aes(x=factor(variable, levels=c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDA468_1','MDA468_2','SKBR3_1','SKBR3_2')),
y=value)) +
geom_bar(stat="identity", fill="red") +
facet_wrap(~Gene_Symbol) +
theme_bw() +
theme(axis.text = element_text(colour = "black", size=10)) +
theme(text = element_text(size=14)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
labs(x="Sample", y="Relative Intensity")
#check viewer and / or plots to see it
Example_plot_down
##INTERPRETATION## ## What can you see in this figure? # repeated measures (e.g., MCF10A_1 vs. MCF10A_2) have similar values, indicating high precision and reproducibility in data collection. Variability suggests experimental inconsistencies or biological heterogeneity. ## are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? #Reps are pretty consistent…good accuracy, but still slight variation
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea? # Control is generally higher in ingtensity than cancer lines. For literature, studies on breast cancer subtypes can reveal whether these downregulations are specific to aggressive phenotypes.
## Same process for the upregulated ones
Examples_Up<-BC_long %>% filter(Gene_Symbol=="PMM1" | Gene_Symbol=="PNPLA2" | Gene_Symbol=="ACSS3")
head(Examples_Up)
## # A tibble: 6 × 5
## Gene_Symbol Description Peptides variable value
## <chr> <chr> <int> <chr> <dbl>
## 1 PMM1 " Phosphomannomutase 1 OS=Homo sapiens GN… 1 MCF10A_1 3.34
## 2 PMM1 " Phosphomannomutase 1 OS=Homo sapiens GN… 1 MCF10A_2 3.53
## 3 PMM1 " Phosphomannomutase 1 OS=Homo sapiens GN… 1 MCF7_1 13.0
## 4 PMM1 " Phosphomannomutase 1 OS=Homo sapiens GN… 1 MCF7_2 9.66
## 5 PMM1 " Phosphomannomutase 1 OS=Homo sapiens GN… 1 MDA231_1 12.5
## 6 PMM1 " Phosphomannomutase 1 OS=Homo sapiens GN… 1 MDA231_2 12.0
Example_plot_up<-Examples_Up %>% ggplot(aes(x=factor(variable,levels=c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDA468_1','MDA468_2','SKBR3_1','SKBR3_2')),y=value))+
geom_bar(stat="identity",fill="blue")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=10))+
theme(text = element_text(size=14))+
theme(axis.text.x = element_text(angle=45, hjust=1))+
labs(x="sample",y="relative intensity")
#check viewer and / or plots to see it
Example_plot_up
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? # Reps seem consistent though there is some variablity. Precision and accurary generally appears to be consistent.
##How does the control compare to the variables? Is this what you
might expect? Why? What would you look for in the literature to support
this idea?
# Cell lines versus control are as expected. Cell lines are higher than
control. Literature to look for may be about hallmarks of cancer and
upregulated pathways, or gene knockdown
#interpretation HINT:insert a chunk and create two seprate lines of code that filter for your specific upregulated genes/proteins of interest and selects for only their gene symbols and descriptions. Do this for the downregulated as well. This will generate two list of the descriptors for each gene of interest, helping you understand your figures. Be sure to view it, not just ask for the head of the table generated.
# Filter for upregulated genes and select relevant columns
Upregulated_Genes <- BC_data %>%
filter(Gene_Symbol %in% c("ACSS3", "PMM1", "PNPLA2")) %>%
select(Gene_Symbol, Description)
# View the table
view(Upregulated_Genes)
# Filter for downregulated genes and select relevant columns
Downregulated_Genes <- BC_data %>%
filter(Gene_Symbol %in% c("F11R", "ABCB6", "HIST1H1A")) %>%
select(Gene_Symbol, Description)
# View the table
view(Downregulated_Genes)
head(Downregulated_Genes)
## Gene_Symbol
## 1 ABCB6
## 2 HIST1H1A
## 3 F11R
## Description
## 1 Isoform 2 of ATP-binding cassette sub-family B member 6, mitochondrial OS=Homo sapiens GN=ABCB6
## 2 Histone H1.1 OS=Homo sapiens GN=HIST1H1A PE=1 SV=3
## 3 Junctional adhesion molecule A OS=Homo sapiens GN=F11R PE=1 SV=1
head(Upregulated_Genes)
## Gene_Symbol
## 1 PMM1
## 2 ACSS3
## 3 PNPLA2
## Description
## 1 Phosphomannomutase 1 OS=Homo sapiens GN=PMM1 PE=1 SV=2
## 2 Isoform 2 of Acyl-CoA synthetase short-chain family member 3, mitochondrial OS=Homo sapiens GN=ACSS3
## 3 Patatin-like phospholipase domain-containing protein 2 OS=Homo sapiens GN=PNPLA2 PE=1 SV=1
##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!