####Run through this example and try to understand what is going on with the data
####So, lets load RMarkdown
library(rmarkdown)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)
mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")
pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)
## load the dataset
BC_data<-read_csv(file="breast_cancer_cells.csv")
#and then view it
view(BC_data)
#make a matrix of just raw the data
BC_mat1<-BC_data %>% select(MCF10A_1:SKBR3_2) %>% as.matrix() %>% round(.,2)
# what does that look like?
view(BC_mat1)
## make a heatmap from the data
pheatmap(BC_mat1, color=pats_cols,cellwidth=10,cellheight=.03,cluster_cols=FALSE,cluster_rows=FALSE,legend=TRUE,fontsize = 7,scale="column")
##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them?
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#looks like _______ line is different than the other cells lines. We will take a closer look at that in the next section, once we've done some further data manipulations
## first, make new columns that combine the two reps for each variable by averaging them
BC_data_avg <- BC_data %>% mutate(
mean_MCF10A = (MCF10A_1 + MCF10A_2) / 2,
mean_MCF7 = (MCF7_1 + MCF7_2) / 2,
mean_MDA231 = (MDA231_1 + MDA231_2) / 2,
mean_MDA468 = (MDA468_1 + MDA468_2) / 2,
mean_SKBR3 = (SKBR3_1 + SKBR3_2) / 2
)
print(BC_data_avg)
## # A tibble: 5,144 × 22
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NES Nestin OS=Homo… 7 9.54 4.58 5.07 5.42 25.4
## 2 ZC3HC1 Isoform 2 of N… 2 14 11.6 6.49 6.64 9.8
## 3 CAMSAP1L1 Isoform 2 of C… 5 10.2 8.29 11.6 10.8 12.5
## 4 COBRA1 Negative elong… 3 9 6.35 13.4 14.8 14.9
## 5 HAUS6 Isoform 3 of H… 5 8.21 4.44 12.1 9.82 16.5
## 6 CHCHD4 Isoform 2 of M… 5 12.5 15.8 8.05 8.38 6.78
## 7 DHX30 Isoform 2 of P… 17 12.6 8.18 9.51 9.76 12.7
## 8 SLC12A2 Isoform 2 of S… 15 6.33 4.21 11.3 11.5 3.03
## 9 PTPRJ Receptor-type … 5 9.7 6.2 4.58 4.94 25.7
## 10 ATP6AP2 Renin receptor… 8 9.02 5.69 14.5 15.2 7.25
## # ℹ 5,134 more rows
## # ℹ 14 more variables: MDA231_2 <dbl>, MDA468_1 <dbl>, MDA468_2 <dbl>,
## # SKBR3_1 <dbl>, SKBR3_2 <dbl>, pvalue_MCF7_vs_MCF10A <dbl>,
## # pvalue_MDA231_vs_MCF10A <dbl>, pvalue_MDA468_vs_MCF10A <dbl>,
## # pvalue_SKBR3_vs_MCF10A <dbl>, mean_MCF10A <dbl>, mean_MCF7 <dbl>,
## # mean_MDA231 <dbl>, mean_MDA468 <dbl>, mean_SKBR3 <dbl>
#did it create your new columns?
## then make some new columns that store the log ratios of the variable means you just created, as compared to the control
BC_data_avg <- BC_data_avg %>% mutate(
log_MCF7 = log2(mean_MCF7 / mean_MCF10A),
log_MDA231 = log2(mean_MDA231 / mean_MCF10A),
log_MDA468 = log2(mean_MDA468 / mean_MCF10A),
log_SKBR3 = log2(mean_SKBR3 / mean_MCF10A)
)
print(BC_data_avg)
## # A tibble: 5,144 × 26
## Gene_Symbol Description Peptides MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NES Nestin OS=Homo… 7 9.54 4.58 5.07 5.42 25.4
## 2 ZC3HC1 Isoform 2 of N… 2 14 11.6 6.49 6.64 9.8
## 3 CAMSAP1L1 Isoform 2 of C… 5 10.2 8.29 11.6 10.8 12.5
## 4 COBRA1 Negative elong… 3 9 6.35 13.4 14.8 14.9
## 5 HAUS6 Isoform 3 of H… 5 8.21 4.44 12.1 9.82 16.5
## 6 CHCHD4 Isoform 2 of M… 5 12.5 15.8 8.05 8.38 6.78
## 7 DHX30 Isoform 2 of P… 17 12.6 8.18 9.51 9.76 12.7
## 8 SLC12A2 Isoform 2 of S… 15 6.33 4.21 11.3 11.5 3.03
## 9 PTPRJ Receptor-type … 5 9.7 6.2 4.58 4.94 25.7
## 10 ATP6AP2 Renin receptor… 8 9.02 5.69 14.5 15.2 7.25
## # ℹ 5,134 more rows
## # ℹ 18 more variables: MDA231_2 <dbl>, MDA468_1 <dbl>, MDA468_2 <dbl>,
## # SKBR3_1 <dbl>, SKBR3_2 <dbl>, pvalue_MCF7_vs_MCF10A <dbl>,
## # pvalue_MDA231_vs_MCF10A <dbl>, pvalue_MDA468_vs_MCF10A <dbl>,
## # pvalue_SKBR3_vs_MCF10A <dbl>, mean_MCF10A <dbl>, mean_MCF7 <dbl>,
## # mean_MDA231 <dbl>, mean_MDA468 <dbl>, mean_SKBR3 <dbl>, log_MCF7 <dbl>,
## # log_MDA231 <dbl>, log_MDA468 <dbl>, log_SKBR3 <dbl>
#did it create your new columns?
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's
##BASIC VOLCANO PLOT
## Add a column that stores the negative log of the pvalue of interest (in this case, pvalue_SKBR3_vs_MCF10A)
BC_data <- BC_data %>% mutate(neglog_SKBR3_vs_MCF10A = -log10(pvalue_SKBR3_vs_MCF10A))
## Add log2 fold change column for SKBR3 vs MCF10A
BC_data <- BC_data %>% mutate(log_SKBR3_vs_MCF10A = log2((SKBR3_1 + SKBR3_2) / 2) - log2((MCF10A_1 + MCF10A_2) / 2))
## Use ggplot to plot the log ratio of SKBR3_vs_MCF10A against the -log p-value pvalue_SKBR3_vs_MCF10A
volcano_plot <- BC_data %>% ggplot(aes(x=log_SKBR3_vs_MCF10A, y=neglog_SKBR3_vs_MCF10A, description=Gene_Symbol)) +
geom_point(alpha=0.7, color="royalblue")
volcano_plot
## BETTER VOLCANO PLOT
## Define the significant ones (by using ifelse and setting parameters) so they can be colored
BC_data <- BC_data %>% mutate(significance = ifelse((log_SKBR3_vs_MCF10A > 2 & neglog_SKBR3_vs_MCF10A > 2.99), "UP", ifelse((log_SKBR3_vs_MCF10A < -2 & neglog_SKBR3_vs_MCF10A > 2.99), "DOWN", "NOT SIG")))
## Some standard colors
plain_cols3 <- c("red", "gray", "blue")
## Volcano plot with added features
better_volcano_plot <- BC_data %>% ggplot(aes(x=log_SKBR3_vs_MCF10A, y=neglog_SKBR3_vs_MCF10A, description=Gene_Symbol, color=significance)) +
geom_point(alpha=0.7) +
scale_color_manual(values=plain_cols3) +
xlim(-6,6) +
theme_bw() +
theme(axis.text = element_text(colour = "black", size=14)) +
theme(text = element_text(size=14)) +
labs(x="log ratio SKBR3 vs MCF10A", y="-log(p-value)")
better_volcano_plot
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
library(plotly)
ggplotly(better_volcano_plot)
##INTERPRETATION##
## Significant up regulated are: GCLC, FNBP1L and KRT23
## Significant down regulated are: APOA1. HLA-A and MYO1B
#Why? How?
BC_data %>% filter(significance == "UP") %>% select(Gene_Symbol, log_SKBR3_vs_MCF10A, neglog_SKBR3_vs_MCF10A)
## # A tibble: 6 × 3
## Gene_Symbol log_SKBR3_vs_MCF10A neglog_SKBR3_vs_MCF10A
## <chr> <dbl> <dbl>
## 1 TDP2 2.64 3.04
## 2 DENND4C 2.25 3.16
## 3 KRT23 2.33 3.17
## 4 FNBP1L 2.20 3.27
## 5 GCLC 2.06 3.28
## 6 KMO 14.9 4.13
BC_data %>% filter(significance == "DOWN") %>% select(Gene_Symbol, log_SKBR3_vs_MCF10A, neglog_SKBR3_vs_MCF10A)
## # A tibble: 4 × 3
## Gene_Symbol log_SKBR3_vs_MCF10A neglog_SKBR3_vs_MCF10A
## <chr> <dbl> <dbl>
## 1 HMGN5 -2.16 3.01
## 2 MYO1B -2.57 3.08
## 3 HLA-A -4.51 3.44
## 4 APOA1 -5.09 3.99
# Make a pivot longer table of the BC data
BC_long <- pivot_longer(BC_data, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable') %>%
select(-c(pvalue_MCF7_vs_MCF10A:significance)) %>%
select(-Peptides)
head(BC_long)
## # A tibble: 6 × 4
## Gene_Symbol Description variable value
## <chr> <chr> <chr> <dbl>
## 1 NES Nestin OS=Homo sapiens GN=NES PE=1 SV=2 MCF10A_1 9.54
## 2 NES Nestin OS=Homo sapiens GN=NES PE=1 SV=2 MCF10A_2 4.58
## 3 NES Nestin OS=Homo sapiens GN=NES PE=1 SV=2 MCF7_1 5.07
## 4 NES Nestin OS=Homo sapiens GN=NES PE=1 SV=2 MCF7_2 5.42
## 5 NES Nestin OS=Homo sapiens GN=NES PE=1 SV=2 MDA231_1 25.4
## 6 NES Nestin OS=Homo sapiens GN=NES PE=1 SV=2 MDA231_2 27.4
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
BC_down <- BC_long %>% filter(Gene_Symbol %in% c("HMGN5", "MYO1B", "HLA-A", "APOA1"))
BC_up <- BC_long %>% filter(Gene_Symbol %in% c("TDP2", "DENND4C", "KRT23", "FNBP1L", "GCLC"))
plot_down <- BC_down %>%
ggplot(aes(x=variable, y=value)) +
geom_bar(stat="identity", fill="red") +
facet_wrap(~Gene_Symbol) +
theme_bw() +
theme(axis.text = element_text(colour = "black", size=10)) +
theme(text = element_text(size=14)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
labs(x="sample", y="relative intensity")
plot_down
#check viewer and / or plots to see it
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
## Same process for the upregulated ones
plot_up <- BC_up %>%
ggplot(aes(x=variable, y=value)) +
geom_bar(stat="identity", fill="royalblue") +
facet_wrap(~Gene_Symbol) +
theme_bw() +
theme(axis.text = element_text(colour = "black", size=10)) +
theme(text = element_text(size=14)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
labs(x="sample", y="relative intensity")
print(plot_up)
#check viewer and / or plots to see it
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#interpretation HINT:insert a chunk and create two seprate lines of code that filter for your specific upregulated genes/proteins of interest and selects for only their gene symbols and descriptions. Do this for the downregulated as well. This will generate two list of the descriptors for each gene of interest, helping you understand your figures. Be sure to view it, not just ask for the head of the table generated.
## Filter for upregulated gene descriptions
up_info <- BC_data %>%
filter(Gene_Symbol %in% c("TDP2", "DENND4C", "KRT23", "FNBP1L", "GCLC")) %>%
select(Gene_Symbol, Description)
up_info
## # A tibble: 5 × 2
## Gene_Symbol Description
## <chr> <chr>
## 1 TDP2 Isoform 2 of Tyrosyl-DNA phosphodiesterase 2 OS=Homo sapiens GN=T…
## 2 DENND4C Isoform 2 of DENN domain-containing protein 4C OS=Homo sapiens GN…
## 3 KRT23 Keratin, type I cytoskeletal 23 OS=Homo sapiens GN=KRT23 PE=1 SV=2
## 4 FNBP1L Isoform 2 of Formin-binding protein 1-like OS=Homo sapiens GN=FNB…
## 5 GCLC Glutamate--cysteine ligase catalytic subunit OS=Homo sapiens GN=G…
## Filter for downregulated gene descriptions
down_info <- BC_data %>%
filter(Gene_Symbol %in% c("HMGN5", "MYO1B", "HLA-A", "APOA1")) %>%
select(Gene_Symbol, Description)
down_info
## # A tibble: 7 × 2
## Gene_Symbol Description
## <chr> <chr>
## 1 HLA-A HLA class I histocompatibility antigen, A-23 alpha chain OS=Homo …
## 2 HLA-A HLA class I histocompatibility antigen, A-30 alpha chain OS=Homo …
## 3 HLA-A HLA class I histocompatibility antigen, A-2 alpha chain OS=Homo s…
## 4 HMGN5 High mobility group nucleosome-binding domain-containing protein …
## 5 MYO1B Isoform 2 of Myosin-Ib OS=Homo sapiens GN=MYO1B
## 6 HLA-A HLA class I histocompatibility antigen, A-1 alpha chain OS=Homo s…
## 7 APOA1 Apolipoprotein A-I OS=Homo sapiens GN=APOA1 PE=1 SV=1
##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!