####Run through this example and try to understand what is going on with the data

####So, lets load RMarkdown

library(rmarkdown)

set messages to FALSE on everything (prevents certain boring things from being shown in the results)

knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)

PACKAGES

library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)

COLORS

mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)

LOAD DATA AND MAKE AN OVERALL HEATMAP

## load the dataset 
breast_cancer<-read_csv(file = "breast_cancer_cells.csv") 

#and then view it 
head(breast_cancer)
## # A tibble: 6 × 17
##   Gene_Symbol Description      Peptides MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1
##   <chr>       <chr>               <dbl>    <dbl>    <dbl>  <dbl>  <dbl>    <dbl>
## 1 NES         Nestin OS=Homo …        7     9.54     4.58   5.07   5.42    25.4 
## 2 ZC3HC1      Isoform 2 of Nu…        2    14       11.6    6.49   6.64     9.8 
## 3 CAMSAP1L1   Isoform 2 of Ca…        5    10.2      8.29  11.6   10.8     12.5 
## 4 COBRA1      Negative elonga…        3     9        6.35  13.4   14.8     14.9 
## 5 HAUS6       Isoform 3 of HA…        5     8.21     4.44  12.1    9.82    16.5 
## 6 CHCHD4      Isoform 2 of Mi…        5    12.5     15.8    8.05   8.38     6.78
## # ℹ 9 more variables: MDA231_2 <dbl>, MDA468_1 <dbl>, MDA468_2 <dbl>,
## #   SKBR3_1 <dbl>, SKBR3_2 <dbl>, pvalue_MCF7_vs_MCF10A <dbl>,
## #   pvalue_MDA231_vs_MCF10A <dbl>, pvalue_MDA468_vs_MCF10A <dbl>,
## #   pvalue_SKBR3_vs_MCF10A <dbl>
colnames(breast_cancer)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"

now let’s visualize the dataset and look for initial trends. We can do this by making a matrix so and then a heatmap to visualize the data

#make a matrix of just raw the data
breast_cancer_matrix1<-breast_cancer %>% select(MCF10A_1:SKBR3_2) %>% as.matrix() %>% round(.,2)


# what does that look like?
head(breast_cancer_matrix1)
##      MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,]     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88
## [2,]    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75
## [3,]    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46
## [4,]     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92
## [5,]     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50
## [6,]    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18
##      SKBR3_1 SKBR3_2
## [1,]    8.46    5.64
## [2,]   11.91   13.67
## [3,]    9.10    9.43
## [4,]    8.54    6.82
## [5,]    7.93    4.75
## [6,]   13.27   15.05
## make a heatmap from the data
pheatmap(breast_cancer_matrix1,color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")



##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? 

## This heatmap allows us to visualize the changes in magnitude of the protein expression for the five breast cancer cell lines,the presence of two data sets (each in their own column) for each cell line helps us to determine the general trends that cell lines. Overal the repeated cell line data sets are similar which inidicates percise data, accuracy cannot be determined from looking at the general trends from the heatmap alone.

##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?  

## The control cell line, MCF10A, appears to have a different overall expression pattern compared with the breast cancer cell lines. This is what would be expected biologically, because MCF10A is a non-tumorigenic breast epithelial cell line, whereas the other cell lines are cancer-derived and are likely to show altered gene or protein expression. Based on the heatmap, SKBR3 appears to be more distinct from the other cell lines, suggesting that it may have a more unique expression profile. 

## this dataset compares the breast cell lines MCF10A, MCF-7, MDA231, MDA468, and SKBR3, and MCF10A is the non-tumorigenic comparison cell line. Therefore, differences between MCF10A and the cancer-derived cell lines would be biologically meaningful and expected. The heatmap is meant to help identify which cell line appears most distinct and should be explored further in later analyses.

## In the literature, I would look for studies comparing normal breast epithelial cells with breast cancer cell lines, and for papers describing the molecular characteristics of MCF7, MDA-MB-231, and SKBR3, to better explain why their expression patterns differ.


#looks like the MDA231 line is different than the other cells lines.  We will take a closer look at that in the next section, once we've done some further data manipulations

Data Manipulation


## first, make new columns that combine the two reps for each variable by averaging them  
breast_cancer<-breast_cancer%>% mutate(
  mean_of_MCF10A= ((MCF10A_1 + MCF10A_2)/2),
  mean_of_MCF7= ((MCF7_1 + MCF7_2)/2), 
  mean_of_MDA231= ((MDA231_1 + MDA231_2)/2), 
  mean_of_MDA468= ((MDA468_1 + MDA468_2)/2),
  mean_of_SKBR3= ((SKBR3_1 + SKBR3_2)/2))

#did it create your new columns?
colnames(breast_cancer)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_of_MCF10A"         
## [19] "mean_of_MCF7"            "mean_of_MDA231"         
## [21] "mean_of_MDA468"          "mean_of_SKBR3"

## then make some new columns that store the log ratios of the variable means you just created, as compared to the control

breast_cancer<-breast_cancer %>% mutate(
  log_of_MCF7=log2(mean_of_MCF7/mean_of_MCF10A), 
  log_of_MDA231=log2(mean_of_MDA231/mean_of_MCF10A), 
  log_of_MDA468=log2(mean_of_MDA468/mean_of_MCF10A),
  log_of_SKBR3=log2(mean_of_SKBR3/mean_of_MCF10A))
 

#did it create your new columns?
colnames(breast_cancer)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_of_MCF10A"         
## [19] "mean_of_MCF7"            "mean_of_MDA231"         
## [21] "mean_of_MDA468"          "mean_of_SKBR3"          
## [23] "log_of_MCF7"             "log_of_MDA231"          
## [25] "log_of_MDA468"           "log_of_SKBR3"
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's

##INTERPRETATION##
## This second heatmap uses log ratios of each cancer cell line relative to the MCF10A control, which provides a more focused visualization, showing how strongly each protein differs from the non-tumorigenic comparison cells.

breast_cancer_matrix2<-breast_cancer %>% select(log_of_MCF7:log_of_SKBR3) %>% as.matrix() %>% round(.,2)

pheatmap(breast_cancer_matrix2,color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

Diving deeping with VOLCANO PLOTS

##BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest (in this case, MCF7)
breast_cancer<-breast_cancer %>% mutate(
  negative_log_of_MCF7=-log10(pvalue_MCF7_vs_MCF10A), 
  negative_log_of_MDA231=-log10(pvalue_MDA231_vs_MCF10A),
  negative_log_of_MDA468=-log10(pvalue_MDA468_vs_MCF10A),
  negative_log_of_SKBR3=-log10(pvalue_SKBR3_vs_MCF10A))

colnames(breast_cancer)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_of_MCF10A"         
## [19] "mean_of_MCF7"            "mean_of_MDA231"         
## [21] "mean_of_MDA468"          "mean_of_SKBR3"          
## [23] "log_of_MCF7"             "log_of_MDA231"          
## [25] "log_of_MDA468"           "log_of_SKBR3"           
## [27] "negative_log_of_MCF7"    "negative_log_of_MDA231" 
## [29] "negative_log_of_MDA468"  "negative_log_of_SKBR3"

## Use ggplot to plot the log ratio of MCF7 against the -log p-value MCF7

volcano_plot_of_MCF7<-breast_cancer %>% ggplot(aes(x=log_of_MCF7,y=negative_log_of_MCF7,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="cyan")

volcano_plot_of_MDA231<-breast_cancer %>% ggplot(aes(x=log_of_MDA231,y=negative_log_of_MDA231,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="orange")

volcano_plot_of_MDA468<-breast_cancer%>% ggplot(aes(x=log_of_MDA468,y=negative_log_of_MDA468,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="green")

volcano_plot_of_SKBR3<-breast_cancer %>% ggplot(aes(x=log_of_SKBR3,y=negative_log_of_SKBR3,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="pink")
#to view it, type: 
volcano_plot_of_MCF7


volcano_plot_of_MDA231


volcano_plot_of_MDA468


volcano_plot_of_SKBR3


  
## BETTER VOLCANO PLOT

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
breast_cancer<-breast_cancer %>% mutate(significance=ifelse((log_of_MCF7>3 & negative_log_of_MCF7>3.99),"UP", ifelse((log_of_MCF7<c(-3) & negative_log_of_MCF7>3.99),"DOWN","NOT SIG")))

breast_cancer<-breast_cancer %>% mutate(significance=ifelse((log_of_MDA231>2 & negative_log_of_MDA231>2.75),"UP", ifelse((log_of_MDA231<c(-3) & negative_log_of_MDA231>2.99),"DOWN","NOT SIG")))

breast_cancer<-breast_cancer %>% mutate(significance=ifelse((log_of_MDA468>3.6 & negative_log_of_MDA468>3.6),"UP", ifelse((log_of_MDA468<c(-3) & negative_log_of_MDA468>3.5),"DOWN","NOT SIG")))

breast_cancer<-breast_cancer %>% mutate(significance=ifelse((log_of_SKBR3>2.7 & negative_log_of_SKBR3>2.7),"UP", ifelse((log_of_SKBR3<c(-2.5) & negative_log_of_SKBR3>2.5),"DOWN","NOT SIG")))
## Some standard colors
col1<-c("brown","gray", "orange")

col2<-c("green","gray","red")

col3<-c("pink","gray","blue")

col4<-c("purple","gray","yellow")
## volcano plot as before with some added things
better_volcano_plot_of_MCF7<-breast_cancer %>% ggplot(aes(x=log_of_MCF7,y=negative_log_of_MCF7,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=col1)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of MCF7 compared to control, MCF10A",y="-log(p-value)")

better_volcano_plot_of_MDA231<-breast_cancer %>% ggplot(aes(x=log_of_MDA231,y=negative_log_of_MDA231,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=col2)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of MDA231 compared to control, MCF10A",y="-log(p-value)")
  
better_volcano_plot_of_MDA468<-breast_cancer%>% ggplot(aes(x=log_of_MDA468,y=negative_log_of_MDA468,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=col3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of MDA468 compared to control, MCF10A",y="-log(p-value)")

better_volcano_plot_of_SKBR3<-breast_cancer %>% ggplot(aes(x=log_of_SKBR3,y=negative_log_of_SKBR3,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=col4)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of SKBR3 compared to control, MCF10A",y="-log(p-value)")

#to view it, type
better_volcano_plot_of_MCF7


better_volcano_plot_of_MDA231


better_volcano_plot_of_MDA468


better_volcano_plot_of_SKBR3

#check viewer and / or plots to see it 
## use ggplotly to see hover over the points to see the gene names. Record these for the next step 
ggplotly(better_volcano_plot_of_MCF7)

ggplotly(better_volcano_plot_of_MDA231)

ggplotly(better_volcano_plot_of_MDA468)

ggplotly(better_volcano_plot_of_SKBR3)
##INTERPRETATION## FOR MCF7
## Significant up regulated are: ASS1, HOOK1, ATP2A3 and EFHD1
## Significant down regulated are: APOA1. TMX4, MYO1B and HLA-A

##INTERPRETATION## FOR MDA231
## Significant up regulated are: HOOK1, ASS1, ATP2A3 and EFHD1
## Significant down regulated are: MYO1B, HLA-A, APOA1 and TMX4

##INTERPRETATION## FOR MDA468
## Significant up regulated are: ASS1, EFHD1, HOOK1 and ATP2A3
## Significant down regulated are: TMX4, MYO1B and HLA-A

##INTERPRETATION## FOR SKBR3
## Significant up regulated are: HOOK1, EFHD1, ATP2A3 and ASS1
## Significant down regulated are: HLA-A, APOA1, TMX4 and MYO1B

#Why? How?

Barplots of significant points of interest

EXAMPLES OF A COUPLE PROTEINS or GENES

# Make a pivot longer table of the C19 data for Control_2h_1:Virus_24h_2
breast_cancer_longer<-pivot_longer(breast_cancer, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))%>%select(-Description, -Peptides)


head(breast_cancer_longer)
## # A tibble: 6 × 3
##   Gene_Symbol variable value
##   <chr>       <chr>    <dbl>
## 1 NES         MCF10A_1  9.54
## 2 NES         MCF10A_2  4.58
## 3 NES         MCF7_1    5.07
## 4 NES         MCF7_2    5.42
## 5 NES         MDA231_1 25.4 
## 6 NES         MDA231_2 27.4

## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer

Examples_for_down<-breast_cancer_longer %>% filter(Gene_Symbol=="APOA1"| Gene_Symbol=="HLA-A"| Gene_Symbol=="PLAU"| Gene_Symbol=="TMX4"| Gene_Symbol=="MYO1B"| Gene_Symbol=="HIST1H1A"| Gene_Symbol=="PTGFRN"| Gene_Symbol=="CTSZ")

  
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)

Examples_for_down<-Examples_for_down %>% 
  ggplot(aes(x=variable,y=value))+ 
  geom_bar(stat="identity",fill="orange")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "brown",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")



Examples_for_down

  


#check viewer and / or plots to see it 

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

## Same process for the upregulated ones

Examples_for_up<-breast_cancer_longer %>% filter(Gene_Symbol=="ALDH1A3"| Gene_Symbol=="PAFAH1B3" | Gene_Symbol=="NLRP2" | Gene_Symbol=="ATP2A3" | Gene_Symbol=="ACSS3" | Gene_Symbol=="SYT7"| Gene_Symbol=="ASS1"| Gene_Symbol=="SYT7" | Gene_Symbol=="SMEK2" | Gene_Symbol=="AKR1C2"| Gene_Symbol=="CORO1A"| Gene_Symbol=="EFHD1"| Gene_Symbol=="C17orf28"| Gene_Symbol=="HOOK1"| Gene_Symbol=="AGR3") 

Examples_for_up<-Examples_for_up %>% 
  ggplot(aes(x=variable,y=value))+ 
  geom_bar(stat="identity",fill="brown")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")





Examples_for_up

  


#check viewer and / or plots to see it 

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

#interpretation HINT:insert a chunk and create two seprate lines of code that filter for your specific upregulated genes/proteins of interest and selects for only their gene symbols and descriptions. Do this for the downregulated as well. This will generate two list of the descriptors for each gene of interest, helping you understand your figures. Be sure to view it, not just ask for the head of the table generated.

up_regulated_genes<-breast_cancer %>% filter(Gene_Symbol=="ALDH1A3"| Gene_Symbol=="PAFAH1B3" | Gene_Symbol=="NLRP2" | Gene_Symbol=="ATP2A3" | Gene_Symbol=="ACSS3" | Gene_Symbol=="SYT7"| Gene_Symbol=="ASS1"| Gene_Symbol=="SYT7" | Gene_Symbol=="SMEK2" | Gene_Symbol=="AKR1C2"| Gene_Symbol=="CORO1A"| Gene_Symbol=="EFHD1"| Gene_Symbol=="C17orf28"| Gene_Symbol=="HOOK1"| Gene_Symbol=="AGR3") %>% select(Gene_Symbol, Description)

view(up_regulated_genes)

head(up_regulated_genes)
## # A tibble: 6 × 2
##   Gene_Symbol Description                                                       
##   <chr>       <chr>                                                             
## 1 AGR3        Anterior gradient protein 3 homolog OS=Homo sapiens GN=AGR3 PE=1 …
## 2 AKR1C2      Aldo-keto reductase family 1 member C2 OS=Homo sapiens GN=AKR1C2 …
## 3 NLRP2       Isoform 2 of NACHT, LRR and PYD domains-containing protein 2 OS=H…
## 4 SYT7        Synaptotagmin-7 OS=Homo sapiens GN=SYT7 PE=1 SV=3                 
## 5 ALDH1A3     Aldehyde dehydrogenase family 1 member A3 OS=Homo sapiens GN=ALDH…
## 6 ACSS3       Isoform 2 of Acyl-CoA synthetase short-chain family member 3, mit…
down_regulated_genes<-breast_cancer %>% filter(Gene_Symbol=="APOA1"| Gene_Symbol=="HLA-A"| Gene_Symbol=="PLAU"| Gene_Symbol=="TMX4"| Gene_Symbol=="MYO1B"| Gene_Symbol=="HIST1H1A"| Gene_Symbol=="PTGFRN"| Gene_Symbol=="CTSZ") %>% select(Gene_Symbol, Description)

view(down_regulated_genes)

head(down_regulated_genes)
## # A tibble: 6 × 2
##   Gene_Symbol Description                                                       
##   <chr>       <chr>                                                             
## 1 HLA-A       HLA class I histocompatibility antigen, A-23 alpha chain OS=Homo …
## 2 HLA-A       HLA class I histocompatibility antigen, A-30 alpha chain OS=Homo …
## 3 HLA-A       HLA class I histocompatibility antigen, A-2 alpha chain OS=Homo s…
## 4 CTSZ        Cathepsin Z OS=Homo sapiens GN=CTSZ PE=1 SV=1                     
## 5 PLAU        Isoform 2 of Urokinase-type plasminogen activator OS=Homo sapiens…
## 6 HIST1H1A    Histone H1.1 OS=Homo sapiens GN=HIST1H1A PE=1 SV=3

WRAP UP

##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!