####Run through this example and try to understand what is going on with the data

####So, lets load RMarkdown

library(rmarkdown)

set messages to FALSE on everything (prevents certain boring things from being shown in the results)

knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)

PACKAGES

library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)

COLORS

mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)

LOAD DATA AND MAKE AN OVERALL HEATMAP

## load the dataset 
breastcancercells_data<-read_csv(file="breast_cancer_cells.csv")

#and then view it 
view(breastcancercells_data)

now let’s visualize the dataset and look for initial trends. We can do this by making a matrix so and then a heatmap to visualize the data

#make a matrix of just raw the data
breastcancercells_mat1<-breastcancercells_data %>% select(MCF10A_1:SKBR3_2) %>% as.matrix() %>% round(.,2)


# what does that look like?
head(breastcancercells_mat1)
##      MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,]     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88
## [2,]    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75
## [3,]    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46
## [4,]     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92
## [5,]     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50
## [6,]    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18
##      SKBR3_1 SKBR3_2
## [1,]    8.46    5.64
## [2,]   11.91   13.67
## [3,]    9.10    9.43
## [4,]    8.54    6.82
## [5,]    7.93    4.75
## [6,]   13.27   15.05
## make a heatmap from the data
pheatmap(breastcancercells_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")


##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? 
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?  

#looks like _______ line  is different than the other cells lines.  We will take a closer look at that in the next section, once we've done some further data manipulations

Data Manipulation


## first, make new columns that combine the two reps for each variable by averaging them  

breastcancercells_data2<-breastcancercells_data %>% mutate(
  mean_MCF10A = ((MCF10A_1 + MCF10A_2)/2),
  mean_MCF7= ((MCF7_1 + MCF7_2)/2), 
  mean_MDA231= ((MDA231_1 + MDA231_2)/2), 
  mean_MDA468= ((MDA468_1 + MDA468_2)/2),
  mean_SKBR3= ((SKBR3_1 + SKBR3_2)/2)) 

view(breastcancercells_data2)
 
#did it create your new columns?

## then make some new columns that store the log ratios of the variable means you just created, as compared to the control

breastcancercells_data2<-breastcancercells_data2 %>% mutate(
  log_MCF7=log2(mean_MCF7/mean_MCF10A), 
  log_MDA231=log2(mean_MDA231/mean_MCF10A), 
  log_MDA468=log2(mean_MDA468/mean_MCF10A),
  log_SKBR3=log2(mean_SKBR3/mean_MCF10A)) 
 
colnames(breastcancercells_data2)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_MCF10A"            
## [19] "mean_MCF7"               "mean_MDA231"            
## [21] "mean_MDA468"             "mean_SKBR3"             
## [23] "log_MCF7"                "log_MDA231"             
## [25] "log_MDA468"              "log_SKBR3"
 

#did it create your new columns?
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's

breastcancercells_mat2<-breastcancercells_data2 %>% select(log_MCF7:log_SKBR3) %>% as.matrix() %>% round(.,2)

pheatmap(breastcancercells_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

Diving deeping with VOLCANO PLOTS

#SKBR3 vs MCF10A

##BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest (in this case, _____)
breastcancercells_data2<-breastcancercells_data2 %>% mutate(neglog_SKBR3=-log10(pvalue_SKBR3_vs_MCF10A))


## Use ggplot to plot the log ratio of ____ against the -log p-value ____
volcano_plot<-breastcancercells_data2 %>% ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="royalblue")

#to view it, type: 
volcano_plot


  
## BETTER VOLCANO PLOT
install.packages("ggrepel")
library("ggrepel")

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
breastcancercells_data2<-breastcancercells_data2 %>% mutate(significance=ifelse((log_SKBR3>2 & neglog_SKBR3>2.99),"UP", ifelse((log_SKBR3<c(-2) & neglog_SKBR3>2.99),"DOWN","NOT SIG")))

## Some standard colors
plain_cols3<-c("red","gray","blue")

## volcano plot as before with some added things
SKBR3_volcano_plot<-breastcancercells_data2 %>% ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
   geom_text_repel(box.padding = 0.4, size = 3, aes(label=ifelse((log_SKBR3>2 & neglog_SKBR3>2.99),Gene_Symbol,"")))+
   geom_text_repel(box.padding = 0.4, size = 3, aes(label=ifelse((log_SKBR3<c(-2) & neglog_SKBR3>2.99),Gene_Symbol,"")))+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log2 (mean SKBR3/mean MCF10A",y="-log10(p-value SKBR3 vs MCF10A)")
  
#to view it, type
SKBR3_volcano_plot

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(SKBR3_volcano_plot)

##INTERPRETATION##
## Significant up regulated are: ___, ___ and _____
## Significant down regulated are: _____. _____ and ____
#Why? How?

Barplots of significant points of interest

EXAMPLES OF A COUPLE PROTEINS or GENES

# Make a pivot longer table of the breastcancercells data for MCF10A_1:SKBR3_2


SKBR3_long<-pivot_longer(breastcancercells_data2, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_SKBR3_vs_MCF10A:significance))

head(SKBR3_long)
## # A tibble: 6 × 8
##   Gene_Symbol Description  Peptides pvalue_MCF7_vs_MCF10A pvalue_MDA231_vs_MCF…¹
##   <chr>       <chr>           <dbl>                 <dbl>                  <dbl>
## 1 NES         Nestin OS=H…        7                 0.542                 0.0185
## 2 NES         Nestin OS=H…        7                 0.542                 0.0185
## 3 NES         Nestin OS=H…        7                 0.542                 0.0185
## 4 NES         Nestin OS=H…        7                 0.542                 0.0185
## 5 NES         Nestin OS=H…        7                 0.542                 0.0185
## 6 NES         Nestin OS=H…        7                 0.542                 0.0185
## # ℹ abbreviated name: ¹​pvalue_MDA231_vs_MCF10A
## # ℹ 3 more variables: pvalue_MDA468_vs_MCF10A <dbl>, variable <chr>,
## #   value <dbl>

## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer

breastcancercellsSKBR3_down<-SKBR3_long %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A" | Gene_Symbol=="MYO1B"| Gene_Symbol=="HMGN5") 

head(breastcancercellsSKBR3_down)
## # A tibble: 6 × 8
##   Gene_Symbol Description  Peptides pvalue_MCF7_vs_MCF10A pvalue_MDA231_vs_MCF…¹
##   <chr>       <chr>           <dbl>                 <dbl>                  <dbl>
## 1 HLA-A       HLA class I…        3                0.0312                  0.122
## 2 HLA-A       HLA class I…        3                0.0312                  0.122
## 3 HLA-A       HLA class I…        3                0.0312                  0.122
## 4 HLA-A       HLA class I…        3                0.0312                  0.122
## 5 HLA-A       HLA class I…        3                0.0312                  0.122
## 6 HLA-A       HLA class I…        3                0.0312                  0.122
## # ℹ abbreviated name: ¹​pvalue_MDA231_vs_MCF10A
## # ℹ 3 more variables: pvalue_MDA468_vs_MCF10A <dbl>, variable <chr>,
## #   value <dbl>

  
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
breastcancercellsSKBR3_plot_down<-breastcancercellsSKBR3_down %>% 
  ggplot(aes(x=factor(variable,levels=c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDA468_1','MDA468_2','SKBR3_1','SKBR3_2')),y=value))+ 
  geom_bar(stat="identity",fill="red")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="cell line",y="relative expression")
  
breastcancercellsSKBR3_plot_down



#check viewer and / or plots to see it 

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

## Same process for the upregulated ones

breastcancercellsSKBR3_up<-SKBR3_long %>% filter(Gene_Symbol=="GCLC" | Gene_Symbol=="FNBP1L" | Gene_Symbol=="KRT23"| Gene_Symbol=="TDP2"| Gene_Symbol=="DENND4C") 

head(breastcancercellsSKBR3_up)
## # A tibble: 6 × 8
##   Gene_Symbol Description  Peptides pvalue_MCF7_vs_MCF10A pvalue_MDA231_vs_MCF…¹
##   <chr>       <chr>           <dbl>                 <dbl>                  <dbl>
## 1 TDP2        Isoform 2 o…        3                0.0400                 0.0281
## 2 TDP2        Isoform 2 o…        3                0.0400                 0.0281
## 3 TDP2        Isoform 2 o…        3                0.0400                 0.0281
## 4 TDP2        Isoform 2 o…        3                0.0400                 0.0281
## 5 TDP2        Isoform 2 o…        3                0.0400                 0.0281
## 6 TDP2        Isoform 2 o…        3                0.0400                 0.0281
## # ℹ abbreviated name: ¹​pvalue_MDA231_vs_MCF10A
## # ℹ 3 more variables: pvalue_MDA468_vs_MCF10A <dbl>, variable <chr>,
## #   value <dbl>


breastcancercellsSKBR3_plot_up <-breastcancercellsSKBR3_up %>% ggplot(aes(x=factor(variable,levels=c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDA468_1','MDA468_2','SKBR3_1','SKBR3_2')),y=value))+ 
  geom_bar(stat="identity",fill="royalblue")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="cell line",y="relative expression")
  
breastcancercellsSKBR3_plot_up




#check viewer and / or plots to see it 

##MDA231 vs MCF10A volcano plot


##BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest (in this case, _____)
breastcancercells_data3<-breastcancercells_data2 %>% mutate(neglog_MDA231=-log10(pvalue_MDA231_vs_MCF10A))


## Use ggplot to plot the log ratio of ____ against the -log p-value ____
volcano_plot2<-breastcancercells_data3 %>% ggplot(aes(x=log_MDA231,y=neglog_MDA231,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="royalblue")

#to view it, type: 
volcano_plot2


## BETTER VOLCANO PLOT

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
breastcancercells_data3<-breastcancercells_data3 %>% mutate(significance=ifelse((log_MDA231>1.87 & neglog_MDA231>2.87),"UP", ifelse((log_MDA231<c(-2.27) & neglog_MDA231>2.87),"DOWN","NOT SIG")))

## Some standard colors
plain_cols3<-c("red","gray","blue")

## volcano plot as before with some added things
MDA231_volcano_plot<-breastcancercells_data3 %>% ggplot(aes(x=log_MDA231,y=neglog_MDA231,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
   geom_text_repel(box.padding = 0.4, size = 3, aes(label=ifelse((log_MDA231>1.87 & neglog_MDA231>2.87),Gene_Symbol,"")))+
   geom_text_repel(box.padding = 0.4, size = 3, aes(label=ifelse((log_MDA231<c(-2.27) & neglog_MDA231>2.87),Gene_Symbol,"")))+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log2 (mean MDA231/mean MCF10A",y="-log10(p-value MDA231 vs MCF10A)")
  
#to view it, type
MDA231_volcano_plot

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(MDA231_volcano_plot)

##INTERPRETATION##
## Significant up regulated are: ___, ___ and _____
## Significant down regulated are: _____. _____ and ____
#Why? How?

##MDA231 vs MCF10A bar plots of significant points of interest

# Make a pivot longer table of the breastcancercells data for MCF10A_1:SKBR3_2


MDA231_long<-pivot_longer(breastcancercells_data3, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MDA231_vs_MCF10A:significance))

head(MDA231_long)
## # A tibble: 6 × 7
##   Gene_Symbol Description  Peptides pvalue_MCF7_vs_MCF10A neglog_MDA231 variable
##   <chr>       <chr>           <dbl>                 <dbl>         <dbl> <chr>   
## 1 NES         Nestin OS=H…        7                 0.542          1.73 MCF10A_1
## 2 NES         Nestin OS=H…        7                 0.542          1.73 MCF10A_2
## 3 NES         Nestin OS=H…        7                 0.542          1.73 MCF7_1  
## 4 NES         Nestin OS=H…        7                 0.542          1.73 MCF7_2  
## 5 NES         Nestin OS=H…        7                 0.542          1.73 MDA231_1
## 6 NES         Nestin OS=H…        7                 0.542          1.73 MDA231_2
## # ℹ 1 more variable: value <dbl>
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer

breastcancercells_MDA231_down<-MDA231_long %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A" | Gene_Symbol=="HIST1H1A"| Gene_Symbol=="ADCK3"| Gene_Symbol=="PTGFRN"| Gene_Symbol=="F11R") 

head(breastcancercells_MDA231_down)
## # A tibble: 6 × 7
##   Gene_Symbol Description  Peptides pvalue_MCF7_vs_MCF10A neglog_MDA231 variable
##   <chr>       <chr>           <dbl>                 <dbl>         <dbl> <chr>   
## 1 HLA-A       HLA class I…        3                0.0312         0.915 MCF10A_1
## 2 HLA-A       HLA class I…        3                0.0312         0.915 MCF10A_2
## 3 HLA-A       HLA class I…        3                0.0312         0.915 MCF7_1  
## 4 HLA-A       HLA class I…        3                0.0312         0.915 MCF7_2  
## 5 HLA-A       HLA class I…        3                0.0312         0.915 MDA231_1
## 6 HLA-A       HLA class I…        3                0.0312         0.915 MDA231_2
## # ℹ 1 more variable: value <dbl>

  
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
breastcancercells_MDA231_plot_down<-breastcancercells_MDA231_down %>% 
  ggplot(aes(x=factor(variable,levels=c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDA468_1','MDA468_2','SKBR3_1','SKBR3_2')),y=value))+ 
  geom_bar(stat="identity",fill="red")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="cell line",y="relative expression")
  
breastcancercells_MDA231_plot_down

#check viewer and / or plots to see it

## Same process for the upregulated ones

breastcancercells_MDA231_up<-MDA231_long %>% filter(Gene_Symbol=="ACSS3" | Gene_Symbol=="ABHD14B" | Gene_Symbol=="PNPLA2") 

head(breastcancercells_MDA231_up)
## # A tibble: 6 × 7
##   Gene_Symbol Description  Peptides pvalue_MCF7_vs_MCF10A neglog_MDA231 variable
##   <chr>       <chr>           <dbl>                 <dbl>         <dbl> <chr>   
## 1 ABHD14B     Abhydrolase…        4               0.00136          3.29 MCF10A_1
## 2 ABHD14B     Abhydrolase…        4               0.00136          3.29 MCF10A_2
## 3 ABHD14B     Abhydrolase…        4               0.00136          3.29 MCF7_1  
## 4 ABHD14B     Abhydrolase…        4               0.00136          3.29 MCF7_2  
## 5 ABHD14B     Abhydrolase…        4               0.00136          3.29 MDA231_1
## 6 ABHD14B     Abhydrolase…        4               0.00136          3.29 MDA231_2
## # ℹ 1 more variable: value <dbl>


breastcancercells_MDA231_plot_up <-breastcancercells_MDA231_up %>% ggplot(aes(x=factor(variable,levels=c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDA468_1','MDA468_2','SKBR3_1','SKBR3_2')),y=value))+ 
  geom_bar(stat="identity",fill="royalblue")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="cell line",y="relative expression")
  
breastcancercells_MDA231_plot_up




#check viewer and / or plots to see it 

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

#interpretation HINT:insert a chunk and create two seprate lines of code that filter for your specific upregulated genes/proteins of interest and selects for only their gene symbols and descriptions. Do this for the downregulated as well. This will generate two list of the descriptors for each gene of interest, helping you understand your figures. Be sure to view it, not just ask for the head of the table generated.

up_SKBR3 <- breastcancercells_data %>% select(Gene_Symbol, Description) %>% filter(Gene_Symbol=="FNBP1L" | Gene_Symbol=="GCLC" | Gene_Symbol=="KRT23" | Gene_Symbol=="TDP2" | Gene_Symbol=="DENND4C")

down_SKBR3  <- breastcancercells_data %>% select(Gene_Symbol, Description) %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A" | Gene_Symbol=="MYO1B"| Gene_Symbol=="HMGN5") %>% filter(Description =="Apolipoprotein A-I OS=Homo sapiens GN=APOA1 PE=1 SV=1" | Description =="Isoform 2 of Myosin-Ib OS=Homo sapiens GN=MYO1B"| Description =="High mobility group nucleosome-binding domain-containing protein 5 OS=Homo sapiens GN=HMGN5 PE=1 SV=1"| Description == "HLA class I histocompatibility antigen, A-1 alpha chain OS=Homo sapiens GN=HLA-A PE=1 SV=1" )

up_MDA231 <- breastcancercells_data %>% select(Gene_Symbol, Description) %>% filter(Gene_Symbol=="ACSS3" | Gene_Symbol=="ABHD14B" | Gene_Symbol=="PNPLA2") 

down_MDA231  <- breastcancercells_data %>% select(Gene_Symbol, Description) %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A" | Gene_Symbol=="HIST1H1A"| Gene_Symbol=="ADCK3"| Gene_Symbol=="PTGFRN"| Gene_Symbol=="F11R")  %>% filter(Description =="Apolipoprotein A-I OS=Homo sapiens GN=APOA1 PE=1 SV=1" | Description =="HLA class I histocompatibility antigen, A-1 alpha chain OS=Homo sapiens GN=HLA-A PE=1 SV=1"| Description =="Histone H1.1 OS=Homo sapiens GN=HIST1H1A PE=1 SV=3"| Description == "Isoform 3 of Chaperone activity of bc1 complex-like, mitochondrial OS=Homo sapiens GN=ADCK3" | Description == "Prostaglandin F2 receptor negative regulator OS=Homo sapiens GN=PTGFRN PE=1 SV=2" | Description == "Junctional adhesion molecule A OS=Homo sapiens GN=F11R PE=1 SV=1" )

WRAP UP

##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!