####Run through this example and try to understand what is going on with the data

####So, lets load RMarkdown

library(rmarkdown)

set messages to FALSE on everything (prevents certain boring things from being shown in the results)

knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)

PACKAGES

library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)

COLORS

mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)

LOAD DATA AND MAKE AN OVERALL HEATMAP

## load the dataset 

bc_data<-read_csv(file = "breast_cancer_cells.csv")

head(bc_data)
## # A tibble: 6 × 17
##   Gene_S…¹ Descr…² Pepti…³ MCF10…⁴ MCF10…⁵ MCF7_1 MCF7_2 MDA23…⁶ MDA23…⁷ MDA46…⁸
##   <chr>    <chr>     <dbl>   <dbl>   <dbl>  <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
## 1 NES      Nestin…       7    9.54    4.58   5.07   5.42   25.4    27.4     4.56
## 2 ZC3HC1   Isofor…       2   14      11.6    6.49   6.64    9.8    10.3     6.84
## 3 CAMSAP1… Isofor…       5   10.2     8.29  11.6   10.8    12.5    10.1     9.54
## 4 COBRA1   Negati…       3    9       6.35  13.4   14.8    14.9    11.3     6.87
## 5 HAUS6    Isofor…       5    8.21    4.44  12.1    9.82   16.5    12.3    15.4 
## 6 CHCHD4   Isofor…       5   12.5    15.8    8.05   8.38    6.78    8.46    4.48
## # … with 7 more variables: MDA468_2 <dbl>, SKBR3_1 <dbl>, SKBR3_2 <dbl>,
## #   pvalue_MCF7_vs_MCF10A <dbl>, pvalue_MDA231_vs_MCF10A <dbl>,
## #   pvalue_MDA468_vs_MCF10A <dbl>, pvalue_SKBR3_vs_MCF10A <dbl>, and
## #   abbreviated variable names ¹​Gene_Symbol, ²​Description, ³​Peptides,
## #   ⁴​MCF10A_1, ⁵​MCF10A_2, ⁶​MDA231_1, ⁷​MDA231_2, ⁸​MDA468_1

colnames(bc_data)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"
## make a matrix of just original raw the data


bc_mat1<-bc_data %>% select(MCF10A_1:SKBR3_2) %>% as.matrix() %>% round(.,2)

head(bc_mat1)
##      MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,]     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88
## [2,]    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75
## [3,]    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46
## [4,]     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92
## [5,]     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50
## [6,]    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18
##      SKBR3_1 SKBR3_2
## [1,]    8.46    5.64
## [2,]   11.91   13.67
## [3,]    9.10    9.43
## [4,]    8.54    6.82
## [5,]    7.93    4.75
## [6,]   13.27   15.05
## make a heatmap from the data

pheatmap(bc_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?


## make some new columns that take the average of the pairs of observations for each variable  

bc_data <- bc_data %>% mutate(
  mean_MCF10A= ((MCF10A_1 + MCF10A_2)/2),
  mean_MCF7= ((MCF7_1 + MCF7_2)/2), 
  mean_MDA231= ((MDA231_1 + MDA231_2)/2), 
  mean_MDA468= ((MDA468_1 + MDA468_2)/2),
  mean_SKBR3= ((SKBR3_1 + SKBR3_2)/2))

colnames(bc_data)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_MCF10A"            
## [19] "mean_MCF7"               "mean_MDA231"            
## [21] "mean_MDA468"             "mean_SKBR3"

## make some new columns that store the log ratios of the means you just created  

bc_data<-bc_data %>% mutate(
  log_MCF7=log2(mean_MCF7/mean_MCF10A), 
  log_MDA231=log2(mean_MDA231/mean_MCF10A), 
  log_MDA468=log2(mean_MDA468/mean_MCF10A),
  log_SKBR3=log2(mean_SKBR3/mean_MCF10A)) 
 
colnames(bc_data)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_MCF10A"            
## [19] "mean_MCF7"               "mean_MDA231"            
## [21] "mean_MDA468"             "mean_SKBR3"             
## [23] "log_MCF7"                "log_MDA231"             
## [25] "log_MDA468"              "log_SKBR3"
##Alternative Heatmap of log values compared to control 

bc_mat2<-bc_data %>% select(log_MCF7:log_SKBR3) %>% as.matrix() %>% round(.,2)

pheatmap(bc_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

PLOTTING A VOLCANO

##Preview of data via BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest 
bc_data<-bc_data %>% mutate(
  neglog_MCF7=-log10(pvalue_MCF7_vs_MCF10A), 
  neglog_MDA231=-log10(pvalue_MDA231_vs_MCF10A),
  neglog_MDA468=-log10(pvalue_MDA468_vs_MCF10A),
  neglog_SKBR3=-log10(pvalue_SKBR3_vs_MCF10A))

colnames(bc_data)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_MCF10A"            
## [19] "mean_MCF7"               "mean_MDA231"            
## [21] "mean_MDA468"             "mean_SKBR3"             
## [23] "log_MCF7"                "log_MDA231"             
## [25] "log_MDA468"              "log_SKBR3"              
## [27] "neglog_MCF7"             "neglog_MDA231"          
## [29] "neglog_MDA468"           "neglog_SKBR3"
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_MCF7<-bc_data %>% ggplot(aes(x=log_MCF7,y=neglog_MCF7,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="royalblue")

volcano_plot_MCF7

## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_MDA231<-bc_data %>% ggplot(aes(x=log_MDA231,y=neglog_MDA231,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="darkgreen")

volcano_plot_MDA231

## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_MDA468<-bc_data %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="purple")

volcano_plot_MDA468

## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_SKBR3<-bc_data %>% ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="red")

volcano_plot_SKBR3

PLOTTING A VOLCANO MCF7


## BETTER VOLCANO PLOT for MCF7

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
bc_data<-bc_data %>% mutate(significance=ifelse((log_MCF7>3 & neglog_MCF7>3.99),"UP", ifelse((log_MCF7<c(-3) & neglog_MCF7>3.99),"DOWN","NOT SIG")))
# adjusted significance values from 2 & 2.99 to 3 and 3.99 - too many showing for this cell line 

## Some standard colors
plain_cols3<-c("blue","gray", "red")

## volcano plot as before with some added things
better_volcano_plot_MCF7<-bc_data %>% ggplot(aes(x=log_MCF7,y=neglog_MCF7,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of MCF7 compared to control, MCF10A",y="-log(p-value)")
  

better_volcano_plot_MCF7

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot_MCF7)
## Significant up regulated MCF7 are: C17orf28, SYT7, CORO1A and AGR3 ## Significant down regulated MCF7 are: CTSZ and APOA1

PLOTTING A VOLCANO MDA231


## BETTER VOLCANO PLOT for MDA231

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
bc_data<-bc_data %>% mutate(significance=ifelse((log_MDA231>2 & neglog_MDA231>2.75),"UP", ifelse((log_MDA231<c(-3) & neglog_MDA231>2.99),"DOWN","NOT SIG")))
# adjusted significance values  - too many down regulated for this cell line 

## Some standard colors
plain_cols3<-c("darkgreen","gray","orange")

## volcano plot as before with some added things
better_volcano_plot_MDA231<-bc_data %>% ggplot(aes(x=log_MDA231,y=neglog_MDA231,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of MDA231 compared to control, MCF10A",y="-log(p-value)")
  

better_volcano_plot_MDA231

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot_MDA231)
## Significant up regulated MDA231 are: ACSS3, PAFAH1B3,and SMEK2 ## Significant down regulated MDA231 are: APOA1, HLA-A, HIST1H1A, and PTGFRN

PLOTTING A VOLCANO MDA468


## BETTER VOLCANO PLOT for MDA468

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
bc_data<-bc_data %>% mutate(significance=ifelse((log_MDA468>3.6 & neglog_MDA468>3.6),"UP", ifelse((log_MDA468<c(-3) & neglog_MDA468>3.5),"DOWN","NOT SIG")))
# adjusted significance values  - too many down regulated for this cell line 

## Some standard colors
plain_cols3<-c("purple","gray","brown")

## volcano plot as before with some added things
better_volcano_plot_MDA468<-bc_data %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of MDA468 compared to control, MCF10A",y="-log(p-value)")
  

better_volcano_plot_MDA468

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot_MDA468)
## Significant up regulated MDA468 are: SYT7, ALDH1A3, AKR1C2, and NLRP2 ## Significant down regulated MDA468 are: HLA-A, and PLAU

PLOTTING A VOLCANO SKBR3


## BETTER VOLCANO PLOT for SKBR3

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
bc_data<-bc_data %>% mutate(significance=ifelse((log_SKBR3>2.7 & neglog_SKBR3>2.7),"UP", ifelse((log_SKBR3<c(-2.5) & neglog_SKBR3>2.5),"DOWN","NOT SIG")))
# adjusted significance values  - too many down regulated for this cell line 

## Some standard colors
plain_cols3<-c("red","gray","cyan")

## volcano plot as before with some added things
better_volcano_plot_SKBR3<-bc_data %>% ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of MDA468 compared to control, MCF10A",y="-log(p-value)")
  

better_volcano_plot_SKBR3

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot_SKBR3)
## Significant up regulated SKBR3are: HOOK1, ASS1, EFHD1, and ATP2A3 ## Significant down regulated SKBR3 are: HLA-A, APOA1, MYO1B, and TMX4

Interpreting VOLCANO PLOTS

# Significant up regulated MCF7 are: C17orf28, SYT7, CORO1A and AGR3 # Significant up regulated MDA231 are: ACSS3, PAFAH1B3,and SMEK2 # Significant up regulated MDA468 are: SYT7, ALDH1A3, AKR1C2, and NLRP2 # Significant up regulated SKBR3are: HOOK1, ASS1, EFHD1, and ATP2A3

# Significant down regulated MCF7 are: CTSZ and APOA1 # Significant down regulated MDA231 are: APOA1, HLA-A, HIST1H1A, and PTGFRN # Significant down regulated MDA468 are: HLA-A, and PLAU # Significant down regulated SKBR3 are: HLA-A, APOA1, MYO1B, and TMX4 #therefore, HLA-A and /or APOA1 are highly significant down regulated in all cell lines

Barplots of significant points of interest

EXAMPLES OF A COUPLE PROTEINS or GENES

# Make a pivot longer table of the  data
bc_long<-pivot_longer(bc_data, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))%>%select(-Description, -Peptides)

head(bc_long)
## # A tibble: 6 × 3
##   Gene_Symbol variable value
##   <chr>       <chr>    <dbl>
## 1 NES         MCF10A_1  9.54
## 2 NES         MCF10A_2  4.58
## 3 NES         MCF7_1    5.07
## 4 NES         MCF7_2    5.42
## 5 NES         MDA231_1 25.4 
## 6 NES         MDA231_2 27.4


## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-bc_long %>% filter(Gene_Symbol=="HLA-A" | Gene_Symbol=="APOA1" | Gene_Symbol=="CTSZ" | Gene_Symbol=="HIST1H1A" | Gene_Symbol=="PTGFRN" | Gene_Symbol=="PLAU"| Gene_Symbol=="MYO1B" | Gene_Symbol=="TMX4") 
## Significant down regulated MCF7 are: CTSZ and APOA1
## Significant down regulated MDA231 are: APOA1, HLA-A, HIST1H1A, and PTGFRN
## Significant down regulated MDA468 are:  HLA-A,  and PLAU
## Significant down regulated SKBR3 are:  HLA-A, APOA1, MYO1B,  and TMX4

  
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)

Example_plot_down<-Examples_Down %>% 
  ggplot(aes(x=variable,y=value))+ 
  geom_bar(stat="identity",fill="blue")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")
  
Example_plot_down

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

## Same process for the upregulated ones
Examples_Up<-bc_long %>% filter(Gene_Symbol=="C17orf28" | Gene_Symbol=="SYT7" | Gene_Symbol=="CORO1A" | Gene_Symbol=="AGR3" | Gene_Symbol=="ACSS3" | Gene_Symbol=="PAFAH1B3"| Gene_Symbol=="SMEK2" | Gene_Symbol=="SYT7" | Gene_Symbol=="ALDH1A3"| Gene_Symbol=="AKR1C2"| Gene_Symbol=="NLRP2"| Gene_Symbol=="HOOK1"| Gene_Symbol=="ASS1"| Gene_Symbol=="EFHD1"| Gene_Symbol=="ATP2A3") 

## Significant up regulated MCF7 are: C17orf28, SYT7, CORO1A and AGR3
## Significant up regulated MDA231 are: ACSS3, PAFAH1B3,and SMEK2
## Significant up regulated MDA468 are: SYT7, ALDH1A3, AKR1C2, and NLRP2
## Significant up regulated SKBR3are: HOOK1, ASS1, EFHD1, and ATP2A3


Example_plot_up<-Examples_Up %>% 
  ggplot(aes(x=variable,y=value))+ 
  geom_bar(stat="identity",fill="red")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")
  
Example_plot_up

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

#Upregulated genes: 

upregulated<-bc_data%>% filter (Gene_Symbol=="C17orf28" | Gene_Symbol=="SYT7" | Gene_Symbol=="CORO1A" | Gene_Symbol=="AGR3" | Gene_Symbol=="ACSS3" | Gene_Symbol=="PAFAH1B3"| Gene_Symbol=="SMEK2" | Gene_Symbol=="SYT7" | Gene_Symbol=="ALDH1A3"| Gene_Symbol=="AKR1C2"| Gene_Symbol=="NLRP2"| Gene_Symbol=="HOOK1"| Gene_Symbol=="ASS1"| Gene_Symbol=="EFHD1"| Gene_Symbol=="ATP2A3") %>% select(Gene_Symbol, Description)
  
view(upregulated)
# and screen shot that table
head(upregulated)
## # A tibble: 6 × 2
##   Gene_Symbol Description                                                       
##   <chr>       <chr>                                                             
## 1 AGR3        Anterior gradient protein 3 homolog OS=Homo sapiens GN=AGR3 PE=1 …
## 2 AKR1C2      Aldo-keto reductase family 1 member C2 OS=Homo sapiens GN=AKR1C2 …
## 3 NLRP2       Isoform 2 of NACHT, LRR and PYD domains-containing protein 2 OS=H…
## 4 SYT7        Synaptotagmin-7 OS=Homo sapiens GN=SYT7 PE=1 SV=3                 
## 5 ALDH1A3     Aldehyde dehydrogenase family 1 member A3 OS=Homo sapiens GN=ALDH…
## 6 ACSS3       Isoform 2 of Acyl-CoA synthetase short-chain family member 3, mit…
#Downregulated genes: 

downregulated<-bc_data%>% filter (Gene_Symbol=="HLA-A" | Gene_Symbol=="APOA1" | Gene_Symbol=="CTSZ" | Gene_Symbol=="HIST1H1A" | Gene_Symbol=="PTGFRN" | Gene_Symbol=="PLAU"| Gene_Symbol=="MYO1B" | Gene_Symbol=="TMX4") %>% select(Gene_Symbol, Description)
  
view(downregulated)
# and screen shot that table

head(downregulated)
## # A tibble: 6 × 2
##   Gene_Symbol Description                                                       
##   <chr>       <chr>                                                             
## 1 HLA-A       HLA class I histocompatibility antigen, A-23 alpha chain OS=Homo …
## 2 HLA-A       HLA class I histocompatibility antigen, A-30 alpha chain OS=Homo …
## 3 HLA-A       HLA class I histocompatibility antigen, A-2 alpha chain OS=Homo s…
## 4 CTSZ        Cathepsin Z OS=Homo sapiens GN=CTSZ PE=1 SV=1                     
## 5 PLAU        Isoform 2 of Urokinase-type plasminogen activator OS=Homo sapiens…
## 6 HIST1H1A    Histone H1.1 OS=Homo sapiens GN=HIST1H1A PE=1 SV=3

##now you can knit this and publish to save and share your code. be sure to include the links to your knitted doc for each of your group mates in your lab 6 ELN. #remeber: ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!