####Run through this example and try to understand what is going on with the data

####So, lets load RMarkdown

library(rmarkdown)

set messages to FALSE on everything (prevents certain boring things from being shown in the results)

knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)

PACKAGES

library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)

COLORS

mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)

LOAD DATA AND MAKE AN OVERALL HEATMAP

## load the dataset 
breastcancer_data<-read_csv(file="breast_cancer_cells.csv")


#and then view it 

now let’s visualize the dataset and look for initial trends. We can do this by making a matrix so and then a heatmap to visualize the data

#make a matrix of just raw the data


breastcancer_mat1<-breastcancer_data %>% select(MCF10A_1:SKBR3_2) %>% as.matrix() %>% round(.,2)

head(breastcancer_mat1)
##      MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,]     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88
## [2,]    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75
## [3,]    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46
## [4,]     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92
## [5,]     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50
## [6,]    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18
##      SKBR3_1 SKBR3_2
## [1,]    8.46    5.64
## [2,]   11.91   13.67
## [3,]    9.10    9.43
## [4,]    8.54    6.82
## [5,]    7.93    4.75
## [6,]   13.27   15.05

# what does that look like?

## how do I add data to make matrix? 
## make a heatmap from the data

pheatmap(breastcancer_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=FALSE,legend=TRUE,fontsize = 7,scale="column")


##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? 
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?  

#looks like _______ line  is different than the other cells lines.  We will take a closer look at that in the next section, once we've done some further data manipulations

Data Manipulation


## first, make new columns that combine the two reps for each variable by averaging them  


breastcancer_data2<-breastcancer_data %>% mutate(
  mean_control= ((MCF10A_1 + MCF10A_2)/2),
  mean_MCF7= ((MCF7_1 + MCF7_2)/2), 
  mean_MDA231= ((MDA231_1 + MDA231_2)/2), 
  mean_MDA468= ((MDA468_1 + MDA468_2)/2),
  mean_SKBR3= ((SKBR3_1 + SKBR3_2)/2)) 
 
view(breastcancer_data2)

#did it create your new columns?

## then make some new columns that store the log ratios of the variable means you just created, as compared to the control

 
breastcancer_data<-breastcancer_data %>% mutate(
  log_MCF7=log2((MCF7_1 + MCF7_2)/(MCF10A_1 + MCF10A_2)), 
  log_MDA231=log2((MDA231_1 + MDA231_2)/(MCF10A_1 + MCF10A_2)), 
  log_MDA468=log2((MDA468_1 + MDA468_2)/(MCF10A_1 + MCF10A_2)),
  log_SKBR3=log2((SKBR3_1 + SKBR3_2)/(MCF10A_1 + MCF10A_2))) 
#did it create your new columns?
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's

breastcancer_mat2<-breastcancer_data2 %>% select(MCF10A_1:SKBR3_2) %>% as.matrix() %>% round(.,2)

pheatmap(breastcancer_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

Diving deeping with VOLCANO PLOTS

##BASIC VOLCANO PLOT
breastcancer_data<-breastcancer_data %>% mutate(neglog_MCF7=-log10(pvalue_MCF7_vs_MCF10A))

volcano_plot<-breastcancer_data %>% ggplot(aes(x=log_MCF7,y=neglog_MCF7,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="green")

volcano_plot

#Do not include into eln, need interactive plot.

## Add a column that stores the negative log of the pvalue of interest (in this case, _____)


## Use ggplot to plot the log ratio of ____ against the -log p-value ____


#to view it, type: 

  
## BETTER VOLCANO PLOT

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored

breastcancer_data<-breastcancer_data %>% mutate(significance=ifelse((log_MCF7>3 & neglog_MCF7>4),"UP", ifelse((log_MCF7<c(-2) & neglog_MCF7>4),"DOWN","NOT SIG")))

## Some standard colors

plain_cols3<-c("red","gray","blue")
## volcano plot as before with some added things
better_volcano_plot<-breastcancer_data %>% ggplot(aes(x=log_MCF7,y=neglog_MCF7,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio at MCF7 compared to MCF10A",y="-log(p-value)")
  

better_volcano_plot

  
#to view it, type


#check viewer and / or plots to see it 
## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot)
##INTERPRETATION##
## Significant up regulated are: log_MCF7>3 & neglog_MCF7>4 
## Significant down regulated are: log_MCF7<c(-2) & neglog_MCF7>4
#Why? How? I had to many samples so I changed the parameters to be under 5 for each one.  

Barplots of significant points of interest

EXAMPLES OF A COUPLE PROTEINS or GENES

# Make a pivot longer table of the C19 data for Control_2h_1:Virus_24h_2

breastcancer_long<-pivot_longer(breastcancer_data, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))%>%select(-Description)

head(breastcancer_long)
## # A tibble: 6 × 4
##   Gene_Symbol Peptides variable value
##   <chr>          <dbl> <chr>    <dbl>
## 1 NES                7 MCF10A_1  9.54
## 2 NES                7 MCF10A_2  4.58
## 3 NES                7 MCF7_1    5.07
## 4 NES                7 MCF7_2    5.42
## 5 NES                7 MDA231_1 25.4 
## 6 NES                7 MDA231_2 27.4

## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer



Examples_Down<-breastcancer_long %>% filter(Gene_Symbol=="ADCK3" | Gene_Symbol=="CTSZ" | Gene_Symbol=="APOA1") 
## those need to be changes.
head(Examples_Down)
## # A tibble: 6 × 4
##   Gene_Symbol Peptides variable value
##   <chr>          <dbl> <chr>    <dbl>
## 1 CTSZ               8 MCF10A_1 19.8 
## 2 CTSZ               8 MCF10A_2 19.8 
## 3 CTSZ               8 MCF7_1    0.83
## 4 CTSZ               8 MCF7_2    0.91
## 5 CTSZ               8 MDA231_1 11.2 
## 6 CTSZ               8 MDA231_2 15.4

Example_plot_down<-Examples_Down %>% 
  ggplot(aes(x=variable,y=value))+ 
  geom_bar(stat="identity",fill="red")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")
  
Example_plot_down

  
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)






  


#check viewer and / or plots to see it 

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

## Same process for the upregulated ones

Examples_Up<-breastcancer_long %>% filter(Gene_Symbol=="SYT7" | Gene_Symbol=="CORO1A"| Gene_Symbol=="AGR3"| Gene_Symbol=="C17orf28")

head(Examples_Up)
## # A tibble: 6 × 4
##   Gene_Symbol Peptides variable value
##   <chr>          <dbl> <chr>    <dbl>
## 1 AGR3               5 MCF10A_1  1.99
## 2 AGR3               5 MCF10A_2  2.23
## 3 AGR3               5 MCF7_1   41.7 
## 4 AGR3               5 MCF7_2   41.0 
## 5 AGR3               5 MDA231_1  1.82
## 6 AGR3               5 MDA231_2  2.57

Example_plot_up<-Examples_Up %>% ggplot(aes(x=variable,y=value))+ 
  geom_bar(stat="identity",fill="royalblue")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")
  
Example_plot_up



#check viewer and / or plots to see it 

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

##Had to re-assign parameters for volcano plotty as it had more proteins or genes then necessary for analysis
## up regulated genes do not follow as expected for the most part but Gene SYT7 did follow the expected upward trend more than the other ones. AGR3 had the most unexpected results being pretty high in the beginning, opposite of what is expected. There could be many reasons why the result is not as expected: contamination, external stimuli, genetic alterations, or if a gene is disregarding apoptosis. It is most likely that the genes with an unexpected trend are inhibiting apoptosis; this makes sense as they are cancer cells.
# down regulated genes are following the expected result more than the up regulated ones. However, there is still some deviation specifically in the CTSZ sample.

#interpretation HINT:insert a chunk and create two seprate lines of code that filter for your specific upregulated genes/proteins of interest and selects for only their gene symbols and descriptions. Do this for the downregulated as well. This will generate two list of the descriptors for each gene of interest, helping you understand your figures. Be sure to view it, not just ask for the head of the table generated.


Breastcancer_Up<-breastcancer_data %>% filter (Gene_Symbol=="SYT7" | Gene_Symbol=="CORO1A"| Gene_Symbol=="AGR3"| Gene_Symbol=="C17orf28")%>% select(Gene_Symbol|Description)

view(Breastcancer_Up)

Breastcancer_Down<-breastcancer_data %>% filter (Gene_Symbol=="ADCK3" | Gene_Symbol=="CTSZ" | Gene_Symbol=="APOA1") %>% select(Gene_Symbol|Description)

view(Breastcancer_Down)

WRAP UP

##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!