####Run through this example and try to understand what is going on with the data

####So, lets load RMarkdown

library(rmarkdown)

set messages to FALSE on everything (prevents certain boring things from being shown in the results)

knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)

PACKAGES

library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)

COLORS

mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)

LOAD DATA AND MAKE AN OVERALL HEATMAP

## load the dataset 
C19_data<-read_csv(file="breast_cancer_cells.csv")


#and then view it 
view(C19_data)

now let’s visualize the dataset and look for initial trends. We can do this by making a matrix so and then a heatmap to visualize the data

#make a matrix of just raw the data

C19_mat1<-C19_data %>% select(MCF10A_1:SKBR3_2) %>% as.matrix() %>% round(.,2)

# what does that look like?
head(C19_mat1)
##      MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,]     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88
## [2,]    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75
## [3,]    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46
## [4,]     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92
## [5,]     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50
## [6,]    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18
##      SKBR3_1 SKBR3_2
## [1,]    8.46    5.64
## [2,]   11.91   13.67
## [3,]    9.10    9.43
## [4,]    8.54    6.82
## [5,]    7.93    4.75
## [6,]   13.27   15.05


# what does that look like?

#values don't appear to have any identifiable trends just by looking at the matrix, will check the heatmap to get a better understanding, but right now it's just a lot of numbers in a table#
## make a heatmap from the data

pheatmap(C19_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=FALSE,legend=TRUE,fontsize = 7,scale="column")


##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? 
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?  

##In this figure, each of the two runs look similar to their counterpart. This indicates that the results were fairly precise and accurate, as a deviation would show an issue in one of the trials 
## MCF10A_1 and MCF10A_2 have a dark line in a similar spot to SKBR3_1 and SKBR3_2, which is interesting
##MCF10A_2 has a significantly darker line at ~4 than its counterpart MCF10A_1. 
##This dataset takes 5 cell lines, duplicates them and compares them. The control is MCF10A#
##

#looks like SKBR3 line  is different than the other cells lines.  We will take a closer look at that in the next section, once we've done some further data manipulations

Data Manipulation


## first, make new columns that combine the two reps for each variable by averaging them  


C19_data2<-C19_data %>% mutate(
  mean_MCF10A= ((MCF10A_1 + SKBR3_2)/2),
  mean_MCF7= ((MCF10A_1 + SKBR3_2)/2), 
  mean_MDA231= ((MCF10A_1 + SKBR3_2)/2), 
  mean_MDA468= ((MCF10A_1 + SKBR3_2)/2),
  mean_SKBR3= ((MCF10A_1 + SKBR3_2)/2)) 
 
view(C19_data2)


#did it create your new columns?
##Yes, new columns are there##

## then make some new columns that store the log ratios of the variable means you just created, as compared to the control

C19_data<-C19_data %>% mutate(
  log_MCF7=log2((MCF7_1+MCF7_2)/(MCF10A_1+MCF10A_2)), 
  log_MDA231=log2((MDA231_1+MDA231_2)/(MCF10A_1+MCF10A_2)), 
  log_MDA468=log2((MDA468_1+MDA468_2)/(MCF10A_1+MCF10A_2)),
  log_SKBR3=log2((SKBR3_1+SKBR3_2)/(MCF10A_1+MCF10A_2))) 
 
view(C19_data)
 

#did it create your new columns?
#Yes, new log columns are there!
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's

C19_mat2<-C19_data %>% select(log_MCF7:log_SKBR3) %>% as.matrix() %>% round(.,2)

pheatmap(C19_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=FALSE,legend=TRUE,fontsize = 7,scale="column")


##The second heatmap is much easier to read than the first, as it better shows the comparison between the 4 cell lines that are not in the control. Many of the lines line up, however in the SKBR3 line, the top has significantly less lines and does not match the others as well, which indicates that this is the line we should take a closer look at later on##

Diving deeping with VOLCANO PLOTS

##BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest (in this case, _____)

C19_data<-C19_data %>% mutate(neglog_SKBR3=-log10(pvalue_SKBR3_vs_MCF10A))


## Use ggplot to plot the log ratio of ____ against the -log p-value ____
volcano_plot<-C19_data %>% 
  ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="hotpink")

#to view it, type: 
volcano_plot


  
## BETTER VOLCANO PLOT

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
C19_data<-C19_data %>% mutate(significance=ifelse((log_SKBR3>2.2 & neglog_SKBR3>3.1),"UP", ifelse((log_SKBR3<c(-2) & neglog_SKBR3>3.2),"DOWN","NOT SIG")))

## Some standard colors

plain_cols3<-c("purple","gray","green")
## volcano plot as before with some added things

better_volcano_plot<-C19_data %>% ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio at SKBR3 cells compared to control",y="-log(p-value)")


  
#to view it, type
better_volcano_plot



#check viewer and / or plots to see it 
## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot)


##INTERPRETATION##
## Significant up regulated are: APOA1 and HLA-A
## Significant down regulated are: FNBP1L, DENND4C, and KRT23
#Why? How?

## Are the outliers in the code, within the parameters we set out#
#Had to change the number ranges slightly to remove some as there were too many at first ##

Barplots of significant points of interest

EXAMPLES OF A COUPLE PROTEINS or GENES

# Make a pivot longer table of the C19 data for MCF10A_1:SKBR3_2

C19_long<-pivot_longer(C19_data, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))%>%select(-Description)

head(C19_long)
## # A tibble: 6 × 4
##   Gene_Symbol Peptides variable value
##   <chr>          <dbl> <chr>    <dbl>
## 1 NES                7 MCF10A_1  9.54
## 2 NES                7 MCF10A_2  4.58
## 3 NES                7 MCF7_1    5.07
## 4 NES                7 MCF7_2    5.42
## 5 NES                7 MDA231_1 25.4 
## 6 NES                7 MDA231_2 27.4

## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer


Examples_Up<-C19_long %>% filter(Gene_Symbol=="FNBP1L" | Gene_Symbol=="DENND4C" | Gene_Symbol=="KRT23") 

head(Examples_Up)
## # A tibble: 6 × 4
##   Gene_Symbol Peptides variable value
##   <chr>          <dbl> <chr>    <dbl>
## 1 DENND4C            1 MCF10A_1  3.78
## 2 DENND4C            1 MCF10A_2  4.08
## 3 DENND4C            1 MCF7_1   11.5 
## 4 DENND4C            1 MCF7_2   12.5 
## 5 DENND4C            1 MDA231_1 10.8 
## 6 DENND4C            1 MDA231_2  7.24
  
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)

Plot_Up<-Examples_Up %>% 
  ggplot(aes(x=variable,y=value))+ 
  geom_bar(stat="identity",fill="purple")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")
  
Plot_Up

##Note how KRT23 is significantly higher than the other 2 results. This is most likely due to the fact that it is included due to contamination as it is skin

Examples_Down<-C19_long %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A") 

head(Examples_Down)
## # A tibble: 6 × 4
##   Gene_Symbol Peptides variable value
##   <chr>          <dbl> <chr>    <dbl>
## 1 HLA-A              3 MCF10A_1  5.42
## 2 HLA-A              3 MCF10A_2  6.62
## 3 HLA-A              3 MCF7_1    2.22
## 4 HLA-A              3 MCF7_2    2.69
## 5 HLA-A              3 MDA231_1  4.56
## 6 HLA-A              3 MDA231_2  4.23

Plot_down<-Examples_Down %>% 
  ggplot(aes(x=variable,y=value))+ 
  geom_bar(stat="identity",fill="green")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")
  
Plot_down

  


#check viewer and / or plots to see it 

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

##Up regulated genes follow the expected trajectories, increasing in intensity as they go. KRT23, the keratin gene, looks a bit different, but that is most likely due to the fact that it is most likely not related to cancer, and is just there due to contamination##

##Down regulated genes do not look exacly as expected. Although they do decrease overall, APOA1 does not decrease gradually, instead going from an intensity ~45 to ~5. The HLA-A readout shows that there was a down regulation, then an increase, then another upregulation. Both of these could mean that these data points are outliers or that there is a biological reason that the gene is behaving in this way##

## Same process for the upregulated ones


## Accidentally all in the chunck above, oopsies##







  


#check viewer and / or plots to see it 

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

##Up regulated genes follow the expected trajectories, increasing in intensity as they go. KRT23, the keratin gene, looks a bit different, but that is most likely due to the fact that it is most likely not related to cancer, and is just there due to contamination##

#interpretation HINT:insert a chunk and create two seprate lines of code that filter for your specific upregulated genes/proteins of interest and selects for only their gene symbols and descriptions. Do this for the downregulated as well. This will generate two list of the descriptors for each gene of interest, helping you understand your figures. Be sure to view it, not just ask for the head of the table generated.

Examples_Up<-C19_data %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A")%>% select(Gene_Symbol|Description)

view(Examples_Up)

Examples_Down<-C19_data %>% filter(Gene_Symbol=="FNBP1L" | Gene_Symbol=="DENND4C"|Gene_Symbol=="KRT23")%>% select(Gene_Symbol|Description)

view(Examples_Down)

WRAP UP

##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!