####Run through this example and try to understand what is going on with the data

####So, lets load RMarkdown

library(rmarkdown)

set messages to FALSE on everything (prevents certain boring things from being shown in the results)

knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)

PACKAGES

library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)

COLORS

mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)

LOAD DATA AND MAKE AN OVERALL HEATMAP

## load the dataset 
breast_cancer_data<-read_csv(file="breast_cancer_cells.csv")

#and then view it 
view(breast_cancer_data)

## now let's visualize the dataset and look for initial trends. We can do this by making a matrix so and then a heatmap to visualize the data 

BC_data<-breast_cancer_data %>% 
  mutate(log_MCF10A=log2((MCF10A_1)/(MCF10A_2)),
         log_MCF7=log2((MCF7_1)/(MCF7_2)),
         log_MDA231=log2((MDA231_1)/(MDA231_2)),
         log_MDA468=log2((MDA468_1)/(MDA468_2)),
         log_SKBR3=log2((SKBR3_1)/(SKBR3_2)))
view(BC_data)
#make a matrix of just raw the data
BC_data_mat1<-BC_data %>% 
  select(log_MCF10A:log_SKBR3) %>% 
  as.matrix() %>% 
  round(.,2)

# what does that look like?
head(BC_data_mat1)
##      log_MCF10A log_MCF7 log_MDA231 log_MDA468 log_SKBR3
## [1,]       1.06    -0.10      -0.11       0.23      0.58
## [2,]       0.27    -0.03      -0.07      -0.36     -0.20
## [3,]       0.30     0.09       0.30       0.17     -0.05
## [4,]       0.50    -0.15       0.40      -0.21      0.32
## [5,]       0.89     0.30       0.42       0.86      0.74
## [6,]      -0.34    -0.06      -0.32      -0.68     -0.18
## make a heatmap from the data

pheatmap(BC_data_mat1, color=pats_cols,cellwidth = 30,cellheight = .03, cluster_cols = FALSE,cluster_rows = FALSE,legend = TRUE,fontsize = 7,scale = "column")


##INTERPRETATION##
## What can you see in this figure?
#The expression levels of different breast cancer cell lines are displayed differently in this heat map.  With purple denoting greater values and yellow denoting lower values, the color gradient shows changes.
##are the repeated measures/reps similar or different?
#There seems to be some variance in the repeated measures, with some regions showing greater signals than others.  Within cell lines, there is considerable uniformity, nevertheless.
##What does this say about the precision and accuracy of them?
#The variation suggests some degree of measurement variability. 

##How does the control compare to the variables?
#Since MCF10A is a non-tumorigenic breast epithelial cell line that is frequently used as a baseline in studies on breast cancer, it is most likely the control.  MCF10A has varying expression levels in comparison to the carcinoma cell lines (MCF7, MDA-MB-231, MDA-MB-468, and SKBR3), indicating a substantial difference between normal and malignant cells.
##Is this what you might expect? Why? What would you look for in the literature to support this idea?  
#Yes, since compared to normal cells, cancer cell lines usually display different gene expression profiles.  I would search for research that compares the expression of genes in malignant and normal breast cells, paying special attention to the various genes that are expressed and the pathways that contribute to the development of cancer.
#looks like MCF10A line  is different than the other cells lines.  We will take a closer look at that in the next section, once we've done some further data manipulations

Data Manipulation


## first, make new columns that combine the two reps for each variable by averaging them  
BC_data<-BC_data %>% 
  mutate(ave_MCF10A=rowMeans(select(.,MCF10A_1, MCF10A_2)),ave_MCF7=rowMeans(select(.,MCF7_1,MCF7_2)),ave_MDA231=rowMeans(select(.,MDA231_1,MDA231_2)),ave_MDA468=rowMeans(select(.,MDA468_1,MDA468_2)),ave_SKBR3=rowMeans(select(.,SKBR3_1,SKBR3_2)))

 

#did it create your new columns?
#yes

## then make some new columns that store the log ratios of the variable means you just created, as compared to the control

BC_data <- BC_data %>%
  mutate(log2_MCF7 = log2(ave_MCF7 / ave_MCF10A),log2_MDA231 = log2(ave_MDA231 / ave_MCF10A),log2_MDA468 = log2(ave_MDA468 / ave_MCF10A),log2_SKBR3 = log2(ave_SKBR3 / ave_MCF10A))
 

#did it create your new columns?
#yes
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's
BC_data_mat2 <- BC_data %>%
  select(starts_with("log2_")) %>%
  as.matrix() %>%
  round(., 2)
pheatmap(BC_data_mat2, color=pats_cols, cellwidth=30, cellheight=0.03,
         cluster_cols=FALSE, cluster_rows=FALSE, legend=TRUE,
         fontsize=7, scale="column")

Diving deeping with VOLCANO PLOTS

##BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest (in this case, MDA231)

BC_data <- BC_data %>%
  mutate(neg_log10_pval = -log10(pvalue_MDA231_vs_MCF10A))

## Use ggplot to plot the log ratio of MDA231 against the -log p-value MDA231_vs_MCF10A
volcano_plot <- BC_data %>%
  ggplot(aes(x = log2_MDA231, y = neg_log10_pval)) +
  geom_point(alpha = 0.7, color = "royalblue")

#to view it, type: 
volcano_plot


  
## BETTER VOLCANO PLOT

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
BC_data <- BC_data %>%
  mutate(significance = ifelse(log2_MDA231 > 1.8 & neg_log10_pval > 3.0, "UP",
                        ifelse(log2_MDA231 < -2.0 & neg_log10_pval > 3.0, "DOWN", "NOT SIG")))

## Some standard colors
volcano_colors <- c("UP" = "red", "DOWN" = "blue", "NOT SIG" = "green")


## volcano plot as before with some added things
better_volcano_plot <- BC_data %>%
  ggplot(aes(x = log2_MDA231, y = neg_log10_pval, color = significance,text = Gene_Symbol)) +
  geom_point(alpha = 0.7) +  
  scale_color_manual(values = volcano_colors) +
  xlim(-6, 6) +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 14),text = element_text(size = 14)) +
  labs(x = "Log2 Fold Change (MDA231 vs MCF10A)", 
       y = "-log10(p-value)",
       title = "Improved Volcano Plot for Breast Cancer Data")




  
#to view it, type
better_volcano_plot


#check viewer and / or plots to see it 
## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot)

##INTERPRETATION##
## Significant up regulated are: ACSS3, PNPLA2, and PMM1
## Significant down regulated are: ADCK3, SPNS1, and HIST1H1A
#Why? How?
# Log2 Fold Change (LFC) > 2.0 or < -2.0 = Strong expression changes
# -log10(p-value) > 3.0 → Statistically significant

Barplots of significant points of interest

EXAMPLES OF A COUPLE PROTEINS or GENES

# Make a pivot longer table of the Breast cancer data for MCF10A_1:SKBR3_2
BC_long<-pivot_longer(BC_data, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))
head(BC_long)
## # A tibble: 6 × 5
##   Gene_Symbol Description                             Peptides variable value
##   <chr>       <chr>                                      <dbl> <chr>    <dbl>
## 1 NES         Nestin OS=Homo sapiens GN=NES PE=1 SV=2        7 MCF10A_1  9.54
## 2 NES         Nestin OS=Homo sapiens GN=NES PE=1 SV=2        7 MCF10A_2  4.58
## 3 NES         Nestin OS=Homo sapiens GN=NES PE=1 SV=2        7 MCF7_1    5.07
## 4 NES         Nestin OS=Homo sapiens GN=NES PE=1 SV=2        7 MCF7_2    5.42
## 5 NES         Nestin OS=Homo sapiens GN=NES PE=1 SV=2        7 MDA231_1 25.4 
## 6 NES         Nestin OS=Homo sapiens GN=NES PE=1 SV=2        7 MDA231_2 27.4

## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer

Examples_Down<-BC_long %>% filter(Gene_Symbol=="ADCK3" | Gene_Symbol=="SPNS1" | Gene_Symbol=="HIST1H1A")

head(Examples_Down)
## # A tibble: 6 × 5
##   Gene_Symbol Description                                Peptides variable value
##   <chr>       <chr>                                         <dbl> <chr>    <dbl>
## 1 SPNS1       Isoform 2 of Protein spinster homolog 1 O…        1 MCF10A_1 14.2 
## 2 SPNS1       Isoform 2 of Protein spinster homolog 1 O…        1 MCF10A_2 14.0 
## 3 SPNS1       Isoform 2 of Protein spinster homolog 1 O…        1 MCF7_1   10.8 
## 4 SPNS1       Isoform 2 of Protein spinster homolog 1 O…        1 MCF7_2   13.6 
## 5 SPNS1       Isoform 2 of Protein spinster homolog 1 O…        1 MDA231_1  3.03
## 6 SPNS1       Isoform 2 of Protein spinster homolog 1 O…        1 MDA231_2  3.13

  
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)

Example_plot_down <- Examples_Down %>% 
  ggplot(aes(x=factor(variable, levels=c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDA468_1','MDA468_2','SKBR3_1','SKBR3_2')), 
             y=value)) + 
  geom_bar(stat="identity", fill="red") +
  facet_wrap(~Gene_Symbol) +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size=10)) +
  theme(text = element_text(size=14)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  labs(x="Sample", y="Relative Intensity")



#check viewer and / or plots to see it 
Example_plot_down

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? #Three downregulated genes (ADCK3, HIST1H1A, and SPNS1) are shown in the picture along with their relative intensity values for several breast cancer cell lines and the control (MCF10A). Overall, the repeated measurements seem consistent, indicating high precision. Nonetheless, there are small variances, which could point to a little amount of experimental variability. ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea? #As would be expected given that these genes are downregulated in breast cancer, the control (MCF10A) cell line typically shows greater relative intensity levels than the cancer cell lines. I would search for research on the expression levels of these genes in normal versus malignant breast tissues, as well as their function in tumor suppression or cancer advancement.

## Same process for the upregulated ones

Examples_Up<-BC_long %>% filter(Gene_Symbol=="ACSS3" | Gene_Symbol=="PNPLA2" | Gene_Symbol=="PMM1")

head(Examples_Up)
## # A tibble: 6 × 5
##   Gene_Symbol Description                                Peptides variable value
##   <chr>       <chr>                                         <dbl> <chr>    <dbl>
## 1 PMM1        Phosphomannomutase 1 OS=Homo sapiens GN=P…        1 MCF10A_1  3.34
## 2 PMM1        Phosphomannomutase 1 OS=Homo sapiens GN=P…        1 MCF10A_2  3.53
## 3 PMM1        Phosphomannomutase 1 OS=Homo sapiens GN=P…        1 MCF7_1   13.0 
## 4 PMM1        Phosphomannomutase 1 OS=Homo sapiens GN=P…        1 MCF7_2    9.66
## 5 PMM1        Phosphomannomutase 1 OS=Homo sapiens GN=P…        1 MDA231_1 12.5 
## 6 PMM1        Phosphomannomutase 1 OS=Homo sapiens GN=P…        1 MDA231_2 12.0


Example_plot_up<-Examples_Up %>% ggplot(aes(x=factor(variable,levels=c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDA468_1','MDA468_2','SKBR3_1','SKBR3_2')),y=value))+ 
  geom_bar(stat="identity",fill="royalblue")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")


#check viewer and / or plots to see it 
Example_plot_up

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? #This figure shows relative intensity values for three upregulated genes (ACSS3, PMM1, and PNPLA2) across breast cancer cell lines and the control (MCF10A). repeated measures are consitent but there is some variability. precision and accuracy has a good consistency ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#comparing to the control cancer cell lines show higher levels showing that these genes upregulated breast cancer. literature disscussing about hallmarks of cancer, proliferation,cancer metabolism can be supportive #interpretation HINT:insert a chunk and create two seprate lines of code that filter for your specific upregulated genes/proteins of interest and selects for only their gene symbols and descriptions. Do this for the downregulated as well. This will generate two list of the descriptors for each gene of interest, helping you understand your figures. Be sure to view it, not just ask for the head of the table generated.

Upregulated_Genes <- BC_data %>%
  filter(Gene_Symbol %in% c("ACSS3","PMM1","PNPLA2")) %>%  
  select(Gene_Symbol, Description)

view(Upregulated_Genes)

Downregulated_Genes <- BC_data %>%
  filter(Gene_Symbol %in% c("ADCK3", "SPNS1", "HIST1H1A")) %>%  
  select(Gene_Symbol, Description)

view(Downregulated_Genes)

WRAP UP

##now you can knit this and publish to save and share your code. Use this to work with either the brain or breast cells and the Part_C_template to complete your lab 6 ELN. ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!