####Run through this example and try to understand what is going on with the data

####So, lets load RMarkdown

library(rmarkdown)

set messages to FALSE on everything (prevents certain boring things from being shown in the results)

knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)

PACKAGES

library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)

COLORS

mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)

LOAD DATA AND MAKE AN OVERALL HEATMAP

## load the dataset 

ci_data<-read_csv(file = "complex_I_inhibitors.csv")

head(ci_data)
## # A tibble: 6 × 18
##   Uniprot_…¹ Gene_…² Prote…³ Pepti…⁴ PBS_1 PBS_2 Phenf…⁵ Phenf…⁶ Pieri…⁷ Pieri…⁸
##   <chr>      <chr>   <chr>     <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 P50851     LRBA    LRBA_H…       9   9.3   9.4    11      11       7.3     8.2
## 2 Q9UIF8     BAZ2B   BAZ2B_…       2  13.1  13.1     9.3     9.2     8.8     8.8
## 3 P15927     RPA2    RFA2_H…      12   8.2   8.2     9.4     9.3     9.5     8.7
## 4 P13797     PLS3    PLST_H…      22   8.6   8.6    11      11.1     8.7     9.8
## 5 Q9H9B1     EHMT1   EHMT1_…       8  13.2  13.3    10.9    11       9.9     9.5
## 6 Q86WX3     RPS19B… AROS_H…       2  12.4  12.4    11.4    11.3     8.8     8.8
## # … with 8 more variables: Rotenone_1 <dbl>, Rotenone_2 <dbl>, MPP_1 <dbl>,
## #   MPP_2 <dbl>, pvalue_phenformin_vs_PBS <dbl>,
## #   pvalue_piericidin_vs_PBS <dbl>, pvalue_rotenone_vs_PBS <dbl>,
## #   pvalue_MPP_vs_PBS <dbl>, and abbreviated variable names ¹​Uniprot_Accession,
## #   ²​Gene_Symbol, ³​Protein_Description, ⁴​Peptides, ⁵​Phenformin_1,
## #   ⁶​Phenformin_2, ⁷​Piericidin_1, ⁸​Piericidin_2

colnames(ci_data)
##  [1] "Uniprot_Accession"        "Gene_Symbol"             
##  [3] "Protein_Description"      "Peptides"                
##  [5] "PBS_1"                    "PBS_2"                   
##  [7] "Phenformin_1"             "Phenformin_2"            
##  [9] "Piericidin_1"             "Piericidin_2"            
## [11] "Rotenone_1"               "Rotenone_2"              
## [13] "MPP_1"                    "MPP_2"                   
## [15] "pvalue_phenformin_vs_PBS" "pvalue_piericidin_vs_PBS"
## [17] "pvalue_rotenone_vs_PBS"   "pvalue_MPP_vs_PBS"
## make a matrix of just original raw the data


ci_mat1<-ci_data %>% select(PBS_1:MPP_2) %>% as.matrix() %>% round(.,2)

head(ci_mat1)
##      PBS_1 PBS_2 Phenformin_1 Phenformin_2 Piericidin_1 Piericidin_2 Rotenone_1
## [1,]   9.3   9.4         11.0         11.0          7.3          8.2       12.5
## [2,]  13.1  13.1          9.3          9.2          8.8          8.8        9.3
## [3,]   8.2   8.2          9.4          9.3          9.5          8.7       10.2
## [4,]   8.6   8.6         11.0         11.1          8.7          9.8       11.3
## [5,]  13.2  13.3         10.9         11.0          9.9          9.5        7.3
## [6,]  12.4  12.4         11.4         11.3          8.8          8.8        9.0
##      Rotenone_2 MPP_1 MPP_2
## [1,]       12.7   9.8   9.0
## [2,]        9.1   9.2  10.1
## [3,]        9.6  12.7  14.0
## [4,]       10.8  10.1  10.0
## [5,]        7.4   9.1   8.4
## [6,]        8.6   7.9   9.5
## make a heatmap from the data

pheatmap(ci_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?


## make some new columns that take the average of the pairs of observations for each variable  

ci_data <- ci_data %>% mutate(
  mean_PBS= ((PBS_1 + PBS_2)/2),
  mean_Phenformin= ((Phenformin_1 + Phenformin_2)/2), 
  mean_Piericidin= ((Piericidin_1 + Piericidin_2)/2), 
  mean_Rotenone= ((Rotenone_1 + Rotenone_2)/2),
  mean_MPP= ((MPP_1 + MPP_2)/2))

colnames(ci_data)
##  [1] "Uniprot_Accession"        "Gene_Symbol"             
##  [3] "Protein_Description"      "Peptides"                
##  [5] "PBS_1"                    "PBS_2"                   
##  [7] "Phenformin_1"             "Phenformin_2"            
##  [9] "Piericidin_1"             "Piericidin_2"            
## [11] "Rotenone_1"               "Rotenone_2"              
## [13] "MPP_1"                    "MPP_2"                   
## [15] "pvalue_phenformin_vs_PBS" "pvalue_piericidin_vs_PBS"
## [17] "pvalue_rotenone_vs_PBS"   "pvalue_MPP_vs_PBS"       
## [19] "mean_PBS"                 "mean_Phenformin"         
## [21] "mean_Piericidin"          "mean_Rotenone"           
## [23] "mean_MPP"

## make some new columns that store the log ratios of the means you just created  

ci_data<-ci_data %>% mutate(
  log_Phenformin=log2(mean_Phenformin/mean_PBS), 
  log_Piericidin=log2(mean_Piericidin/mean_PBS), 
  log_Rotenone=log2(mean_Rotenone/mean_PBS),
  log_MPP=log2(mean_MPP/mean_PBS)) 
 
colnames(ci_data)
##  [1] "Uniprot_Accession"        "Gene_Symbol"             
##  [3] "Protein_Description"      "Peptides"                
##  [5] "PBS_1"                    "PBS_2"                   
##  [7] "Phenformin_1"             "Phenformin_2"            
##  [9] "Piericidin_1"             "Piericidin_2"            
## [11] "Rotenone_1"               "Rotenone_2"              
## [13] "MPP_1"                    "MPP_2"                   
## [15] "pvalue_phenformin_vs_PBS" "pvalue_piericidin_vs_PBS"
## [17] "pvalue_rotenone_vs_PBS"   "pvalue_MPP_vs_PBS"       
## [19] "mean_PBS"                 "mean_Phenformin"         
## [21] "mean_Piericidin"          "mean_Rotenone"           
## [23] "mean_MPP"                 "log_Phenformin"          
## [25] "log_Piericidin"           "log_Rotenone"            
## [27] "log_MPP"
##Alternative Heatmap of log values compared to control 

ci_mat2<-ci_data %>% select(log_Phenformin:log_MPP) %>% as.matrix() %>% round(.,2)

#Need to filter out na's 
#no second heat map - data contains 0's and i couldnt get it to work

#pheatmap(ci_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

PLOTTING A VOLCANO

##Preview of data via BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest 
ci_data<-ci_data %>% mutate(
  neglog_Phenformin=-log10(pvalue_phenformin_vs_PBS), 
  neglog_Piericidin=-log10(pvalue_piericidin_vs_PBS),
  neglog_Rotenone=-log10(pvalue_rotenone_vs_PBS),
  neglog_MPP=-log10(pvalue_MPP_vs_PBS))

colnames(ci_data)
##  [1] "Uniprot_Accession"        "Gene_Symbol"             
##  [3] "Protein_Description"      "Peptides"                
##  [5] "PBS_1"                    "PBS_2"                   
##  [7] "Phenformin_1"             "Phenformin_2"            
##  [9] "Piericidin_1"             "Piericidin_2"            
## [11] "Rotenone_1"               "Rotenone_2"              
## [13] "MPP_1"                    "MPP_2"                   
## [15] "pvalue_phenformin_vs_PBS" "pvalue_piericidin_vs_PBS"
## [17] "pvalue_rotenone_vs_PBS"   "pvalue_MPP_vs_PBS"       
## [19] "mean_PBS"                 "mean_Phenformin"         
## [21] "mean_Piericidin"          "mean_Rotenone"           
## [23] "mean_MPP"                 "log_Phenformin"          
## [25] "log_Piericidin"           "log_Rotenone"            
## [27] "log_MPP"                  "neglog_Phenformin"       
## [29] "neglog_Piericidin"        "neglog_Rotenone"         
## [31] "neglog_MPP"
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_Phenformin<-ci_data %>% ggplot(aes(x=log_Phenformin,y=neglog_Phenformin,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="royalblue")

volcano_plot_Phenformin

## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_Piericidin<-ci_data %>% ggplot(aes(x=log_Piericidin,y=neglog_Piericidin,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="darkgreen")

volcano_plot_Piericidin

## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_Rotenone<-ci_data %>% ggplot(aes(x=log_Rotenone,y=neglog_Rotenone,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="purple")

volcano_plot_Rotenone

## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_MPP<-ci_data %>% ggplot(aes(x=log_MPP,y=neglog_MPP,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="red")

volcano_plot_MPP

PLOTTING A VOLCANO Phenformin


## BETTER VOLCANO PLOT for Phenformin

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
ci_data<-ci_data %>% mutate(significance=ifelse((log_Phenformin>1 & neglog_Phenformin>1.99),"UP", ifelse((log_Phenformin<c(-0.95) & neglog_Phenformin>1.5),"DOWN","NOT SIG")))
# adjusted significance values from 2 & 2.99 to 3 and 3.99 - too many showing for this cell line 

## Some standard colors
plain_cols3<-c("blue","gray","red")

## volcano plot as before with some added things
better_volcano_plot_Phenformin<-ci_data %>% ggplot(aes(x=log_Phenformin,y=neglog_Phenformin,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of Phenformin compared to control, PBS",y="-log(p-value)")
  

better_volcano_plot_Phenformin

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot_Phenformin)
## Significant up regulated Phenformin are: ZBTB8OS, RPLP2, PPP3R1 ## Significant down regulated Phenformin are: MAFF, AK4, PDK1, P4HA2

PLOTTING A VOLCANO Piericidin


## BETTER VOLCANO PLOT for Piericidin

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
ci_data<-ci_data %>% mutate(significance=ifelse((log_Piericidin>2 & neglog_Piericidin>2.5),"UP", ifelse((log_Piericidin<c(-2.5) & neglog_Piericidin>1.99),"DOWN","NOT SIG")))
# adjusted significance values  - too many down regulated for this cell line 

## Some standard colors
plain_cols3<-c("darkgreen","gray","orange")

## volcano plot as before with some added things
better_volcano_plot_Piericidin<-ci_data %>% ggplot(aes(x=log_Piericidin,y=neglog_Piericidin,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of Piericidin compared to control, PBS",y="-log(p-value)")
  

better_volcano_plot_Piericidin

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot_Piericidin)
## Significant up regulated Piericidin are:HSP90AB1, DPY30, RPS29 ## Significant down regulated Piericidin are: USP1, TTYH3, MGST3, MT-ND1

PLOTTING A VOLCANO Rotenone


## BETTER VOLCANO PLOT for Rotenone

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
ci_data<-ci_data %>% mutate(significance=ifelse((log_Rotenone>2.5 & neglog_Rotenone>2),"UP", ifelse((log_Rotenone<c(-2.5) & neglog_Rotenone>2),"DOWN","NOT SIG")))
# adjusted significance values  - too many down regulated for this cell line 

## Some standard colors
plain_cols3<-c("purple","gray","brown")

## volcano plot as before with some added things
better_volcano_plot_Rotenone<-ci_data %>% ggplot(aes(x=log_Rotenone,y=neglog_Rotenone,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of Rotenone compared to control, PBS",y="-log(p-value)")
  

better_volcano_plot_Rotenone

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot_Rotenone)
## Significant up regulated Rotenone are: FAM129A, GEM, SQSTM1, S100A6 ## Significant down regulated Rotenone are: RET, CBFA2T2

PLOTTING A VOLCANO MPP


## BETTER VOLCANO PLOT for MPP

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
ci_data<-ci_data %>% mutate(significance=ifelse((log_MPP>3 & neglog_MPP>3),"UP", ifelse((log_MPP<c(-3) & neglog_MPP>3),"DOWN","NOT SIG")))
# adjusted significance values  - too many down regulated for this cell line 

## Some standard colors
plain_cols3<-c("red","gray","cyan")

## volcano plot as before with some added things
better_volcano_plot_MPP<-ci_data %>% ggplot(aes(x=log_MPP,y=neglog_MPP,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  xlim(-6,6)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="log ratio of MPP compared to control, PBS",y="-log(p-value)")
  

better_volcano_plot_MPP

## use ggplotly to see hover over the points to see the gene names. Record these for the next step 

ggplotly(better_volcano_plot_MPP)
## Significant up regulated MPP are: PRKCE, FAM203A, VPS8 ## Significant down regulated MPP are: REXO4, SETD1B, CCDC82, SERPINI1

Interpreting VOLCANO PLOTS

# Significant up regulated MPP are: PRKCE, FAM203A, VPS8 # Significant up regulated Rotenone are: FAM129A, GEM, SQSTM1, S100A6 # Significant up regulated Piericidin are:HSP90AB1, DPY30, RPS29 # Significant up regulated Phenformin are: ZBTB8OS, RPLP2, PPP3R1

# Significant down regulated Piericidin are: USP1, TTYH3, MGST3, MT-ND1 # Significant down regulated Phenformin are: MAFF, AK4, PDK1, P4HA2 # Significant down regulated MPP are: REXO4, SETD1B, CCDC82, SERPINI1 # Significant down regulated Rotenone are: RET, CBFA2T2

Barplots of significant points of interest

EXAMPLES OF A COUPLE PROTEINS or GENES

# Make a pivot longer table of the  data
ci_long<-pivot_longer(ci_data, cols = c(PBS_1:MPP_2), names_to = 'variable')%>% select(-c(pvalue_phenformin_vs_PBS:significance))%>%select(-Protein_Description, -Peptides)

head(ci_long)
## # A tibble: 6 × 4
##   Uniprot_Accession Gene_Symbol variable     value
##   <chr>             <chr>       <chr>        <dbl>
## 1 P50851            LRBA        PBS_1          9.3
## 2 P50851            LRBA        PBS_2          9.4
## 3 P50851            LRBA        Phenformin_1  11  
## 4 P50851            LRBA        Phenformin_2  11  
## 5 P50851            LRBA        Piericidin_1   7.3
## 6 P50851            LRBA        Piericidin_2   8.2


## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-ci_long %>% filter(Gene_Symbol=="USP1" | Gene_Symbol=="TTYH3" | Gene_Symbol=="MGST3" | Gene_Symbol=="MT-ND1" | Gene_Symbol=="MAFF" | Gene_Symbol=="AK4"| Gene_Symbol=="PDK1" | Gene_Symbol=="P4HA2"| Gene_Symbol=="REXO4"| Gene_Symbol=="SETD1B"| Gene_Symbol=="CCDC82"| Gene_Symbol=="SERPINI1"| Gene_Symbol=="RET"| Gene_Symbol=="CBFA2T2") 


  
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)

Example_plot_down<-Examples_Down %>% 
  ggplot(aes(x=factor(variable,levels=c('PBS_1','PBS_2','Phenformin_1','Phenformin_2','Piericidin_1','Piericidin_2','Rotenone_1','Rotenone_2','MPP_1','MPP_2')),y=value))+ 
  geom_bar(stat="identity",fill="blue")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")
  
Example_plot_down

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

## Same process for the upregulated ones
Examples_Up<-ci_long %>% filter(Gene_Symbol=="PRKCE" | Gene_Symbol=="FAM203A" | Gene_Symbol=="VPS8" | Gene_Symbol=="FAM129A" | Gene_Symbol=="GEM" | Gene_Symbol=="SQSTM1"| Gene_Symbol=="S100A6" | Gene_Symbol=="HSP90AB1" | Gene_Symbol=="RPS29"| Gene_Symbol=="DPY30"| Gene_Symbol=="ZBTB8OS"| Gene_Symbol=="RPLP2"| Gene_Symbol=="PPP3R1") 



Example_plot_up<-Examples_Up %>% 
  ggplot(aes(x=factor(variable,levels=c('PBS_1','PBS_2','Phenformin_1','Phenformin_2','Piericidin_1','Piericidin_2','Rotenone_1','Rotenone_2','MPP_1','MPP_2')),y=value))+ 
  geom_bar(stat="identity",fill="red")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative intensity")
  
Example_plot_up

##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?

#Upregulated genes: 

upregulated<-ci_data%>% filter (Gene_Symbol=="PRKCE" | Gene_Symbol=="FAM203A" | Gene_Symbol=="VPS8" | Gene_Symbol=="FAM129A" | Gene_Symbol=="GEM" | Gene_Symbol=="SQSTM1"| Gene_Symbol=="S100A6" | Gene_Symbol=="HSP90AB1" | Gene_Symbol=="RPS29"| Gene_Symbol=="DPY30"| Gene_Symbol=="ZBTB8OS"| Gene_Symbol=="RPLP2"| Gene_Symbol=="PPP3R1")  %>% select(Uniprot_Accession, Gene_Symbol, Protein_Description)
  
view(upregulated)
# and screen shot that table
head(upregulated)
## # A tibble: 6 × 3
##   Uniprot_Accession Gene_Symbol Protein_Description                       
##   <chr>             <chr>       <chr>                                     
## 1 Q8IWT0            ZBTB8OS     ARCH_HUMAN Protein archease               
## 2 P05387            RPLP2       RLA2_HUMAN 60S acidic ribosomal protein P2
## 3 P63098            PPP3R1      CANB1_HUMAN Calcineurin subunit B type 1  
## 4 P06703            S100A6      S10A6_HUMAN Protein S100-A6               
## 5 Q9BTY7            FAM203A     F203A_HUMAN Protein FAM203A               
## 6 Q02156            PRKCE       KPCE_HUMAN Protein kinase C epsilon type
#Downregulated genes: 

downregulated<-ci_data%>% filter (Gene_Symbol=="USP1" | Gene_Symbol=="TTYH3" | Gene_Symbol=="MGST3" | Gene_Symbol=="MT-ND1" | Gene_Symbol=="MAFF" | Gene_Symbol=="AK4"| Gene_Symbol=="PDK1" | Gene_Symbol=="P4HA2"| Gene_Symbol=="REXO4"| Gene_Symbol=="SETD1B"| Gene_Symbol=="CCDC82"| Gene_Symbol=="SERPINI1"| Gene_Symbol=="RET"| Gene_Symbol=="CBFA2T2") %>% select(Uniprot_Accession, Gene_Symbol, Protein_Description)
  
view(downregulated)
# and screen shot that table

head(downregulated)
## # A tibble: 6 × 3
##   Uniprot_Accession Gene_Symbol Protein_Description                             
##   <chr>             <chr>       <chr>                                           
## 1 Q8N4S0            CCDC82      CCD82_HUMAN Coiled-coil domain-containing prote…
## 2 Q9ULX9            MAFF        MAFF_HUMAN Transcription factor MafF            
## 3 P27144            AK4         KAD4_HUMAN Adenylate kinase 4, mitochondrial    
## 4 Q15118            PDK1        PDK1_HUMAN [Pyruvate dehydrogenase (acetyl-tran…
## 5 O15460            P4HA2       P4HA2_HUMAN Prolyl 4-hydroxylase subunit alpha-2
## 6 Q9C0H2            TTYH3       TTYH3_HUMAN Isoform 4 of Protein tweety homolog…

##now you can knit this and publish to save and share your code. be sure to include the links to your knitted doc for each of your group mates in your lab 6 ELN. #remeber: ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!