####Run through this example and try to understand what is going on with the data
####So, lets load RMarkdown
library(rmarkdown)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)
mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")
pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)
## load the dataset
ci_data<-read_csv(file = "complex_I_inhibitors.csv")
head(ci_data)
## # A tibble: 6 × 18
## Uniprot_…¹ Gene_…² Prote…³ Pepti…⁴ PBS_1 PBS_2 Phenf…⁵ Phenf…⁶ Pieri…⁷ Pieri…⁸
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 P50851 LRBA LRBA_H… 9 9.3 9.4 11 11 7.3 8.2
## 2 Q9UIF8 BAZ2B BAZ2B_… 2 13.1 13.1 9.3 9.2 8.8 8.8
## 3 P15927 RPA2 RFA2_H… 12 8.2 8.2 9.4 9.3 9.5 8.7
## 4 P13797 PLS3 PLST_H… 22 8.6 8.6 11 11.1 8.7 9.8
## 5 Q9H9B1 EHMT1 EHMT1_… 8 13.2 13.3 10.9 11 9.9 9.5
## 6 Q86WX3 RPS19B… AROS_H… 2 12.4 12.4 11.4 11.3 8.8 8.8
## # … with 8 more variables: Rotenone_1 <dbl>, Rotenone_2 <dbl>, MPP_1 <dbl>,
## # MPP_2 <dbl>, pvalue_phenformin_vs_PBS <dbl>,
## # pvalue_piericidin_vs_PBS <dbl>, pvalue_rotenone_vs_PBS <dbl>,
## # pvalue_MPP_vs_PBS <dbl>, and abbreviated variable names ¹Uniprot_Accession,
## # ²Gene_Symbol, ³Protein_Description, ⁴Peptides, ⁵Phenformin_1,
## # ⁶Phenformin_2, ⁷Piericidin_1, ⁸Piericidin_2
colnames(ci_data)
## [1] "Uniprot_Accession" "Gene_Symbol"
## [3] "Protein_Description" "Peptides"
## [5] "PBS_1" "PBS_2"
## [7] "Phenformin_1" "Phenformin_2"
## [9] "Piericidin_1" "Piericidin_2"
## [11] "Rotenone_1" "Rotenone_2"
## [13] "MPP_1" "MPP_2"
## [15] "pvalue_phenformin_vs_PBS" "pvalue_piericidin_vs_PBS"
## [17] "pvalue_rotenone_vs_PBS" "pvalue_MPP_vs_PBS"
## make a matrix of just original raw the data
ci_mat1<-ci_data %>% select(PBS_1:MPP_2) %>% as.matrix() %>% round(.,2)
head(ci_mat1)
## PBS_1 PBS_2 Phenformin_1 Phenformin_2 Piericidin_1 Piericidin_2 Rotenone_1
## [1,] 9.3 9.4 11.0 11.0 7.3 8.2 12.5
## [2,] 13.1 13.1 9.3 9.2 8.8 8.8 9.3
## [3,] 8.2 8.2 9.4 9.3 9.5 8.7 10.2
## [4,] 8.6 8.6 11.0 11.1 8.7 9.8 11.3
## [5,] 13.2 13.3 10.9 11.0 9.9 9.5 7.3
## [6,] 12.4 12.4 11.4 11.3 8.8 8.8 9.0
## Rotenone_2 MPP_1 MPP_2
## [1,] 12.7 9.8 9.0
## [2,] 9.1 9.2 10.1
## [3,] 9.6 12.7 14.0
## [4,] 10.8 10.1 10.0
## [5,] 7.4 9.1 8.4
## [6,] 8.6 7.9 9.5
## make a heatmap from the data
pheatmap(ci_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
## make some new columns that take the average of the pairs of observations for each variable
ci_data <- ci_data %>% mutate(
mean_PBS= ((PBS_1 + PBS_2)/2),
mean_Phenformin= ((Phenformin_1 + Phenformin_2)/2),
mean_Piericidin= ((Piericidin_1 + Piericidin_2)/2),
mean_Rotenone= ((Rotenone_1 + Rotenone_2)/2),
mean_MPP= ((MPP_1 + MPP_2)/2))
colnames(ci_data)
## [1] "Uniprot_Accession" "Gene_Symbol"
## [3] "Protein_Description" "Peptides"
## [5] "PBS_1" "PBS_2"
## [7] "Phenformin_1" "Phenformin_2"
## [9] "Piericidin_1" "Piericidin_2"
## [11] "Rotenone_1" "Rotenone_2"
## [13] "MPP_1" "MPP_2"
## [15] "pvalue_phenformin_vs_PBS" "pvalue_piericidin_vs_PBS"
## [17] "pvalue_rotenone_vs_PBS" "pvalue_MPP_vs_PBS"
## [19] "mean_PBS" "mean_Phenformin"
## [21] "mean_Piericidin" "mean_Rotenone"
## [23] "mean_MPP"
## make some new columns that store the log ratios of the means you just created
ci_data<-ci_data %>% mutate(
log_Phenformin=log2(mean_Phenformin/mean_PBS),
log_Piericidin=log2(mean_Piericidin/mean_PBS),
log_Rotenone=log2(mean_Rotenone/mean_PBS),
log_MPP=log2(mean_MPP/mean_PBS))
colnames(ci_data)
## [1] "Uniprot_Accession" "Gene_Symbol"
## [3] "Protein_Description" "Peptides"
## [5] "PBS_1" "PBS_2"
## [7] "Phenformin_1" "Phenformin_2"
## [9] "Piericidin_1" "Piericidin_2"
## [11] "Rotenone_1" "Rotenone_2"
## [13] "MPP_1" "MPP_2"
## [15] "pvalue_phenformin_vs_PBS" "pvalue_piericidin_vs_PBS"
## [17] "pvalue_rotenone_vs_PBS" "pvalue_MPP_vs_PBS"
## [19] "mean_PBS" "mean_Phenformin"
## [21] "mean_Piericidin" "mean_Rotenone"
## [23] "mean_MPP" "log_Phenformin"
## [25] "log_Piericidin" "log_Rotenone"
## [27] "log_MPP"
##Alternative Heatmap of log values compared to control
ci_mat2<-ci_data %>% select(log_Phenformin:log_MPP) %>% as.matrix() %>% round(.,2)
#Need to filter out na's
#no second heat map - data contains 0's and i couldnt get it to work
#pheatmap(ci_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")
##Preview of data via BASIC VOLCANO PLOT
## Add a column that stores the negative log of the pvalue of interest
ci_data<-ci_data %>% mutate(
neglog_Phenformin=-log10(pvalue_phenformin_vs_PBS),
neglog_Piericidin=-log10(pvalue_piericidin_vs_PBS),
neglog_Rotenone=-log10(pvalue_rotenone_vs_PBS),
neglog_MPP=-log10(pvalue_MPP_vs_PBS))
colnames(ci_data)
## [1] "Uniprot_Accession" "Gene_Symbol"
## [3] "Protein_Description" "Peptides"
## [5] "PBS_1" "PBS_2"
## [7] "Phenformin_1" "Phenformin_2"
## [9] "Piericidin_1" "Piericidin_2"
## [11] "Rotenone_1" "Rotenone_2"
## [13] "MPP_1" "MPP_2"
## [15] "pvalue_phenformin_vs_PBS" "pvalue_piericidin_vs_PBS"
## [17] "pvalue_rotenone_vs_PBS" "pvalue_MPP_vs_PBS"
## [19] "mean_PBS" "mean_Phenformin"
## [21] "mean_Piericidin" "mean_Rotenone"
## [23] "mean_MPP" "log_Phenformin"
## [25] "log_Piericidin" "log_Rotenone"
## [27] "log_MPP" "neglog_Phenformin"
## [29] "neglog_Piericidin" "neglog_Rotenone"
## [31] "neglog_MPP"
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_Phenformin<-ci_data %>% ggplot(aes(x=log_Phenformin,y=neglog_Phenformin,description=Gene_Symbol))+
geom_point(alpha=0.7,color="royalblue")
volcano_plot_Phenformin
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_Piericidin<-ci_data %>% ggplot(aes(x=log_Piericidin,y=neglog_Piericidin,description=Gene_Symbol))+
geom_point(alpha=0.7,color="darkgreen")
volcano_plot_Piericidin
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_Rotenone<-ci_data %>% ggplot(aes(x=log_Rotenone,y=neglog_Rotenone,description=Gene_Symbol))+
geom_point(alpha=0.7,color="purple")
volcano_plot_Rotenone
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_MPP<-ci_data %>% ggplot(aes(x=log_MPP,y=neglog_MPP,description=Gene_Symbol))+
geom_point(alpha=0.7,color="red")
volcano_plot_MPP
## BETTER VOLCANO PLOT for Phenformin
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
ci_data<-ci_data %>% mutate(significance=ifelse((log_Phenformin>1 & neglog_Phenformin>1.99),"UP", ifelse((log_Phenformin<c(-0.95) & neglog_Phenformin>1.5),"DOWN","NOT SIG")))
# adjusted significance values from 2 & 2.99 to 3 and 3.99 - too many showing for this cell line
## Some standard colors
plain_cols3<-c("blue","gray","red")
## volcano plot as before with some added things
better_volcano_plot_Phenformin<-ci_data %>% ggplot(aes(x=log_Phenformin,y=neglog_Phenformin,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio of Phenformin compared to control, PBS",y="-log(p-value)")
better_volcano_plot_Phenformin
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot_Phenformin)
## Significant up regulated Phenformin are: ZBTB8OS, RPLP2, PPP3R1
## Significant down regulated Phenformin are: MAFF, AK4, PDK1, P4HA2
## BETTER VOLCANO PLOT for Piericidin
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
ci_data<-ci_data %>% mutate(significance=ifelse((log_Piericidin>2 & neglog_Piericidin>2.5),"UP", ifelse((log_Piericidin<c(-2.5) & neglog_Piericidin>1.99),"DOWN","NOT SIG")))
# adjusted significance values - too many down regulated for this cell line
## Some standard colors
plain_cols3<-c("darkgreen","gray","orange")
## volcano plot as before with some added things
better_volcano_plot_Piericidin<-ci_data %>% ggplot(aes(x=log_Piericidin,y=neglog_Piericidin,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio of Piericidin compared to control, PBS",y="-log(p-value)")
better_volcano_plot_Piericidin
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot_Piericidin)
## Significant up regulated Piericidin are:HSP90AB1, DPY30, RPS29
## Significant down regulated Piericidin are: USP1, TTYH3, MGST3, MT-ND1
## BETTER VOLCANO PLOT for Rotenone
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
ci_data<-ci_data %>% mutate(significance=ifelse((log_Rotenone>2.5 & neglog_Rotenone>2),"UP", ifelse((log_Rotenone<c(-2.5) & neglog_Rotenone>2),"DOWN","NOT SIG")))
# adjusted significance values - too many down regulated for this cell line
## Some standard colors
plain_cols3<-c("purple","gray","brown")
## volcano plot as before with some added things
better_volcano_plot_Rotenone<-ci_data %>% ggplot(aes(x=log_Rotenone,y=neglog_Rotenone,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio of Rotenone compared to control, PBS",y="-log(p-value)")
better_volcano_plot_Rotenone
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot_Rotenone)
## Significant up regulated Rotenone are: FAM129A, GEM, SQSTM1, S100A6
## Significant down regulated Rotenone are: RET, CBFA2T2
## BETTER VOLCANO PLOT for MPP
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
ci_data<-ci_data %>% mutate(significance=ifelse((log_MPP>3 & neglog_MPP>3),"UP", ifelse((log_MPP<c(-3) & neglog_MPP>3),"DOWN","NOT SIG")))
# adjusted significance values - too many down regulated for this cell line
## Some standard colors
plain_cols3<-c("red","gray","cyan")
## volcano plot as before with some added things
better_volcano_plot_MPP<-ci_data %>% ggplot(aes(x=log_MPP,y=neglog_MPP,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio of MPP compared to control, PBS",y="-log(p-value)")
better_volcano_plot_MPP
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot_MPP)
## Significant up regulated MPP are: PRKCE, FAM203A, VPS8
## Significant down regulated MPP are: REXO4, SETD1B, CCDC82, SERPINI1
# Significant up regulated MPP are: PRKCE, FAM203A, VPS8 # Significant up regulated Rotenone are: FAM129A, GEM, SQSTM1, S100A6 # Significant up regulated Piericidin are:HSP90AB1, DPY30, RPS29 # Significant up regulated Phenformin are: ZBTB8OS, RPLP2, PPP3R1
# Significant down regulated Piericidin are: USP1, TTYH3, MGST3, MT-ND1 # Significant down regulated Phenformin are: MAFF, AK4, PDK1, P4HA2 # Significant down regulated MPP are: REXO4, SETD1B, CCDC82, SERPINI1 # Significant down regulated Rotenone are: RET, CBFA2T2
# Make a pivot longer table of the data
ci_long<-pivot_longer(ci_data, cols = c(PBS_1:MPP_2), names_to = 'variable')%>% select(-c(pvalue_phenformin_vs_PBS:significance))%>%select(-Protein_Description, -Peptides)
head(ci_long)
## # A tibble: 6 × 4
## Uniprot_Accession Gene_Symbol variable value
## <chr> <chr> <chr> <dbl>
## 1 P50851 LRBA PBS_1 9.3
## 2 P50851 LRBA PBS_2 9.4
## 3 P50851 LRBA Phenformin_1 11
## 4 P50851 LRBA Phenformin_2 11
## 5 P50851 LRBA Piericidin_1 7.3
## 6 P50851 LRBA Piericidin_2 8.2
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-ci_long %>% filter(Gene_Symbol=="USP1" | Gene_Symbol=="TTYH3" | Gene_Symbol=="MGST3" | Gene_Symbol=="MT-ND1" | Gene_Symbol=="MAFF" | Gene_Symbol=="AK4"| Gene_Symbol=="PDK1" | Gene_Symbol=="P4HA2"| Gene_Symbol=="REXO4"| Gene_Symbol=="SETD1B"| Gene_Symbol=="CCDC82"| Gene_Symbol=="SERPINI1"| Gene_Symbol=="RET"| Gene_Symbol=="CBFA2T2")
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
Example_plot_down<-Examples_Down %>%
ggplot(aes(x=factor(variable,levels=c('PBS_1','PBS_2','Phenformin_1','Phenformin_2','Piericidin_1','Piericidin_2','Rotenone_1','Rotenone_2','MPP_1','MPP_2')),y=value))+
geom_bar(stat="identity",fill="blue")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=10))+
theme(text = element_text(size=14))+
theme(axis.text.x = element_text(angle=45, hjust=1))+
labs(x="sample",y="relative intensity")
Example_plot_down
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
## Same process for the upregulated ones
Examples_Up<-ci_long %>% filter(Gene_Symbol=="PRKCE" | Gene_Symbol=="FAM203A" | Gene_Symbol=="VPS8" | Gene_Symbol=="FAM129A" | Gene_Symbol=="GEM" | Gene_Symbol=="SQSTM1"| Gene_Symbol=="S100A6" | Gene_Symbol=="HSP90AB1" | Gene_Symbol=="RPS29"| Gene_Symbol=="DPY30"| Gene_Symbol=="ZBTB8OS"| Gene_Symbol=="RPLP2"| Gene_Symbol=="PPP3R1")
Example_plot_up<-Examples_Up %>%
ggplot(aes(x=factor(variable,levels=c('PBS_1','PBS_2','Phenformin_1','Phenformin_2','Piericidin_1','Piericidin_2','Rotenone_1','Rotenone_2','MPP_1','MPP_2')),y=value))+
geom_bar(stat="identity",fill="red")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=10))+
theme(text = element_text(size=14))+
theme(axis.text.x = element_text(angle=45, hjust=1))+
labs(x="sample",y="relative intensity")
Example_plot_up
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#Upregulated genes:
upregulated<-ci_data%>% filter (Gene_Symbol=="PRKCE" | Gene_Symbol=="FAM203A" | Gene_Symbol=="VPS8" | Gene_Symbol=="FAM129A" | Gene_Symbol=="GEM" | Gene_Symbol=="SQSTM1"| Gene_Symbol=="S100A6" | Gene_Symbol=="HSP90AB1" | Gene_Symbol=="RPS29"| Gene_Symbol=="DPY30"| Gene_Symbol=="ZBTB8OS"| Gene_Symbol=="RPLP2"| Gene_Symbol=="PPP3R1") %>% select(Uniprot_Accession, Gene_Symbol, Protein_Description)
view(upregulated)
# and screen shot that table
head(upregulated)
## # A tibble: 6 × 3
## Uniprot_Accession Gene_Symbol Protein_Description
## <chr> <chr> <chr>
## 1 Q8IWT0 ZBTB8OS ARCH_HUMAN Protein archease
## 2 P05387 RPLP2 RLA2_HUMAN 60S acidic ribosomal protein P2
## 3 P63098 PPP3R1 CANB1_HUMAN Calcineurin subunit B type 1
## 4 P06703 S100A6 S10A6_HUMAN Protein S100-A6
## 5 Q9BTY7 FAM203A F203A_HUMAN Protein FAM203A
## 6 Q02156 PRKCE KPCE_HUMAN Protein kinase C epsilon type
#Downregulated genes:
downregulated<-ci_data%>% filter (Gene_Symbol=="USP1" | Gene_Symbol=="TTYH3" | Gene_Symbol=="MGST3" | Gene_Symbol=="MT-ND1" | Gene_Symbol=="MAFF" | Gene_Symbol=="AK4"| Gene_Symbol=="PDK1" | Gene_Symbol=="P4HA2"| Gene_Symbol=="REXO4"| Gene_Symbol=="SETD1B"| Gene_Symbol=="CCDC82"| Gene_Symbol=="SERPINI1"| Gene_Symbol=="RET"| Gene_Symbol=="CBFA2T2") %>% select(Uniprot_Accession, Gene_Symbol, Protein_Description)
view(downregulated)
# and screen shot that table
head(downregulated)
## # A tibble: 6 × 3
## Uniprot_Accession Gene_Symbol Protein_Description
## <chr> <chr> <chr>
## 1 Q8N4S0 CCDC82 CCD82_HUMAN Coiled-coil domain-containing prote…
## 2 Q9ULX9 MAFF MAFF_HUMAN Transcription factor MafF
## 3 P27144 AK4 KAD4_HUMAN Adenylate kinase 4, mitochondrial
## 4 Q15118 PDK1 PDK1_HUMAN [Pyruvate dehydrogenase (acetyl-tran…
## 5 O15460 P4HA2 P4HA2_HUMAN Prolyl 4-hydroxylase subunit alpha-2
## 6 Q9C0H2 TTYH3 TTYH3_HUMAN Isoform 4 of Protein tweety homolog…
##now you can knit this and publish to save and share your code. be sure to include the links to your knitted doc for each of your group mates in your lab 6 ELN. #remeber: ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!