####Run through this example and try to understand what is going on with the data
####So, lets load RMarkdown
library(rmarkdown)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)
mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")
pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)
## load the dataset
bc_data<-read_csv(file = "breast_cancer_cells.csv")
head(bc_data)
## # A tibble: 6 × 17
## Gene_S…¹ Descr…² Pepti…³ MCF10…⁴ MCF10…⁵ MCF7_1 MCF7_2 MDA23…⁶ MDA23…⁷ MDA46…⁸
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NES Nestin… 7 9.54 4.58 5.07 5.42 25.4 27.4 4.56
## 2 ZC3HC1 Isofor… 2 14 11.6 6.49 6.64 9.8 10.3 6.84
## 3 CAMSAP1… Isofor… 5 10.2 8.29 11.6 10.8 12.5 10.1 9.54
## 4 COBRA1 Negati… 3 9 6.35 13.4 14.8 14.9 11.3 6.87
## 5 HAUS6 Isofor… 5 8.21 4.44 12.1 9.82 16.5 12.3 15.4
## 6 CHCHD4 Isofor… 5 12.5 15.8 8.05 8.38 6.78 8.46 4.48
## # … with 7 more variables: MDA468_2 <dbl>, SKBR3_1 <dbl>, SKBR3_2 <dbl>,
## # pvalue_MCF7_vs_MCF10A <dbl>, pvalue_MDA231_vs_MCF10A <dbl>,
## # pvalue_MDA468_vs_MCF10A <dbl>, pvalue_SKBR3_vs_MCF10A <dbl>, and
## # abbreviated variable names ¹Gene_Symbol, ²Description, ³Peptides,
## # ⁴MCF10A_1, ⁵MCF10A_2, ⁶MDA231_1, ⁷MDA231_2, ⁸MDA468_1
colnames(bc_data)
## [1] "Gene_Symbol" "Description"
## [3] "Peptides" "MCF10A_1"
## [5] "MCF10A_2" "MCF7_1"
## [7] "MCF7_2" "MDA231_1"
## [9] "MDA231_2" "MDA468_1"
## [11] "MDA468_2" "SKBR3_1"
## [13] "SKBR3_2" "pvalue_MCF7_vs_MCF10A"
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"
## make a matrix of just original raw the data
bc_mat1<-bc_data %>% select(MCF10A_1:SKBR3_2) %>% as.matrix() %>% round(.,2)
head(bc_mat1)
## MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,] 9.54 4.58 5.07 5.42 25.43 27.42 4.56 3.88
## [2,] 14.00 11.58 6.49 6.64 9.80 10.31 6.84 8.75
## [3,] 10.22 8.29 11.55 10.82 12.48 10.11 9.54 8.46
## [4,] 9.00 6.35 13.40 14.82 14.94 11.33 6.87 7.92
## [5,] 8.21 4.44 12.08 9.82 16.51 12.34 15.43 8.50
## [6,] 12.51 15.84 8.05 8.38 6.78 8.46 4.48 7.18
## SKBR3_1 SKBR3_2
## [1,] 8.46 5.64
## [2,] 11.91 13.67
## [3,] 9.10 9.43
## [4,] 8.54 6.82
## [5,] 7.93 4.75
## [6,] 13.27 15.05
## make a heatmap from the data
pheatmap(bc_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
## make some new columns that take the average of the pairs of observations for each variable
bc_data <- bc_data %>% mutate(
mean_MCF10A= ((MCF10A_1 + MCF10A_2)/2),
mean_MCF7= ((MCF7_1 + MCF7_2)/2),
mean_MDA231= ((MDA231_1 + MDA231_2)/2),
mean_MDA468= ((MDA468_1 + MDA468_2)/2),
mean_SKBR3= ((SKBR3_1 + SKBR3_2)/2))
colnames(bc_data)
## [1] "Gene_Symbol" "Description"
## [3] "Peptides" "MCF10A_1"
## [5] "MCF10A_2" "MCF7_1"
## [7] "MCF7_2" "MDA231_1"
## [9] "MDA231_2" "MDA468_1"
## [11] "MDA468_2" "SKBR3_1"
## [13] "SKBR3_2" "pvalue_MCF7_vs_MCF10A"
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A" "mean_MCF10A"
## [19] "mean_MCF7" "mean_MDA231"
## [21] "mean_MDA468" "mean_SKBR3"
## make some new columns that store the log ratios of the means you just created
bc_data<-bc_data %>% mutate(
log_MCF7=log2(mean_MCF7/mean_MCF10A),
log_MDA231=log2(mean_MDA231/mean_MCF10A),
log_MDA468=log2(mean_MDA468/mean_MCF10A),
log_SKBR3=log2(mean_SKBR3/mean_MCF10A))
colnames(bc_data)
## [1] "Gene_Symbol" "Description"
## [3] "Peptides" "MCF10A_1"
## [5] "MCF10A_2" "MCF7_1"
## [7] "MCF7_2" "MDA231_1"
## [9] "MDA231_2" "MDA468_1"
## [11] "MDA468_2" "SKBR3_1"
## [13] "SKBR3_2" "pvalue_MCF7_vs_MCF10A"
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A" "mean_MCF10A"
## [19] "mean_MCF7" "mean_MDA231"
## [21] "mean_MDA468" "mean_SKBR3"
## [23] "log_MCF7" "log_MDA231"
## [25] "log_MDA468" "log_SKBR3"
##Alternative Heatmap of log values compared to control
bc_mat2<-bc_data %>% select(log_MCF7:log_SKBR3) %>% as.matrix() %>% round(.,2)
pheatmap(bc_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")
##Preview of data via BASIC VOLCANO PLOT
## Add a column that stores the negative log of the pvalue of interest
bc_data<-bc_data %>% mutate(
neglog_MCF7=-log10(pvalue_MCF7_vs_MCF10A),
neglog_MDA231=-log10(pvalue_MDA231_vs_MCF10A),
neglog_MDA468=-log10(pvalue_MDA468_vs_MCF10A),
neglog_SKBR3=-log10(pvalue_SKBR3_vs_MCF10A))
colnames(bc_data)
## [1] "Gene_Symbol" "Description"
## [3] "Peptides" "MCF10A_1"
## [5] "MCF10A_2" "MCF7_1"
## [7] "MCF7_2" "MDA231_1"
## [9] "MDA231_2" "MDA468_1"
## [11] "MDA468_2" "SKBR3_1"
## [13] "SKBR3_2" "pvalue_MCF7_vs_MCF10A"
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A" "mean_MCF10A"
## [19] "mean_MCF7" "mean_MDA231"
## [21] "mean_MDA468" "mean_SKBR3"
## [23] "log_MCF7" "log_MDA231"
## [25] "log_MDA468" "log_SKBR3"
## [27] "neglog_MCF7" "neglog_MDA231"
## [29] "neglog_MDA468" "neglog_SKBR3"
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_MCF7<-bc_data %>% ggplot(aes(x=log_MCF7,y=neglog_MCF7,description=Gene_Symbol))+
geom_point(alpha=0.7,color="royalblue")
volcano_plot_MCF7
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_MDA231<-bc_data %>% ggplot(aes(x=log_MDA231,y=neglog_MDA231,description=Gene_Symbol))+
geom_point(alpha=0.7,color="darkgreen")
volcano_plot_MDA231
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_MDA468<-bc_data %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol))+
geom_point(alpha=0.7,color="purple")
volcano_plot_MDA468
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot_SKBR3<-bc_data %>% ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol))+
geom_point(alpha=0.7,color="red")
volcano_plot_SKBR3
## BETTER VOLCANO PLOT for MCF7
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
bc_data<-bc_data %>% mutate(significance=ifelse((log_MCF7>3 & neglog_MCF7>3.99),"UP", ifelse((log_MCF7<c(-3) & neglog_MCF7>3.99),"DOWN","NOT SIG")))
# adjusted significance values from 2 & 2.99 to 3 and 3.99 - too many showing for this cell line
## Some standard colors
plain_cols3<-c("blue","gray", "red")
## volcano plot as before with some added things
better_volcano_plot_MCF7<-bc_data %>% ggplot(aes(x=log_MCF7,y=neglog_MCF7,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio of MCF7 compared to control, MCF10A",y="-log(p-value)")
better_volcano_plot_MCF7
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot_MCF7)
## Significant up regulated MCF7 are: C17orf28, SYT7, CORO1A and AGR3
## Significant down regulated MCF7 are: CTSZ and APOA1
## BETTER VOLCANO PLOT for MDA231
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
bc_data<-bc_data %>% mutate(significance=ifelse((log_MDA231>2 & neglog_MDA231>2.75),"UP", ifelse((log_MDA231<c(-3) & neglog_MDA231>2.99),"DOWN","NOT SIG")))
# adjusted significance values - too many down regulated for this cell line
## Some standard colors
plain_cols3<-c("darkgreen","gray","orange")
## volcano plot as before with some added things
better_volcano_plot_MDA231<-bc_data %>% ggplot(aes(x=log_MDA231,y=neglog_MDA231,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio of MDA231 compared to control, MCF10A",y="-log(p-value)")
better_volcano_plot_MDA231
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot_MDA231)
## Significant up regulated MDA231 are: ACSS3, PAFAH1B3,and SMEK2
## Significant down regulated MDA231 are: APOA1, HLA-A, HIST1H1A, and PTGFRN
## BETTER VOLCANO PLOT for MDA468
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
bc_data<-bc_data %>% mutate(significance=ifelse((log_MDA468>3.6 & neglog_MDA468>3.6),"UP", ifelse((log_MDA468<c(-3) & neglog_MDA468>3.5),"DOWN","NOT SIG")))
# adjusted significance values - too many down regulated for this cell line
## Some standard colors
plain_cols3<-c("purple","gray","brown")
## volcano plot as before with some added things
better_volcano_plot_MDA468<-bc_data %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio of MDA468 compared to control, MCF10A",y="-log(p-value)")
better_volcano_plot_MDA468
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot_MDA468)
## Significant up regulated MDA468 are: SYT7, ALDH1A3, AKR1C2, and NLRP2
## Significant down regulated MDA468 are: HLA-A, and PLAU
## BETTER VOLCANO PLOT for SKBR3
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
bc_data<-bc_data %>% mutate(significance=ifelse((log_SKBR3>2.7 & neglog_SKBR3>2.7),"UP", ifelse((log_SKBR3<c(-2.5) & neglog_SKBR3>2.5),"DOWN","NOT SIG")))
# adjusted significance values - too many down regulated for this cell line
## Some standard colors
plain_cols3<-c("red","gray","cyan")
## volcano plot as before with some added things
better_volcano_plot_SKBR3<-bc_data %>% ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
xlim(-6,6)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="log ratio of MDA468 compared to control, MCF10A",y="-log(p-value)")
better_volcano_plot_SKBR3
## use ggplotly to see hover over the points to see the gene names. Record these for the next step
ggplotly(better_volcano_plot_SKBR3)
## Significant up regulated SKBR3are: HOOK1, ASS1, EFHD1, and ATP2A3
## Significant down regulated SKBR3 are: HLA-A, APOA1, MYO1B, and TMX4
# Significant up regulated MCF7 are: C17orf28, SYT7, CORO1A and AGR3 # Significant up regulated MDA231 are: ACSS3, PAFAH1B3,and SMEK2 # Significant up regulated MDA468 are: SYT7, ALDH1A3, AKR1C2, and NLRP2 # Significant up regulated SKBR3are: HOOK1, ASS1, EFHD1, and ATP2A3
# Significant down regulated MCF7 are: CTSZ and APOA1 # Significant down regulated MDA231 are: APOA1, HLA-A, HIST1H1A, and PTGFRN # Significant down regulated MDA468 are: HLA-A, and PLAU # Significant down regulated SKBR3 are: HLA-A, APOA1, MYO1B, and TMX4 #therefore, HLA-A and /or APOA1 are highly significant down regulated in all cell lines
# Make a pivot longer table of the data
bc_long<-pivot_longer(bc_data, cols = c(MCF10A_1:SKBR3_2), names_to = 'variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))%>%select(-Description, -Peptides)
head(bc_long)
## # A tibble: 6 × 3
## Gene_Symbol variable value
## <chr> <chr> <dbl>
## 1 NES MCF10A_1 9.54
## 2 NES MCF10A_2 4.58
## 3 NES MCF7_1 5.07
## 4 NES MCF7_2 5.42
## 5 NES MDA231_1 25.4
## 6 NES MDA231_2 27.4
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-bc_long %>% filter(Gene_Symbol=="HLA-A" | Gene_Symbol=="APOA1" | Gene_Symbol=="CTSZ" | Gene_Symbol=="HIST1H1A" | Gene_Symbol=="PTGFRN" | Gene_Symbol=="PLAU"| Gene_Symbol=="MYO1B" | Gene_Symbol=="TMX4")
## Significant down regulated MCF7 are: CTSZ and APOA1
## Significant down regulated MDA231 are: APOA1, HLA-A, HIST1H1A, and PTGFRN
## Significant down regulated MDA468 are: HLA-A, and PLAU
## Significant down regulated SKBR3 are: HLA-A, APOA1, MYO1B, and TMX4
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
Example_plot_down<-Examples_Down %>%
ggplot(aes(x=variable,y=value))+
geom_bar(stat="identity",fill="blue")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=10))+
theme(text = element_text(size=14))+
theme(axis.text.x = element_text(angle=45, hjust=1))+
labs(x="sample",y="relative intensity")
Example_plot_down
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
## Same process for the upregulated ones
Examples_Up<-bc_long %>% filter(Gene_Symbol=="C17orf28" | Gene_Symbol=="SYT7" | Gene_Symbol=="CORO1A" | Gene_Symbol=="AGR3" | Gene_Symbol=="ACSS3" | Gene_Symbol=="PAFAH1B3"| Gene_Symbol=="SMEK2" | Gene_Symbol=="SYT7" | Gene_Symbol=="ALDH1A3"| Gene_Symbol=="AKR1C2"| Gene_Symbol=="NLRP2"| Gene_Symbol=="HOOK1"| Gene_Symbol=="ASS1"| Gene_Symbol=="EFHD1"| Gene_Symbol=="ATP2A3")
## Significant up regulated MCF7 are: C17orf28, SYT7, CORO1A and AGR3
## Significant up regulated MDA231 are: ACSS3, PAFAH1B3,and SMEK2
## Significant up regulated MDA468 are: SYT7, ALDH1A3, AKR1C2, and NLRP2
## Significant up regulated SKBR3are: HOOK1, ASS1, EFHD1, and ATP2A3
Example_plot_up<-Examples_Up %>%
ggplot(aes(x=variable,y=value))+
geom_bar(stat="identity",fill="red")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=10))+
theme(text = element_text(size=14))+
theme(axis.text.x = element_text(angle=45, hjust=1))+
labs(x="sample",y="relative intensity")
Example_plot_up
##INTERPRETATION## ## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? ##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#Upregulated genes:
upregulated<-bc_data%>% filter (Gene_Symbol=="C17orf28" | Gene_Symbol=="SYT7" | Gene_Symbol=="CORO1A" | Gene_Symbol=="AGR3" | Gene_Symbol=="ACSS3" | Gene_Symbol=="PAFAH1B3"| Gene_Symbol=="SMEK2" | Gene_Symbol=="SYT7" | Gene_Symbol=="ALDH1A3"| Gene_Symbol=="AKR1C2"| Gene_Symbol=="NLRP2"| Gene_Symbol=="HOOK1"| Gene_Symbol=="ASS1"| Gene_Symbol=="EFHD1"| Gene_Symbol=="ATP2A3") %>% select(Gene_Symbol, Description)
view(upregulated)
# and screen shot that table
head(upregulated)
## # A tibble: 6 × 2
## Gene_Symbol Description
## <chr> <chr>
## 1 AGR3 Anterior gradient protein 3 homolog OS=Homo sapiens GN=AGR3 PE=1 …
## 2 AKR1C2 Aldo-keto reductase family 1 member C2 OS=Homo sapiens GN=AKR1C2 …
## 3 NLRP2 Isoform 2 of NACHT, LRR and PYD domains-containing protein 2 OS=H…
## 4 SYT7 Synaptotagmin-7 OS=Homo sapiens GN=SYT7 PE=1 SV=3
## 5 ALDH1A3 Aldehyde dehydrogenase family 1 member A3 OS=Homo sapiens GN=ALDH…
## 6 ACSS3 Isoform 2 of Acyl-CoA synthetase short-chain family member 3, mit…
#Downregulated genes:
downregulated<-bc_data%>% filter (Gene_Symbol=="HLA-A" | Gene_Symbol=="APOA1" | Gene_Symbol=="CTSZ" | Gene_Symbol=="HIST1H1A" | Gene_Symbol=="PTGFRN" | Gene_Symbol=="PLAU"| Gene_Symbol=="MYO1B" | Gene_Symbol=="TMX4") %>% select(Gene_Symbol, Description)
view(downregulated)
# and screen shot that table
head(downregulated)
## # A tibble: 6 × 2
## Gene_Symbol Description
## <chr> <chr>
## 1 HLA-A HLA class I histocompatibility antigen, A-23 alpha chain OS=Homo …
## 2 HLA-A HLA class I histocompatibility antigen, A-30 alpha chain OS=Homo …
## 3 HLA-A HLA class I histocompatibility antigen, A-2 alpha chain OS=Homo s…
## 4 CTSZ Cathepsin Z OS=Homo sapiens GN=CTSZ PE=1 SV=1
## 5 PLAU Isoform 2 of Urokinase-type plasminogen activator OS=Homo sapiens…
## 6 HIST1H1A Histone H1.1 OS=Homo sapiens GN=HIST1H1A PE=1 SV=3
##now you can knit this and publish to save and share your code. be sure to include the links to your knitted doc for each of your group mates in your lab 6 ELN. #remeber: ##Annotate when you have trouble and reference which line of code you need help on ## good luck and have fun!