library(reshape2)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(viridis)
## Loading required package: viridisLite
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
library(pheatmap)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(clipr)
## Welcome to clipr. See ?write_clip for advisories on writing to the clipboard in R.
library(tidyr)
library(Rcpp)
mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")
pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)
## load the dataset (cells treated with COVID)
cancer_data<-read_csv(file="breast_cancer_cells.csv")
## Rows: 5144 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Gene_Symbol, Description
## dbl (15): Peptides, MCF10A_1, MCF10A_2, MCF7_1, MCF7_2, MDA231_1, MDA231_2, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#and then view it
view(cancer_data)
#make a matrix of just raw the data
cancer_mat1<-cancer_data %>%
select(MCF10A_1:SKBR3_2) %>%
as.matrix() %>%
round(.,2)
# what does that look like?
head(cancer_mat1)
## MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,] 9.54 4.58 5.07 5.42 25.43 27.42 4.56 3.88
## [2,] 14.00 11.58 6.49 6.64 9.80 10.31 6.84 8.75
## [3,] 10.22 8.29 11.55 10.82 12.48 10.11 9.54 8.46
## [4,] 9.00 6.35 13.40 14.82 14.94 11.33 6.87 7.92
## [5,] 8.21 4.44 12.08 9.82 16.51 12.34 15.43 8.50
## [6,] 12.51 15.84 8.05 8.38 6.78 8.46 4.48 7.18
## SKBR3_1 SKBR3_2
## [1,] 8.46 5.64
## [2,] 11.91 13.67
## [3,] 9.10 9.43
## [4,] 8.54 6.82
## [5,] 7.93 4.75
## [6,] 13.27 15.05
## make a heatmap from the data
pheatmap(cancer_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")
##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them?
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?
#looks like 24hrs is different than the other cells lines. We can take a closer look at that in the next section, once we've done some further data manipulations
## first, make new columns that combine the two reps for each variable by averaging them
cancer_data2<-cancer_data %>% mutate(
mean_MCF10A= ((MCF10A_1 + MCF10A_2)/2),
mean_MCF7= ((MCF7_1 + MCF7_2)/2),
mean_MDA231= ((MDA231_1 + MDA231_2)/2),
mean_MDA468= ((MDA468_1 + MDA468_2)/2),
mean_SKBR3= ((SKBR3_1 + SKBR3_2)/2))
view(cancer_data2)
#did it create your new columns?
## then make some new columns that store the log ratios of the variable means you just created, as compared to the control
cancer_data2<-cancer_data2 %>% mutate(
log_MCF7=log2(mean_MCF7/mean_MCF10A),
log_MDA231=log2(mean_MDA231/mean_MCF10A),
log_MDA468=log2(mean_MDA468/mean_MCF10A),
log_SKBR3=log2(mean_SKBR3/mean_MCF10A))
colnames(cancer_data2)
## [1] "Gene_Symbol" "Description"
## [3] "Peptides" "MCF10A_1"
## [5] "MCF10A_2" "MCF7_1"
## [7] "MCF7_2" "MDA231_1"
## [9] "MDA231_2" "MDA468_1"
## [11] "MDA468_2" "SKBR3_1"
## [13] "SKBR3_2" "pvalue_MCF7_vs_MCF10A"
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A" "mean_MCF10A"
## [19] "mean_MCF7" "mean_MDA231"
## [21] "mean_MDA468" "mean_SKBR3"
## [23] "log_MCF7" "log_MDA231"
## [25] "log_MDA468" "log_SKBR3"
#did it create your new columns?
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's
cancer_mat2<-cancer_data2 %>%
select(log_MCF7:log_SKBR3)%>%
as.matrix() %>%
round(.,2)
pheatmap(cancer_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")
#Volcano plot
##BASIC VOLCANO PLOT
## Add a column that stores the negative log of the pvalue of interest
cancer_data2<-cancer_data2 %>%
mutate(neglog_MDA468=-log10(pvalue_MDA468_vs_MCF10A))
## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot<-cancer_data2 %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol))+
geom_point(alpha=0.7,color="royalblue")
#to view it, type:
volcano_plot
## BETTER VOLCANO PLOT
## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
cancer_data2<-cancer_data2 %>% mutate(significance=ifelse((log_MDA468>5 & neglog_MDA468>1.99),"UP", ifelse((log_MDA468<c(-5) & neglog_MDA468>1.99),"DOWN","NOT SIG")))
# Need 2-5 significant points in the cancer cells and to be displayed on the plot.
## Some standard colors
plain_cols3<-c("red","gray","blue")
## volcano plot as before with some added things
better_volcano_plot<-cancer_data2 %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol,color=significance))+
geom_point(alpha=0.7)+
scale_color_manual(values=plain_cols3)+
# Can be talked about in the report.
xlim(-10,10)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=14))+
theme(text = element_text(size=14))+
labs(x="Log ratio of relative gene expression in selected cell line compared to control cell line",y="-log(p-value)")
#to view it, type
better_volcano_plot
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).
#check viewer and / or plots to see it
ggplotly(better_volcano_plot)
GENES DOWN: APOA1 HLA-A SDPR KCTD12 GENES UP: SULT1A1 PADI2 EFHD1
# Make a pivot longer table of the C19 data for Control_2h_1:Virus_24h_2
cancer_long<-pivot_longer(cancer_data2, cols = c(mean_MCF10A:mean_SKBR3), names_to = 'variable')%>%
select(-c(pvalue_MCF7_vs_MCF10A:pvalue_SKBR3_vs_MCF10A))%>%
select(-c(Description:Peptides)) %>%
select(-c(log_MCF7:significance)) %>%
select(-c(MCF10A_1:SKBR3_2))
head(cancer_long)
Down regulated relative gene expression
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-cancer_long %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A" | Gene_Symbol=="SDPR"| Gene_Symbol=="KCTD12")
head(Examples_Down)
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
Example_plot_down<-Examples_Down %>%
ggplot(aes(x=factor(variable,levels=c('mean_MCF10A','mean_MCF7','mean_MDA231','mean_MDA468','mean_SKBR3')),y=value))+
geom_bar(stat="identity",fill="red")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=10))+
theme(text = element_text(size=14))+
theme(axis.text.x = element_text(angle=45, hjust=1))+
labs(x="sample",y="relative gene expression")
Example_plot_down
#check viewer and / or plots to see it
UP regulated relative gene expression
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_up<-cancer_long %>% filter(Gene_Symbol=="SULT1A1" | Gene_Symbol=="PADI2" | Gene_Symbol=="EFHD1")
head(Examples_up)
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
Example_plot_down<-Examples_up %>%
ggplot(aes(x=factor(variable,levels=c('mean_MCF10A','mean_MCF7','mean_MDA231','mean_MDA468','mean_SKBR3')),y=value))+
geom_bar(stat="identity",fill="royalblue")+
facet_wrap(~Gene_Symbol)+
theme_bw()+
theme(axis.text = element_text(colour = "black",size=10))+
theme(text = element_text(size=14))+
theme(axis.text.x = element_text(angle=45, hjust=1))+
labs(x="sample",y="relative gene expression")
Example_plot_down
#check viewer and / or plots to see it