Refer to the covid code as relevant code used.

library(reshape2)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(viridis)
## Loading required package: viridisLite
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
library(pheatmap)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ plotly::filter()      masks dplyr::filter(), stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(clipr)
## Welcome to clipr. See ?write_clip for advisories on writing to the clipboard in R.
library(tidyr)
library(Rcpp)

COLORS

mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)

LOAD DATA AND MAKE AN OVERALL HEATMAP

## load the dataset (cells treated with COVID)
cancer_data<-read_csv(file="breast_cancer_cells.csv")
## Rows: 5144 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Gene_Symbol, Description
## dbl (15): Peptides, MCF10A_1, MCF10A_2, MCF7_1, MCF7_2, MDA231_1, MDA231_2, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#and then view it 
view(cancer_data)

now let’s visualize the dataset and look for initial trends. We can do this by making a matrix so and then a heatmap to visualize the data

#make a matrix of just raw the data

cancer_mat1<-cancer_data %>% 
  select(MCF10A_1:SKBR3_2) %>%
  
  as.matrix() %>% 
  round(.,2)

# what does that look like?
head(cancer_mat1)
##      MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,]     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88
## [2,]    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75
## [3,]    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46
## [4,]     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92
## [5,]     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50
## [6,]    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18
##      SKBR3_1 SKBR3_2
## [1,]    8.46    5.64
## [2,]   11.91   13.67
## [3,]    9.10    9.43
## [4,]    8.54    6.82
## [5,]    7.93    4.75
## [6,]   13.27   15.05
## make a heatmap from the data

pheatmap(cancer_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? 
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?  

#looks like 24hrs is different than the other cells lines.  We can take a closer look at that in the next section, once we've done some further data manipulations

Data Manipulation

Control in the breast cancer: MCF10A cell line

## first, make new columns that combine the two reps for each variable by averaging them  

cancer_data2<-cancer_data %>% mutate(
  mean_MCF10A= ((MCF10A_1 + MCF10A_2)/2), 
  mean_MCF7= ((MCF7_1 + MCF7_2)/2), 
  mean_MDA231= ((MDA231_1 + MDA231_2)/2),
  mean_MDA468= ((MDA468_1 + MDA468_2)/2), 
  mean_SKBR3= ((SKBR3_1 + SKBR3_2)/2))
 
view(cancer_data2)
#did it create your new columns?
## then make some new columns that store the log ratios of the variable means you just created, as compared to the control

cancer_data2<-cancer_data2 %>% mutate(
  log_MCF7=log2(mean_MCF7/mean_MCF10A), 
  log_MDA231=log2(mean_MDA231/mean_MCF10A), 
  log_MDA468=log2(mean_MDA468/mean_MCF10A),
  log_SKBR3=log2(mean_SKBR3/mean_MCF10A)) 
 
colnames(cancer_data2)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_MCF10A"            
## [19] "mean_MCF7"               "mean_MDA231"            
## [21] "mean_MDA468"             "mean_SKBR3"             
## [23] "log_MCF7"                "log_MDA231"             
## [25] "log_MDA468"              "log_SKBR3"
#did it create your new columns?
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's

cancer_mat2<-cancer_data2 %>% 
  select(log_MCF7:log_SKBR3)%>% 
  as.matrix() %>% 
  round(.,2)

pheatmap(cancer_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

#Volcano plot

##BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest
cancer_data2<-cancer_data2 %>% 
  mutate(neglog_MDA468=-log10(pvalue_MDA468_vs_MCF10A))

## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot<-cancer_data2 %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="royalblue")
#to view it, type: 
volcano_plot

## BETTER VOLCANO PLOT

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
cancer_data2<-cancer_data2 %>% mutate(significance=ifelse((log_MDA468>5 & neglog_MDA468>1.99),"UP", ifelse((log_MDA468<c(-5) & neglog_MDA468>1.99),"DOWN","NOT SIG")))
# Need 2-5 significant points in the cancer cells and to be displayed on the plot.
## Some standard colors
plain_cols3<-c("red","gray","blue")

## volcano plot as before with some added things
better_volcano_plot<-cancer_data2 %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  # Can be talked about in the report.
  xlim(-10,10)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="Log ratio of relative gene expression in selected cell line compared to control cell line",y="-log(p-value)")
  
#to view it, type
better_volcano_plot
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).

#check viewer and / or plots to see it 
ggplotly(better_volcano_plot)

GENES DOWN: APOA1 HLA-A SDPR KCTD12 GENES UP: SULT1A1 PADI2 EFHD1

# Make a pivot longer table of the C19 data for Control_2h_1:Virus_24h_2
cancer_long<-pivot_longer(cancer_data2, cols = c(mean_MCF10A:mean_SKBR3), names_to = 'variable')%>% 
  select(-c(pvalue_MCF7_vs_MCF10A:pvalue_SKBR3_vs_MCF10A))%>%
  select(-c(Description:Peptides)) %>% 
  select(-c(log_MCF7:significance)) %>% 
  select(-c(MCF10A_1:SKBR3_2))

head(cancer_long)

Bar plots made with Facet wrap function

Down regulated relative gene expression

## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-cancer_long %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A" | Gene_Symbol=="SDPR"| Gene_Symbol=="KCTD12") 

head(Examples_Down)
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)

Example_plot_down<-Examples_Down %>% 
  ggplot(aes(x=factor(variable,levels=c('mean_MCF10A','mean_MCF7','mean_MDA231','mean_MDA468','mean_SKBR3')),y=value))+ 
  geom_bar(stat="identity",fill="red")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative gene expression")
  
Example_plot_down

#check viewer and / or plots to see it 

UP regulated relative gene expression

## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_up<-cancer_long %>% filter(Gene_Symbol=="SULT1A1" | Gene_Symbol=="PADI2" | Gene_Symbol=="EFHD1") 

head(Examples_up)
## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)

Example_plot_down<-Examples_up %>% 
  ggplot(aes(x=factor(variable,levels=c('mean_MCF10A','mean_MCF7','mean_MDA231','mean_MDA468','mean_SKBR3')),y=value))+ 
  geom_bar(stat="identity",fill="royalblue")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative gene expression")
  
Example_plot_down

#check viewer and / or plots to see it