Refer to the covid code as relevant code used.

library(reshape2)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(viridis)

## Loading required package: viridisLite

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt

library(pheatmap)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ plotly::filter()      masks dplyr::filter(), stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggthemes)
library(clipr)

## Welcome to clipr. See ?write_clip for advisories on writing to the clipboard in R.

library(tidyr)
library(Rcpp)

COLORS

mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)

LOAD DATA AND MAKE AN OVERALL HEATMAP

## load the dataset (cells treated with COVID)
cancer_data<-read_csv(file="breast_cancer_cells.csv")

## Rows: 5144 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Gene_Symbol, Description
## dbl (15): Peptides, MCF10A_1, MCF10A_2, MCF7_1, MCF7_2, MDA231_1, MDA231_2, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#and then view it 
view(cancer_data)

now let’s visualize the dataset and look for initial trends. We can do this by making a matrix so and then a heatmap to visualize the data

#make a matrix of just raw the data

cancer_mat1<-cancer_data %>% 
  select(MCF10A_1:SKBR3_2) %>%
  
  as.matrix() %>% 
  round(.,2)

# what does that look like?
head(cancer_mat1)

##      MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## [1,]     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88
## [2,]    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75
## [3,]    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46
## [4,]     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92
## [5,]     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50
## [6,]    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18
##      SKBR3_1 SKBR3_2
## [1,]    8.46    5.64
## [2,]   11.91   13.67
## [3,]    9.10    9.43
## [4,]    8.54    6.82
## [5,]    7.93    4.75
## [6,]   13.27   15.05

## make a heatmap from the data

pheatmap(cancer_mat1, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

##INTERPRETATION##
## What can you see in this figure? are the repeated measures/reps similar or different? What does this say about the precision and accuracy of them? 
##How does the control compare to the variables? Is this what you might expect? Why? What would you look for in the literature to support this idea?  

#looks like 24hrs is different than the other cells lines.  We can take a closer look at that in the next section, once we've done some further data manipulations

Data Manipulation

Control in the breast cancer: MCF10A cell line

## first, make new columns that combine the two reps for each variable by averaging them  

cancer_data2<-cancer_data %>% mutate(
  mean_MCF10A= ((MCF10A_1 + MCF10A_2)/2), 
  mean_MCF7= ((MCF7_1 + MCF7_2)/2), 
  mean_MDA231= ((MDA231_1 + MDA231_2)/2),
  mean_MDA468= ((MDA468_1 + MDA468_2)/2), 
  mean_SKBR3= ((SKBR3_1 + SKBR3_2)/2))
 
view(cancer_data2)
#did it create your new columns?

## then make some new columns that store the log ratios of the variable means you just created, as compared to the control

cancer_data2<-cancer_data2 %>% mutate(
  log_MCF7=log2(mean_MCF7/mean_MCF10A), 
  log_MDA231=log2(mean_MDA231/mean_MCF10A), 
  log_MDA468=log2(mean_MDA468/mean_MCF10A),
  log_SKBR3=log2(mean_SKBR3/mean_MCF10A)) 
 
colnames(cancer_data2)

##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_MCF10A"            
## [19] "mean_MCF7"               "mean_MDA231"            
## [21] "mean_MDA468"             "mean_SKBR3"             
## [23] "log_MCF7"                "log_MDA231"             
## [25] "log_MDA468"              "log_SKBR3"

#did it create your new columns?

## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's

cancer_mat2<-cancer_data2 %>% 
  select(log_MCF7:log_SKBR3)%>% 
  as.matrix() %>% 
  round(.,2)

pheatmap(cancer_mat2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

#Volcano plot

##BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest
cancer_data2<-cancer_data2 %>% 
  mutate(neglog_MDA468=-log10(pvalue_MDA468_vs_MCF10A))

## Use ggplot to plot the log ratio at 24 hours against the -log p-value at 24 hours
volcano_plot<-cancer_data2 %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol))+
  geom_point(alpha=0.7,color="royalblue")
#to view it, type: 
volcano_plot

## BETTER VOLCANO PLOT

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
cancer_data2<-cancer_data2 %>% mutate(significance=ifelse((log_MDA468>5 & neglog_MDA468>1.99),"UP", ifelse((log_MDA468<c(-5) & neglog_MDA468>1.99),"DOWN","NOT SIG")))
# Need 2-5 significant points in the cancer cells and to be displayed on the plot.
## Some standard colors
plain_cols3<-c("red","gray","blue")

## volcano plot as before with some added things
better_volcano_plot<-cancer_data2 %>% ggplot(aes(x=log_MDA468,y=neglog_MDA468,description=Gene_Symbol,color=significance))+
  geom_point(alpha=0.7)+
  scale_color_manual(values=plain_cols3)+
  # Can be talked about in the report.
  xlim(-10,10)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=14))+
  theme(text = element_text(size=14))+
  labs(x="Log ratio of relative gene expression in selected cell line compared to control cell line",y="-log(p-value)")
  
#to view it, type
better_volcano_plot

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_point()`).

#check viewer and / or plots to see it 
ggplotly(better_volcano_plot)

GENES DOWN: APOA1 HLA-A SDPR KCTD12 GENES UP: SULT1A1 PADI2 EFHD1

# Make a pivot longer table of the C19 data for Control_2h_1:Virus_24h_2
cancer_long<-pivot_longer(cancer_data2, cols = c(mean_MCF10A:mean_SKBR3), names_to = 'variable')%>% 
  select(-c(pvalue_MCF7_vs_MCF10A:pvalue_SKBR3_vs_MCF10A))%>%
  select(-c(Description:Peptides)) %>% 
  select(-c(log_MCF7:significance)) %>% 
  select(-c(MCF10A_1:SKBR3_2))

head(cancer_long)

Bar plots made with Facet wrap function

Down regulated relative gene expression

## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-cancer_long %>% filter(Gene_Symbol=="APOA1" | Gene_Symbol=="HLA-A" | Gene_Symbol=="SDPR"| Gene_Symbol=="KCTD12") 

head(Examples_Down)

## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)

Example_plot_down<-Examples_Down %>% 
  ggplot(aes(x=factor(variable,levels=c('mean_MCF10A','mean_MCF7','mean_MDA231','mean_MDA468','mean_SKBR3')),y=value))+ 
  geom_bar(stat="identity",fill="red")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative gene expression")
  
Example_plot_down

#check viewer and / or plots to see it

UP regulated relative gene expression

## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_up<-cancer_long %>% filter(Gene_Symbol=="SULT1A1" | Gene_Symbol=="PADI2" | Gene_Symbol=="EFHD1") 

head(Examples_up)

## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)

Example_plot_down<-Examples_up %>% 
  ggplot(aes(x=factor(variable,levels=c('mean_MCF10A','mean_MCF7','mean_MDA231','mean_MDA468','mean_SKBR3')),y=value))+ 
  geom_bar(stat="identity",fill="royalblue")+
  facet_wrap(~Gene_Symbol)+
  theme_bw()+
  theme(axis.text = element_text(colour = "black",size=10))+
  theme(text = element_text(size=14))+
  theme(axis.text.x = element_text(angle=45, hjust=1))+
  labs(x="sample",y="relative gene expression")
  
Example_plot_down

#check viewer and / or plots to see it

Breast_Cancer_cells

Victor Singh

2025-03-11

Refer to the covid code as relevant code used.

COLORS

LOAD DATA AND MAKE AN OVERALL HEATMAP

now let’s visualize the dataset and look for initial trends. We can do this by making a matrix so and then a heatmap to visualize the data

Data Manipulation

Control in the breast cancer: MCF10A cell line

Bar plots made with Facet wrap function