mycolors<-c(viridis(15))
felix_cols<-mycolors[c(5,2)]
felix_4cols<-mycolors[c(15,10,8,2)]
plain_cols1<-c("blue","gray")
plain_cols2<-c("red","gray")

pats_cols<-colorRampPalette(c("#FDE725FF", "white","#440154FF"))(21)
leos_cols<-colorRampPalette(c("white","blue"))(10)
## load the dataset 
BCC<-read.csv("breast_cancer_cells.csv")

#and then view it 
view(BCC)
#make a matrix of just raw the data
BCC_MAT<-BCC%>% select(MCF10A_1:SKBR3_2)%>% round(.,02)


# what does that look like?
head(BCC_MAT)
##   MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2 SKBR3_1
## 1     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88    8.46
## 2    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75   11.91
## 3    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46    9.10
## 4     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92    8.54
## 5     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50    7.93
## 6    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18   13.27
##   SKBR3_2
## 1    5.64
## 2   13.67
## 3    9.43
## 4    6.82
## 5    4.75
## 6   15.05
##   MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2 SKBR3_1
## 1     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88    8.46
## 2    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75   11.91
## 3    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46    9.10
## 4     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92    8.54
## 5     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50    7.93
## 6    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18   13.27
##   SKBR3_2
## 1    5.64
## 2   13.67
## 3    9.43
## 4    6.82
## 5    4.75
## 6   15.05
## make a heatmap from the data
pheatmap(BCC_MAT, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

##INTERPRETATION##
## What can you see in this figure?(gene expression levels across different breast cancer cell lines) are the repeated measures/reps similar or different?(the repeated measures are similar) What does this say about the precision and accuracy of them? (accuracy cant be determined but similarity indicates high precision)
##How does the control compare to the variables? (the control line is distinct (MCF10A)) Is this what you might expect? Why?(yes,because normal and cancer cells have different gene expressions) What would you look for in the literature to support this idea? (research papers and studies showing the origin and details of these cell lines)

#looks like MCF10A line  is different than the other cells lines.  We will take a closer look at that in the next section, once we've done some further data manipulations
## first, make new columns that combine the two reps for each variable by averaging them  
BCC.2<- BCC %>% mutate(mean_MCF10A= ((MCF10A_1+MCF10A_2)/2),mean_MCF7= ((MCF7_1+MCF7_2)/2),mean_MDA231= ((MDA231_1+MDA231_2)/2),mean_MDA468= ((MDA468_1+MDA468_2)/2),mean_SKBR3= ((SKBR3_1+SKBR3_2)/2))



#did it create your new columns? YES
head(BCC.2)
##   Gene_Symbol
## 1         NES
## 2      ZC3HC1
## 3   CAMSAP1L1
## 4      COBRA1
## 5       HAUS6
## 6      CHCHD4
##                                                                                                Description
## 1                                                                  Nestin OS=Homo sapiens GN=NES PE=1 SV=2
## 2                                Isoform 2 of Nuclear-interacting partner of ALK OS=Homo sapiens GN=ZC3HC1
## 3             Isoform 2 of Calmodulin-regulated spectrin-associated protein 2 OS=Homo sapiens GN=CAMSAP1L1
## 4                                         Negative elongation factor B OS=Homo sapiens GN=COBRA1 PE=1 SV=1
## 5                                 Isoform 3 of HAUS augmin-like complex subunit 6 OS=Homo sapiens GN=HAUS6
## 6  Isoform 2 of Mitochondrial intermembrane space import and assembly protein 40 OS=Homo sapiens GN=CHCHD4
##   Peptides MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## 1        7     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88
## 2        2    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75
## 3        5    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46
## 4        3     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92
## 5        5     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50
## 6        5    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18
##   SKBR3_1 SKBR3_2 pvalue_MCF7_vs_MCF10A pvalue_MDA231_vs_MCF10A
## 1    8.46    5.64            0.54152658              0.01849512
## 2   11.91   13.67            0.03589005              0.15732968
## 3    9.10    9.43            0.20243066              0.31440425
## 4    8.54    6.82            0.05045427              0.13484982
## 5    7.93    4.75            0.16972788              0.10216410
## 6   13.27   15.05            0.07037866              0.07222737
##   pvalue_MDA468_vs_MCF10A pvalue_SKBR3_vs_MCF10A mean_MCF10A mean_MCF7
## 1              0.37423781              0.9983818       7.060     5.245
## 2              0.08358401              0.9980974      12.790     6.565
## 3              0.83825377              0.9967306       9.255    11.185
## 4              0.86215724              0.9958838       7.675    14.110
## 5              0.28867772              0.9957949       6.325    10.950
## 6              0.06005626              0.9948536      14.175     8.215
##   mean_MDA231 mean_MDA468 mean_SKBR3
## 1      26.425       4.220      7.050
## 2      10.055       7.795     12.790
## 3      11.295       9.000      9.265
## 4      13.135       7.395      7.680
## 5      14.425      11.965      6.340
## 6       7.620       5.830     14.160
##   Gene_Symbol
## 1         NES
## 2      ZC3HC1
## 3   CAMSAP1L1
## 4      COBRA1
## 5       HAUS6
## 6      CHCHD4
##                                                                                                Description
## 1                                                                  Nestin OS=Homo sapiens GN=NES PE=1 SV=2
## 2                                Isoform 2 of Nuclear-interacting partner of ALK OS=Homo sapiens GN=ZC3HC1
## 3             Isoform 2 of Calmodulin-regulated spectrin-associated protein 2 OS=Homo sapiens GN=CAMSAP1L1
## 4                                         Negative elongation factor B OS=Homo sapiens GN=COBRA1 PE=1 SV=1
## 5                                 Isoform 3 of HAUS augmin-like complex subunit 6 OS=Homo sapiens GN=HAUS6
## 6  Isoform 2 of Mitochondrial intermembrane space import and assembly protein 40 OS=Homo sapiens GN=CHCHD4
##   Peptides MCF10A_1 MCF10A_2 MCF7_1 MCF7_2 MDA231_1 MDA231_2 MDA468_1 MDA468_2
## 1        7     9.54     4.58   5.07   5.42    25.43    27.42     4.56     3.88
## 2        2    14.00    11.58   6.49   6.64     9.80    10.31     6.84     8.75
## 3        5    10.22     8.29  11.55  10.82    12.48    10.11     9.54     8.46
## 4        3     9.00     6.35  13.40  14.82    14.94    11.33     6.87     7.92
## 5        5     8.21     4.44  12.08   9.82    16.51    12.34    15.43     8.50
## 6        5    12.51    15.84   8.05   8.38     6.78     8.46     4.48     7.18
##   SKBR3_1 SKBR3_2 pvalue_MCF7_vs_MCF10A pvalue_MDA231_vs_MCF10A
## 1    8.46    5.64            0.54152658              0.01849512
## 2   11.91   13.67            0.03589005              0.15732968
## 3    9.10    9.43            0.20243066              0.31440425
## 4    8.54    6.82            0.05045427              0.13484982
## 5    7.93    4.75            0.16972788              0.10216410
## 6   13.27   15.05            0.07037866              0.07222737
##   pvalue_MDA468_vs_MCF10A pvalue_SKBR3_vs_MCF10A mean_MCF10A mean_MCF7
## 1              0.37423781              0.9983818       7.060     5.245
## 2              0.08358401              0.9980974      12.790     6.565
## 3              0.83825377              0.9967306       9.255    11.185
## 4              0.86215724              0.9958838       7.675    14.110
## 5              0.28867772              0.9957949       6.325    10.950
## 6              0.06005626              0.9948536      14.175     8.215
##   mean_MDA231 mean_MDA468 mean_SKBR3
## 1      26.425       4.220      7.050
## 2      10.055       7.795     12.790
## 3      11.295       9.000      9.265
## 4      13.135       7.395      7.680
## 5      14.425      11.965      6.340
## 6       7.620       5.830     14.160
## then make some new columns that store the log ratios of the variable means you just created, as compared to the control
BCC.2<-BCC.2 %>% mutate(log_MCF7=log2(mean_MCF7/mean_MCF10A),log_MDA231=log2(mean_MDA231/mean_MCF10A),log_MDA468=log2(mean_MDA468/mean_MCF10A),log_SKBR3=log2(mean_SKBR3/mean_MCF10A))

colnames(BCC.2)
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_MCF10A"            
## [19] "mean_MCF7"               "mean_MDA231"            
## [21] "mean_MDA468"             "mean_SKBR3"             
## [23] "log_MCF7"                "log_MDA231"             
## [25] "log_MDA468"              "log_SKBR3"
##  [1] "Gene_Symbol"             "Description"            
##  [3] "Peptides"                "MCF10A_1"               
##  [5] "MCF10A_2"                "MCF7_1"                 
##  [7] "MCF7_2"                  "MDA231_1"               
##  [9] "MDA231_2"                "MDA468_1"               
## [11] "MDA468_2"                "SKBR3_1"                
## [13] "SKBR3_2"                 "pvalue_MCF7_vs_MCF10A"  
## [15] "pvalue_MDA231_vs_MCF10A" "pvalue_MDA468_vs_MCF10A"
## [17] "pvalue_SKBR3_vs_MCF10A"  "mean_MCF10A"            
## [19] "mean_MCF7"               "mean_MDA231"            
## [21] "mean_MDA468"             "mean_SKBR3"             
## [23] "log_MCF7"                "log_MDA231"             
## [25] "log_MDA468"              "log_SKBR3"


#did it create your new columns? YES
## ALTERNATIVE CHOICE: make a matrix of just log values the data and a heat map of that. How do the two heat maps compare? This is not possible if your values contain 0's
BCC_MAT2<-BCC.2 %>% select(log_MCF7:log_SKBR3) %>% as.matrix() %>% round(.,2)
pheatmap(BCC_MAT2, color=pats_cols,cellwidth=30,cellheight=.03,cluster_cols=FALSE,cluster_rows=TRUE,legend=TRUE,fontsize = 7,scale="column")

##BASIC VOLCANO PLOT

## Add a column that stores the negative log of the pvalue of interest (in this case, SKBR3)
BCC.2<-BCC.2%>% mutate(neglog_SKBR3=-log10(pvalue_SKBR3_vs_MCF10A))

## Use ggplot to plot the log ratio of SKBR3 against the -log p-value of SKBR3
volcano_plot<-BCC.2%>%ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,Description=Gene_Symbol))+geom_point(alpha=0.7,color="royalblue")

#to view it, type: 
volcano_plot

## BETTER VOLCANO PLOT
BCC.2<-BCC.2%>%mutate(significance=ifelse((log_SKBR3>2&neglog_SKBR3>2.99),"UP",ifelse((log_SKBR3<c(-2)&neglog_SKBR3>2.99),"DOWN","NOT SIG")))
plain_cols3<-c("red","gray","blue")
better_volvano_plot<-BCC.2%>%ggplot(aes(x=log_SKBR3,y=neglog_SKBR3,Description=Gene_Symbol,color=significance))+geom_point(alpha=0.7)+scale_color_manual(values=plain_cols3)+xlim(-6,6)+theme_bw()+theme(axis.text = element_text(colour = "black",size = 14))+theme(text=element_text(size=14))+labs(x="log_SKBR3 compared to MCF10A",y="-log(p-value)")
better_volvano_plot
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Define the significant ones (by using ifelse and setting # parameters) so they can be colored
better_volcano_plot <- BCC.2 %>%
  ggplot(aes(x = log_SKBR3,
             y = neglog_SKBR3,
             color = significance,
             text = Gene_Symbol)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = plain_cols3) +
  xlim(-6,6) +
  theme_bw() +
  labs(x = "log_SKBR3 compared to MCF10A",
       y = "-log(p-value)")
ggplotly(better_volcano_plot, tooltip = "text")
## Some standard colors


## volcano plot as before with some added things





#to view it, type


#check viewer and / or plots to see it
## use ggplotly to see hover over the points to see the gene names. Record these for the next step 
ggplotly(better_volvano_plot)
##INTERPRETATION##
## Significant up regulated are: TDP2, KRT23 and GCLC
## Significant down regulated are:APOA1,HLA-A and MYO1B 
#Why? How?
# Make a pivot longer table of the C19 data for Control_2h_1:Virus_24h_2
BCC_long<-pivot_longer(BCC.2,cols = c(MCF10A_1:SKBR3_2), names_to ='variable')%>% select(-c(pvalue_MCF7_vs_MCF10A:significance))
head(BCC_long)
## # A tibble: 6 × 5
##   Gene_Symbol Description                                Peptides variable value
##   <chr>       <chr>                                         <int> <chr>    <dbl>
## 1 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MCF10A_1  9.54
## 2 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MCF10A_2  4.58
## 3 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MCF7_1    5.07
## 4 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MCF7_2    5.42
## 5 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MDA231_1 25.4 
## 6 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MDA231_2 27.4
## # A tibble: 6 × 5
##   Gene_Symbol Description                                Peptides variable value
##   <chr>       <chr>                                         <int> <chr>    <dbl>
## 1 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MCF10A_1  9.54
## 2 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MCF10A_2  4.58
## 3 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MCF7_1    5.07
## 4 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MCF7_2    5.42
## 5 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MDA231_1 25.4 
## 6 NES         " Nestin OS=Homo sapiens GN=NES PE=1 SV=2"        7 MDA231_2 27.4
## based on the gene symbols from plotly, select a few proteins from the table. Select just the data and gene symbols and pivot longer
Examples_Down<-BCC_long%>%filter(Gene_Symbol=="APOA1"|Gene_Symbol=="HLA-A;MYO1B"|Gene_Symbol=="HMGN5")
head(Examples_Down)
## # A tibble: 6 × 5
##   Gene_Symbol Description                                Peptides variable value
##   <chr>       <chr>                                         <int> <chr>    <dbl>
## 1 HMGN5       " High mobility group nucleosome-binding …        4 MCF10A_1 24.0 
## 2 HMGN5       " High mobility group nucleosome-binding …        4 MCF10A_2 24.8 
## 3 HMGN5       " High mobility group nucleosome-binding …        4 MCF7_1   11.0 
## 4 HMGN5       " High mobility group nucleosome-binding …        4 MCF7_2   11.0 
## 5 HMGN5       " High mobility group nucleosome-binding …        4 MDA231_1  4.62
## 6 HMGN5       " High mobility group nucleosome-binding …        4 MDA231_2  5.69
## # A tibble: 6 × 5
##   Gene_Symbol Description                                Peptides variable value
##   <chr>       <chr>                                         <int> <chr>    <dbl>
## 1 HMGN5       " High mobility group nucleosome-binding …        4 MCF10A_1 24.0 
## 2 HMGN5       " High mobility group nucleosome-binding …        4 MCF10A_2 24.8 
## 3 HMGN5       " High mobility group nucleosome-binding …        4 MCF7_1   11.0 
## 4 HMGN5       " High mobility group nucleosome-binding …        4 MCF7_2   11.0 
## 5 HMGN5       " High mobility group nucleosome-binding …        4 MDA231_1  4.62
## 6 HMGN5       " High mobility group nucleosome-binding …        4 MDA231_2  5.69


## make barplots facetted by Gene Symbol (when working with other data sets - change x=order to x = variable)
Example_plot_down<-Examples_Down%>%ggplot(aes(x=factor(variable,levels = c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDAA468_1','MDA468_2','SKBR3_1','SKBR3_2')),y= value))+geom_bar(stat = "identity",fill="red")+facet_wrap(~Gene_Symbol)+theme_bw()+theme(axis.text = element_text(colour="black",size=10))+theme(text= element_text(size=14))+theme(axis.text.x = element_text(angle=45, hjust = 1))+labs(x="sample",y="relative intensity")
Example_plot_down

## Same process for the upregulated ones
Examples_up<-BCC_long %>% filter(Gene_Symbol=="FNBP1L"|Gene_Symbol=="KRT23")
head(Examples_up)
## # A tibble: 6 × 5
##   Gene_Symbol Description                                Peptides variable value
##   <chr>       <chr>                                         <int> <chr>    <dbl>
## 1 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MCF10A_1  2.39
## 2 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MCF10A_2  2.49
## 3 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MCF7_1    4.03
## 4 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MCF7_2    5.53
## 5 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MDA231_1  2.89
## 6 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MDA231_2  4.76
## # A tibble: 6 × 5
##   Gene_Symbol Description                                Peptides variable value
##   <chr>       <chr>                                         <int> <chr>    <dbl>
## 1 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MCF10A_1  2.39
## 2 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MCF10A_2  2.49
## 3 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MCF7_1    4.03
## 4 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MCF7_2    5.53
## 5 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MDA231_1  2.89
## 6 KRT23       " Keratin, type I cytoskeletal 23 OS=Homo…        3 MDA231_2  4.76
Example_plot_up<-Examples_up %>%ggplot(aes(x=factor(variable,levels = c('MCF10A_1','MCF10A_2','MCF7_1','MCF7_2','MDA231_1','MDA231_2','MDAA468_1','MDA468_2','SKBR3_1','SKBR3_2')),y= value))+geom_bar(stat = "identity",fill="royalblue")+facet_wrap(~Gene_Symbol)+theme_bw()+theme(axis.text = element_text(colour="black",size=10))+theme(text= element_text(size=14))+theme(axis.text.x = element_text(angle=45, hjust = 1))+labs(x="sample",y="relative intensity") 
Example_plot_up