library(NanoStringNorm)
library(nanostringr)
library(ggplot2)
library(tidyr)
library(tidyverse)
library(gplots)
library(heatmaply)
library(tibble)
library(readr)


files.dir <-
  read_rcc(path = "C:/Users/Charles/OneDrive/Desktop/PFM/DS_Case_Study_Data/")
ds_pmf_files <-
  system.file("files.dir", "GSM2055823_01_4353_PD_mRNA.RCC", package = "nanostringr")


#read_rcc is used to extract multiple files from the folder

ds_pmf_files <-
  read_rcc(path = "C:/Users/Charles/OneDrive/Desktop/PFM/DS_Case_Study_Data/")
#ds_pfm_files is list with the raw data and attributes combined, to extract the raw data do ds_pmf$raw



gene.counts <- ds_pmf_files$raw

gene.attributes <- ds_pmf_files$exp


## clean the gene count data to use:

ds.count <- ds_pmf_files$raw



## Task 1. Assessing quality control by heatmap

#subset data to remain with controls
# data used for heat map is clean.controls

controls <-
  ds.count %>% filter(Code.Class %in% c("Positive", "Negative"))

#remove the annotations in col 1 and 3
# remove the (0*) number string in the control names

clean.controls <- controls[,-c(1, 3)]

clean.controls$Name <- gsub("[(0-9.)]", "", clean.controls$Name)




#make a matrix of the cleaned controls and transpose the matrix
controls <- clean.controls
rownames(controls) <- clean.controls$Name
controls.matrix <- data.matrix(controls)
controls.matrix <- t(controls.matrix)

# remove first row of the matrix and recreate the heat map
controls.matrix <- controls.matrix[-1,]
controls.matrix
##                            POS_C POS_A POS_F POS_D POS_B POS_E NEG_C NEG_D
## GSM2055823_01_4353_PD_mRNA  5597 60232    72  1321 17943   269     1     6
## GSM2055824_02_4355_PD_mRNA  6110 68027   108  1435 20662   286     1     4
## GSM2055825_03_3366_PD_mRNA  5506 62583    92  1406 19037   287     4     4
## GSM2055826_04_4078_PD_mRNA  5590 61885    78  1312 18853   266     2    10
## GSM2055827_05_4846_PD_mRNA  6183 68191    88  1497 20344   293     7    10
## GSM2055828_06_3746_PD_mRNA  5223 59144    78  1343 17895   249    11     3
## GSM2055829_07_3760_PD_mRNA  5746 65328    86  1451 20806   250     4     4
## GSM2055830_08_3790_PD_mRNA  5605 64831   102  1363 19949   243     4     4
## GSM2055831_09_4436_PD_mRNA  5413 64079    98  1444 19799   253     8     4
## GSM2055832_10_4050_PD_mRNA  5780 66646    95  1519 20289   245     4     1
##                            NEG_E NEG_A NEG_H NEG_G NEG_F NEG_B
## GSM2055823_01_4353_PD_mRNA    20    20     9     3    14    10
## GSM2055824_02_4355_PD_mRNA    19    20     8     6    15     5
## GSM2055825_03_3366_PD_mRNA    48    21     6     4    15    13
## GSM2055826_04_4078_PD_mRNA    26    18     9     4    13    10
## GSM2055827_05_4846_PD_mRNA    25    14     8     5    13     7
## GSM2055828_06_3746_PD_mRNA    79    17     6     2    12     6
## GSM2055829_07_3760_PD_mRNA    18    21     8     2    14     9
## GSM2055830_08_3790_PD_mRNA    20    12    10     4    15     3
## GSM2055831_09_4436_PD_mRNA    29    15     8     5    13     5
## GSM2055832_10_4050_PD_mRNA    23    16     9     8    16    10
# change the colors of the heatmap to be more friendly to color blind people
#use red for high percentages and green for low percentages
rb <- colorRampPalette(c("green", "white", "red"))


### Heatmap showing positive and negative control genes in columns and samples in rows.

#normalize the scale of data in the control counts
# Set heatmap colors to be more friendly to color blind people
#red for high percentages and green for low percentages

heatmaply(
  normalize(controls.matrix),
  # Using the Min-Max normalization function
  xlab = "Positive and Negative Controls",
  ylab = "Samples",
  main = "Control Genes",
  Rowv = FALSE,
  Colv = FALSE,
  colors = rb,
  fontsize_row = 8,
  fontsize_col = 8
)
# Task 2

# read in the metadata/annotations
case_study_annotations <- read_csv("case_study_annotations.csv")

#normalize the counts by the housekeeping genes and controls
#specify the housekeeping genes

gene.counts[gene.counts$Name %in%
              c('PSMB2', 'ACTB', 'CHMP2A', 'C1orf43'), 'Code.Class'] <-
  'Housekeeping'

gene.counts <- NanoStringNorm(
  x = gene.counts,
  anno = NA,
  CodeCount = 'geo.mean',
  Background = 'mean',
  SampleContent = 'housekeeping.geo.mean',
  round.values = TRUE,
  take.log = TRUE,
  return.matrix.of.endogenous.probes = TRUE
)
## 
## ##############################
## ### NanoStringNorm v1.2.1.1 ###
## ##############################
## 
## There are 10 samples and 130 Endogenous genes 
## 
## Background: After correction 10 samples and 128 
##  Endogenous genes have less than 90% missing. 
## 
## log: Setting values less than 1 to 1 in order to calculate the log in positive space.
# prepare gene counts and treatment or timepoints conditions

gene.counts <- data.frame(gene.counts)

# define the timepoints
Baseline <-
  c(
    "GSM2055823_01_4353_PD_mRNA",
    "GSM2055825_03_3366_PD_mRNA",
    "GSM2055827_05_4846_PD_mRNA",
    "GSM2055829_07_3760_PD_mRNA",
    "GSM2055831_09_4436_PD_mRNA"
  )
Post_Treatment <-
  c(
    "GSM2055824_02_4355_PD_mRNA",
    "GSM2055826_04_4078_PD_mRNA",
    "GSM2055828_06_3746_PD_mRNA",
    "GSM2055830_08_3790_PD_mRNA",
    "GSM2055832_10_4050_PD_mRNA"
  )


#filter for genes of interest and plot

MCL1 <- gene.counts["MCL1",] %>%
  gather() %>%
  mutate(category = ifelse(key %in% Baseline, "Baseline", "Post_treatment"))

MCL1
##                           key value       category
## 1  GSM2055823_01_4353_PD_mRNA 12.11       Baseline
## 2  GSM2055824_02_4355_PD_mRNA 11.28 Post_treatment
## 3  GSM2055825_03_3366_PD_mRNA 12.20       Baseline
## 4  GSM2055826_04_4078_PD_mRNA 11.31 Post_treatment
## 5  GSM2055827_05_4846_PD_mRNA 11.83       Baseline
## 6  GSM2055828_06_3746_PD_mRNA 12.74 Post_treatment
## 7  GSM2055829_07_3760_PD_mRNA 11.93       Baseline
## 8  GSM2055830_08_3790_PD_mRNA 12.12 Post_treatment
## 9  GSM2055831_09_4436_PD_mRNA 12.11       Baseline
## 10 GSM2055832_10_4050_PD_mRNA 10.96 Post_treatment
CXCL1 <- gene.counts["CXCL1",] %>%
  gather() %>%
  mutate(category = ifelse(key %in% Baseline, "Baseline", "Post_treatment"))

CXCL1
##                           key value       category
## 1  GSM2055823_01_4353_PD_mRNA 7.721       Baseline
## 2  GSM2055824_02_4355_PD_mRNA 6.022 Post_treatment
## 3  GSM2055825_03_3366_PD_mRNA 7.852       Baseline
## 4  GSM2055826_04_4078_PD_mRNA 5.555 Post_treatment
## 5  GSM2055827_05_4846_PD_mRNA 7.150       Baseline
## 6  GSM2055828_06_3746_PD_mRNA 9.255 Post_treatment
## 7  GSM2055829_07_3760_PD_mRNA 8.409       Baseline
## 8  GSM2055830_08_3790_PD_mRNA 5.700 Post_treatment
## 9  GSM2055831_09_4436_PD_mRNA 7.451       Baseline
## 10 GSM2055832_10_4050_PD_mRNA 6.000 Post_treatment
### Box plots of MCL1 and CXCL1 respectively

#jitter has been used to adds a small amount of random variation of each data point
qplot(
  x = category,
  y = value,
  data = MCL1,
  geom = c("boxplot", "jitter"),
  fill = category,
  xlab = "Time Points",
  main = "Differerence in Timepoint for MCL1 gene across samples"
) +
  labs(fill = "Timepoint & Sample Overlay")

qplot(
  x = category,
  y = value,
  data = CXCL1,
  geom = c("boxplot", "jitter"),
  fill = category,
  xlab = "Time Points",
  main = "Differerence in Timepoint for CXCL1 gene across samples"
) +
  labs(fill = "Timepoint & Sample Overlay")