library(NanoStringNorm)
library(nanostringr)
library(ggplot2)
library(tidyr)
library(tidyverse)
library(gplots)
library(heatmaply)
library(tibble)
library(readr)
files.dir <-
read_rcc(path = "C:/Users/Charles/OneDrive/Desktop/PFM/DS_Case_Study_Data/")
ds_pmf_files <-
system.file("files.dir", "GSM2055823_01_4353_PD_mRNA.RCC", package = "nanostringr")
#read_rcc is used to extract multiple files from the folder
ds_pmf_files <-
read_rcc(path = "C:/Users/Charles/OneDrive/Desktop/PFM/DS_Case_Study_Data/")
#ds_pfm_files is list with the raw data and attributes combined, to extract the raw data do ds_pmf$raw
gene.counts <- ds_pmf_files$raw
gene.attributes <- ds_pmf_files$exp
## clean the gene count data to use:
ds.count <- ds_pmf_files$raw
## Task 1. Assessing quality control by heatmap
#subset data to remain with controls
# data used for heat map is clean.controls
controls <-
ds.count %>% filter(Code.Class %in% c("Positive", "Negative"))
#remove the annotations in col 1 and 3
# remove the (0*) number string in the control names
clean.controls <- controls[,-c(1, 3)]
clean.controls$Name <- gsub("[(0-9.)]", "", clean.controls$Name)
#make a matrix of the cleaned controls and transpose the matrix
controls <- clean.controls
rownames(controls) <- clean.controls$Name
controls.matrix <- data.matrix(controls)
controls.matrix <- t(controls.matrix)
# remove first row of the matrix and recreate the heat map
controls.matrix <- controls.matrix[-1,]
controls.matrix
## POS_C POS_A POS_F POS_D POS_B POS_E NEG_C NEG_D
## GSM2055823_01_4353_PD_mRNA 5597 60232 72 1321 17943 269 1 6
## GSM2055824_02_4355_PD_mRNA 6110 68027 108 1435 20662 286 1 4
## GSM2055825_03_3366_PD_mRNA 5506 62583 92 1406 19037 287 4 4
## GSM2055826_04_4078_PD_mRNA 5590 61885 78 1312 18853 266 2 10
## GSM2055827_05_4846_PD_mRNA 6183 68191 88 1497 20344 293 7 10
## GSM2055828_06_3746_PD_mRNA 5223 59144 78 1343 17895 249 11 3
## GSM2055829_07_3760_PD_mRNA 5746 65328 86 1451 20806 250 4 4
## GSM2055830_08_3790_PD_mRNA 5605 64831 102 1363 19949 243 4 4
## GSM2055831_09_4436_PD_mRNA 5413 64079 98 1444 19799 253 8 4
## GSM2055832_10_4050_PD_mRNA 5780 66646 95 1519 20289 245 4 1
## NEG_E NEG_A NEG_H NEG_G NEG_F NEG_B
## GSM2055823_01_4353_PD_mRNA 20 20 9 3 14 10
## GSM2055824_02_4355_PD_mRNA 19 20 8 6 15 5
## GSM2055825_03_3366_PD_mRNA 48 21 6 4 15 13
## GSM2055826_04_4078_PD_mRNA 26 18 9 4 13 10
## GSM2055827_05_4846_PD_mRNA 25 14 8 5 13 7
## GSM2055828_06_3746_PD_mRNA 79 17 6 2 12 6
## GSM2055829_07_3760_PD_mRNA 18 21 8 2 14 9
## GSM2055830_08_3790_PD_mRNA 20 12 10 4 15 3
## GSM2055831_09_4436_PD_mRNA 29 15 8 5 13 5
## GSM2055832_10_4050_PD_mRNA 23 16 9 8 16 10
# change the colors of the heatmap to be more friendly to color blind people
#use red for high percentages and green for low percentages
rb <- colorRampPalette(c("green", "white", "red"))
### Heatmap showing positive and negative control genes in columns and samples in rows.
#normalize the scale of data in the control counts
# Set heatmap colors to be more friendly to color blind people
#red for high percentages and green for low percentages
heatmaply(
normalize(controls.matrix),
# Using the Min-Max normalization function
xlab = "Positive and Negative Controls",
ylab = "Samples",
main = "Control Genes",
Rowv = FALSE,
Colv = FALSE,
colors = rb,
fontsize_row = 8,
fontsize_col = 8
)
# Task 2
# read in the metadata/annotations
case_study_annotations <- read_csv("case_study_annotations.csv")
#normalize the counts by the housekeeping genes and controls
#specify the housekeeping genes
gene.counts[gene.counts$Name %in%
c('PSMB2', 'ACTB', 'CHMP2A', 'C1orf43'), 'Code.Class'] <-
'Housekeeping'
gene.counts <- NanoStringNorm(
x = gene.counts,
anno = NA,
CodeCount = 'geo.mean',
Background = 'mean',
SampleContent = 'housekeeping.geo.mean',
round.values = TRUE,
take.log = TRUE,
return.matrix.of.endogenous.probes = TRUE
)
##
## ##############################
## ### NanoStringNorm v1.2.1.1 ###
## ##############################
##
## There are 10 samples and 130 Endogenous genes
##
## Background: After correction 10 samples and 128
## Endogenous genes have less than 90% missing.
##
## log: Setting values less than 1 to 1 in order to calculate the log in positive space.
# prepare gene counts and treatment or timepoints conditions
gene.counts <- data.frame(gene.counts)
# define the timepoints
Baseline <-
c(
"GSM2055823_01_4353_PD_mRNA",
"GSM2055825_03_3366_PD_mRNA",
"GSM2055827_05_4846_PD_mRNA",
"GSM2055829_07_3760_PD_mRNA",
"GSM2055831_09_4436_PD_mRNA"
)
Post_Treatment <-
c(
"GSM2055824_02_4355_PD_mRNA",
"GSM2055826_04_4078_PD_mRNA",
"GSM2055828_06_3746_PD_mRNA",
"GSM2055830_08_3790_PD_mRNA",
"GSM2055832_10_4050_PD_mRNA"
)
#filter for genes of interest and plot
MCL1 <- gene.counts["MCL1",] %>%
gather() %>%
mutate(category = ifelse(key %in% Baseline, "Baseline", "Post_treatment"))
MCL1
## key value category
## 1 GSM2055823_01_4353_PD_mRNA 12.11 Baseline
## 2 GSM2055824_02_4355_PD_mRNA 11.28 Post_treatment
## 3 GSM2055825_03_3366_PD_mRNA 12.20 Baseline
## 4 GSM2055826_04_4078_PD_mRNA 11.31 Post_treatment
## 5 GSM2055827_05_4846_PD_mRNA 11.83 Baseline
## 6 GSM2055828_06_3746_PD_mRNA 12.74 Post_treatment
## 7 GSM2055829_07_3760_PD_mRNA 11.93 Baseline
## 8 GSM2055830_08_3790_PD_mRNA 12.12 Post_treatment
## 9 GSM2055831_09_4436_PD_mRNA 12.11 Baseline
## 10 GSM2055832_10_4050_PD_mRNA 10.96 Post_treatment
CXCL1 <- gene.counts["CXCL1",] %>%
gather() %>%
mutate(category = ifelse(key %in% Baseline, "Baseline", "Post_treatment"))
CXCL1
## key value category
## 1 GSM2055823_01_4353_PD_mRNA 7.721 Baseline
## 2 GSM2055824_02_4355_PD_mRNA 6.022 Post_treatment
## 3 GSM2055825_03_3366_PD_mRNA 7.852 Baseline
## 4 GSM2055826_04_4078_PD_mRNA 5.555 Post_treatment
## 5 GSM2055827_05_4846_PD_mRNA 7.150 Baseline
## 6 GSM2055828_06_3746_PD_mRNA 9.255 Post_treatment
## 7 GSM2055829_07_3760_PD_mRNA 8.409 Baseline
## 8 GSM2055830_08_3790_PD_mRNA 5.700 Post_treatment
## 9 GSM2055831_09_4436_PD_mRNA 7.451 Baseline
## 10 GSM2055832_10_4050_PD_mRNA 6.000 Post_treatment
### Box plots of MCL1 and CXCL1 respectively
#jitter has been used to adds a small amount of random variation of each data point
qplot(
x = category,
y = value,
data = MCL1,
geom = c("boxplot", "jitter"),
fill = category,
xlab = "Time Points",
main = "Differerence in Timepoint for MCL1 gene across samples"
) +
labs(fill = "Timepoint & Sample Overlay")

qplot(
x = category,
y = value,
data = CXCL1,
geom = c("boxplot", "jitter"),
fill = category,
xlab = "Time Points",
main = "Differerence in Timepoint for CXCL1 gene across samples"
) +
labs(fill = "Timepoint & Sample Overlay")
