#Importing packages required for importing and manuplating data
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(XML)
knitr::opts_chunk$set(echo = TRUE, fig.width = 10, fig.height = 5)
# creating function to read and import rcc data file
read.markup.RCC <- function(rcc.path = ".", rcc.pattern = "*.RCC|*.rcc", exclude = NULL, include = NULL, nprobes = -1) {
  
  # check if path exist
  if (! file.exists(rcc.path) ) {
    stop(paste("READ.MARKUP.RCC: path was not found.  \n")) ;
  }
  
  rcc.files <- list.files(path = rcc.path, pattern = rcc.pattern);
  rcc.files <- rcc.files[!rcc.files %in% exclude | rcc.files %in% include]
  
  # check if any rcc.files
  if (length(rcc.files) == 0) {
    stop(paste("READ.MARKUP.RCC: no RCC files found.  \n")) ;
  }
  
  rcc.header.merged <- NULL;
  rcc.data.merged <- NULL;
  
  count = 1;
  for (rcc.file in rcc.files) {
    
    cat("\nreading RCC file [", count, "/", length(rcc.files), "]: ", rcc.file, sep = "");
    
    # read RCC file and enclose in valid document tags for XML parser
    data <- xmlParse(
      paste(
        "<doc>", 
        paste(readLines(paste(rcc.path, rcc.file, sep = "/"), warn = FALSE), collapse = "\r"), 
        "</doc>", 
        sep = ""
      )
    );
    
    # extract info for various tags
    header.info <- xmlToDataFrame(getNodeSet(data, "/doc//Header"));
    sample.info <- xmlToDataFrame(getNodeSet(data, "/doc//Sample_Attributes"));
    lane.info <- xmlToDataFrame(getNodeSet(data, "/doc//Lane_Attributes"));
    code.info <- xmlToDataFrame(getNodeSet(data, "/doc//Code_Summary"));
    
    # convert data into table for all data structures
    header.con <- textConnection(as.character(header.info$text));
    header.data <- read.csv(header.con, header = FALSE, stringsAsFactors = FALSE);
    close(header.con);
    
    sample.con <- textConnection(as.character(sample.info$text));
    sample.data <- read.csv(sample.con, header = FALSE, stringsAsFactors = FALSE);
    sample.data[which(sample.data[, 1] == "ID"), 1] <- "sample.id";
    close(sample.con);
    
    lane.con <- textConnection(as.character(lane.info$text));
    lane.data <- read.csv(lane.con, header = FALSE, stringsAsFactors = FALSE);
    lane.data[which(lane.data[, 1] == "ID"), 1] <- "lane.id";
    close(lane.con);
    
    code.con <- textConnection(as.character(code.info$text));
    code.data <- read.csv(code.con, header = TRUE, stringsAsFactors = FALSE);
    close(code.con);
    
    # combine metadata
    rcc.header <- do.call(rbind, list(header.data, sample.data, lane.data));
    rcc.data <- code.data;
    
    # assign rownames for lookup
    rownames(rcc.header) <- rcc.header[, 1];
    
    # assign sample names
    sample.name <- gsub(".RCC", "", gsub(" ", "_", rcc.file));
    colnames(rcc.header)[2] <- sample.name;
    colnames(rcc.data)[4] <- sample.name;
    
    if (count == 1) {
      rcc.header.merged <- rcc.header;
      rcc.data.merged <- rcc.data;
    }
    else {
      rcc.header.merged <- data.frame(
        rcc.header.merged, 
        subset(rcc.header[rcc.header.merged[, 1], ], select = 2)
      );
      rcc.data.merged <- data.frame(rcc.data.merged, subset(rcc.data, select = 4));
    }
    
    count=count+1;
  }
  
  # assign rownames
  rownames(rcc.header.merged) <- rcc.header.merged[,1];
  rcc.header.merged <- rcc.header.merged[,-1];
  
  # set column name
  colnames(rcc.data.merged[1]) <- "Code.Count";
  
  # merge data structures
  x <- list(x = rcc.data.merged, header = rcc.header.merged);
  
  class(x) <- 'NanoString';
  
  return(x);
  
}
#importing the RCC files 
rccfile<-read.markup.RCC("/Users/shashidhar/Downloads/case_study_datascientist_data")
## 
## reading RCC file [1/10]: GSM2055823_01_4353_PD_mRNA.RCC
## reading RCC file [2/10]: GSM2055824_02_4355_PD_mRNA.RCC
## reading RCC file [3/10]: GSM2055825_03_3366_PD_mRNA.RCC
## reading RCC file [4/10]: GSM2055826_04_4078_PD_mRNA.RCC
## reading RCC file [5/10]: GSM2055827_05_4846_PD_mRNA.RCC
## reading RCC file [6/10]: GSM2055828_06_3746_PD_mRNA.RCC
## reading RCC file [7/10]: GSM2055829_07_3760_PD_mRNA.RCC
## reading RCC file [8/10]: GSM2055830_08_3790_PD_mRNA.RCC
## reading RCC file [9/10]: GSM2055831_09_4436_PD_mRNA.RCC
## reading RCC file [10/10]: GSM2055832_10_4050_PD_mRNA.RCC
#output of the file
str(rccfile)
## List of 2
##  $ x     :'data.frame':  148 obs. of  13 variables:
##   ..$ CodeClass                 : chr [1:148] "Endogenous" "Endogenous" "Endogenous" "Endogenous" ...
##   ..$ Name                      : chr [1:148] "TNFSF13B" "CCL5" "MCL1" "PPBP" ...
##   ..$ Accession                 : chr [1:148] "NM_001145645.2" "NM_002985.2" "NM_021960.3" "NM_002704.2" ...
##   ..$ GSM2055823_01_4353_PD_mRNA: int [1:148] 437 111 4844 214 45 74 82 156 32 95 ...
##   ..$ GSM2055824_02_4355_PD_mRNA: int [1:148] 119 31 1698 45 9 8 17 27 10 42 ...
##   ..$ GSM2055825_03_3366_PD_mRNA: int [1:148] 405 50 7740 53 21 15 44 126 29 100 ...
##   ..$ GSM2055826_04_4078_PD_mRNA: int [1:148] 251 20 4479 25 17 12 25 163 22 95 ...
##   ..$ GSM2055827_05_4846_PD_mRNA: int [1:148] 246 31 2576 24 28 30 28 51 20 43 ...
##   ..$ GSM2055828_06_3746_PD_mRNA: int [1:148] 797 28 9109 35 21 28 48 194 25 81 ...
##   ..$ GSM2055829_07_3760_PD_mRNA: int [1:148] 98 19 1868 37 20 19 25 35 20 19 ...
##   ..$ GSM2055830_08_3790_PD_mRNA: int [1:148] 257 17 4045 11 18 8 16 153 5 93 ...
##   ..$ GSM2055831_09_4436_PD_mRNA: int [1:148] 145 22 1735 20 4 17 13 28 7 27 ...
##   ..$ GSM2055832_10_4050_PD_mRNA: int [1:148] 90 27 1749 10 7 7 11 60 5 32 ...
##  $ header:'data.frame':  16 obs. of  10 variables:
##   ..$ GSM2055823_01_4353_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##   ..$ GSM2055824_02_4355_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##   ..$ GSM2055825_03_3366_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##   ..$ GSM2055826_04_4078_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##   ..$ GSM2055827_05_4846_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##   ..$ GSM2055828_06_3746_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##   ..$ GSM2055829_07_3760_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##   ..$ GSM2055830_08_3790_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##   ..$ GSM2055831_09_4436_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##   ..$ GSM2055832_10_4050_PD_mRNA: chr [1:16] "1.7" "3.0.1.4" "01" "venu" ...
##  - attr(*, "class")= chr "NanoString"
#creating separate data frame for table data in the file 
rss<-rccfile$x
#subsetting the data and retaining the columns which are required
df3 <- subset(rss, select = -c(Accession))
df4 <- df3 %>% gather("key", "value", -c(Name, CodeClass))%>%filter(CodeClass=="Positive"|CodeClass=='Negative')

#normalizing the data 
df5 <- df4 %>% 
  group_by(Name) %>% 
  mutate(Nor = value/max(value)) %>% ungroup()
#plotting the heatmap for the genes
ggplot(df5,aes(x = Name,y=key)) + 
  geom_tile(aes(fill = Nor)) + scale_fill_gradient2(low = "green", high = "red", mid = "white", midpoint = 0.5) +
  coord_trans(y = "reverse")+labs(x="Positive and Negative Controls",y="Samples")+theme(axis.text=element_text(size=06),
                                                                                        axis.title=element_text(size=06,face="bold"))+ggtitle("Control Genes")

#Task 2.
#The differences between the baseline and post-treatment timepoints for two genes of interest: MCL1 and CXCL1
library(tabulizer)
library(miniUI)
#importing the table for case study annotations from case study pdf
table<-extract_tables("/Users/shashidhar/Downloads/DataScientist_CaseStudy.pdf",pages=2)
#creating data frame for table list
Sampletable<-as.data.frame(table,header=T)
#making first row as table header
colnames(Sampletable)<-as.character(Sampletable[1,])
Sampletable<-Sampletable[-1,]
# creating new table to import gene Data
jointable<-rss%>%gather('key','values',-c(Name,CodeClass,Accession))%>%dplyr::select(Name,key,values)


# creating MCL1 table with Normalization of count value
MCL1table<-Sampletable%>%inner_join(jointable,by=c("RCC File"= "key"))%>%filter(Name=="MCL1")%>%mutate(values=log(values))
MCL1table
##                      RCC File Subject      Timepoint Name   values
## 1  GSM2055823_01_4353_PD_mRNA       1       Baseline MCL1 8.485496
## 2  GSM2055824_02_4355_PD_mRNA       1 Post-Treatment MCL1 7.437206
## 3  GSM2055825_03_3366_PD_mRNA       2       Baseline MCL1 8.954157
## 4  GSM2055826_04_4078_PD_mRNA       2 Post-Treatment MCL1 8.407155
## 5  GSM2055827_05_4846_PD_mRNA       3       Baseline MCL1 7.853993
## 6  GSM2055828_06_3746_PD_mRNA       3 Post-Treatment MCL1 9.117018
## 7  GSM2055829_07_3760_PD_mRNA       4       Baseline MCL1 7.532624
## 8  GSM2055830_08_3790_PD_mRNA       4 Post-Treatment MCL1 8.305237
## 9  GSM2055831_09_4436_PD_mRNA       5       Baseline MCL1 7.458763
## 10 GSM2055832_10_4050_PD_mRNA       5 Post-Treatment MCL1 7.466799
# creating CXCL1 table with normalization of count value
CXCL1table<-Sampletable%>%inner_join(jointable,by=c("RCC File"= "key"))%>%filter(Name=='CXCL1')%>%mutate(values=log(values))
CXCL1table
##                      RCC File Subject      Timepoint  Name   values
## 1  GSM2055823_01_4353_PD_mRNA       1       Baseline CXCL1 5.488938
## 2  GSM2055824_02_4355_PD_mRNA       1 Post-Treatment CXCL1 3.988984
## 3  GSM2055825_03_3366_PD_mRNA       2       Baseline CXCL1 5.978886
## 4  GSM2055826_04_4078_PD_mRNA       2 Post-Treatment CXCL1 4.543295
## 5  GSM2055827_05_4846_PD_mRNA       3       Baseline CXCL1 4.709530
## 6  GSM2055828_06_3746_PD_mRNA       3 Post-Treatment CXCL1 6.719013
## 7  GSM2055829_07_3760_PD_mRNA       4       Baseline CXCL1 5.147494
## 8  GSM2055830_08_3790_PD_mRNA       4 Post-Treatment CXCL1 4.025352
## 9  GSM2055831_09_4436_PD_mRNA       5       Baseline CXCL1 4.369448
## 10 GSM2055832_10_4050_PD_mRNA       5 Post-Treatment CXCL1 4.204693
#Box plots of MCL1 and CXCL1 respectively Boxplots of summary statistics (minimum, 25th percentile, mean, median, 75th percentile, and maximum) for each timepoint by gene with overlay of individual data points for each sample


#box plot for MCL1 Gene
ggplot(MCL1table,aes(Timepoint,values,fill=Timepoint))+geom_boxplot() +
  geom_jitter(color="blue", size=0.9, alpha=0.9)+ggtitle("Timepoint difference for MCL1 Gene")

#box plot for CXCL1 gene  
ggplot(CXCL1table,aes(Timepoint,values,fill=Timepoint))+geom_boxplot()+geom_jitter(color="blue", size=0.9, alpha=0.9)+ggtitle("Timepoint difference for CXCL1 Gene")