Introduction

This document records the preprocessing of the G-DILI data, which consists of the following steps:

  1. Remove control probes and log2-transform the expression values;
  2. Generate random training and testing datasets with a fixed random seed;
  3. Data scaling and normalization.
# rm(list=ls())
# Restore the workspace objects written by Data_extraction_Process.Rmd.
# NOTE(review): later chunks use data_train_MCF7, data_train_PC3 and
# sample_train without defining them, so they are presumably stored in this
# file -- confirm.
load(file = "GDILI_traindata_mas5.RData")

Remove Control Probes and Log2 Transform

The platform information for the Affymetrix HG-U133A_2 array (GPL571) can be found on GEO.

# Read the GPL571 platform annotation table (tab-separated text).
# The first 16 lines of the file are skipped and the next line is used as the
# column header. Quoting is disabled (quote = "") and rows with fewer fields
# than the header are padded (fill = TRUE).
# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned.
platform_info <- read.table(
  "Data/GPL571-17391.txt",
  skip = 16,
  header = TRUE,
  sep = "\t",
  fill = TRUE,
  quote = ""
)

The general probe information for this platform is shown below (10 examples):

# Render the first 10 probes as a striped, scrollable HTML table.
# Columns 1, 2, 6, 7, 11 and 12 are ID, GB_ACC, Sequence.Type,
# Sequence.Source, Gene.Symbol and ENTREZ_GENE_ID.
# escape = FALSE is spelled out because the shorthand F can be reassigned.
as.data.frame(platform_info[1:10, c(1, 2, 6, 7, 11, 12)]) %>%
  kable("html", align = "l", escape = FALSE) %>%
  kable_styling("striped") %>%
  scroll_box(height = "300px")
ID GB_ACC Sequence.Type Sequence.Source Gene.Symbol ENTREZ_GENE_ID
1007_s_at U48705 Exemplar sequence Affymetrix Proprietary Database DDR1 /// MIR4640 780 /// 100616237
1053_at M87338 Exemplar sequence GenBank RFC2 5982
117_at X51757 Exemplar sequence Affymetrix Proprietary Database HSPA6 3310
121_at X69699 Exemplar sequence GenBank PAX8 7849
1255_g_at L36861 Exemplar sequence Affymetrix Proprietary Database GUCA1A 2978
1294_at L13852 Exemplar sequence GenBank MIR5193 /// UBA7 7318 /// 100847079
1316_at X55005 Exemplar sequence Affymetrix Proprietary Database THRA 7067
1320_at X79510 Exemplar sequence Affymetrix Proprietary Database PTPN21 11099
1405_i_at M21121 Exemplar sequence GenBank CCL5 6352
1431_at J02843 Exemplar sequence Affymetrix Proprietary Database CYP2E1 1571

Statistics of Sequence type:

# Count how many probes fall into each sequence type.
table(platform_info[["Sequence.Type"]])
## 
## Consensus sequence   Control sequence  Exemplar sequence 
##               8645                 62              13570

Remove control-sequence probes from the datasets

# IDs of every probe annotated as "Control sequence" on the platform,
# stored as a one-column matrix.
is_control_probe <- platform_info$Sequence.Type == "Control sequence"
Control_probe <- as.matrix(platform_info$ID[is_control_probe])

# Keep only the columns whose names do not match a control-probe ID.
# is.na(match(x, y)) is the same test as !(x %in% y).
data_MCF7_clean <- data_train_MCF7[
  ,
  is.na(match(colnames(data_train_MCF7), Control_probe))
]
data_PC3_clean <- data_train_PC3[
  ,
  is.na(match(colnames(data_train_PC3), Control_probe))
]

# Log2 Transform
# Apply log2 to every column except Label, then append Label again as the last
# column. Label is excluded by name rather than by position, so the transform
# stays correct even if Label is not the final column. drop = FALSE keeps the
# result a data frame regardless of how many columns remain.
# NOTE(review): log2 gives -Inf/NaN for zero or negative values; this assumes
# all expression values are strictly positive -- confirm.
Label <- data_MCF7_clean$Label
data_MCF7_clean_log2 <- log2(
  data_MCF7_clean[, colnames(data_MCF7_clean) != "Label", drop = FALSE]
) %>%
  mutate(Label = Label)

Label <- data_PC3_clean$Label
data_PC3_clean_log2 <- log2(
  data_PC3_clean[, colnames(data_PC3_clean) != "Label", drop = FALSE]
) %>%
  mutate(Label = Label)

Generate random training and testing dataset with fixed random seed.

Here we use the log2-transformed dataset; the original (untransformed) expression dataset can be used instead.

# inTraining is expected to come from the Data_extraction_Process document.
# If it is missing from the workspace, it is rebuilt here with the random seed
# that was used previously (2008).
if (!exists("inTraining")) {
  # Recode the labels so they read more intuitively:
  # 1 -> "Positive", 0 -> "Negative".
  Label <- factor(
    sample_train$Label,
    levels = c(1, 0),
    labels = c("Positive", "Negative")
  )
  set.seed(2008)
  inTraining <- createDataPartition(Label, p = 0.6, list = FALSE, times = 1)
}

# Choose which tables feed the split: "Log2" selects the log2-transformed
# tables; any other value (e.g. "Intensity") selects the untransformed ones.
used_data <- "Log2" # or Intensity
use_log2 <- used_data == "Log2"
split_source_MCF7 <- if (use_log2) data_MCF7_clean_log2 else data_MCF7_clean
split_source_PC3 <- if (use_log2) data_PC3_clean_log2 else data_PC3_clean

# Rows indexed by inTraining form the training set and the remaining rows form
# the validating set. The same row indices are applied to both datasets.
traindata_MCF7 <- split_source_MCF7[inTraining, ]
validatingdata_MCF7 <- split_source_MCF7[-inTraining, ]

traindata_PC3 <- split_source_PC3[inTraining, ]
validatingdata_PC3 <- split_source_PC3[-inTraining, ]

Data Scaling and Normalization

# Center and scale each dataset. For each dataset the preprocessing model is
# fitted on its own training split only, then applied to both the training and
# the validating split.
# NOTE(review): the tables passed in still contain the Label column; if Label
# is numeric it may be centered and scaled as well -- confirm its type.

# MCF7
norm_proc_MCF7 <- preProcess(traindata_MCF7, method = c("scale", "center"))
traindata_MCF7_norm <- predict(norm_proc_MCF7, traindata_MCF7)
validatingdata_MCF7_norm <- predict(norm_proc_MCF7, validatingdata_MCF7)

# PC3
# Fitted on the PC3 training split. This was previously fitted on
# traindata_MCF7, which standardized the PC3 data with MCF7 statistics.
norm_proc_PC3 <- preProcess(traindata_PC3, method = c("scale", "center"))
traindata_PC3_norm <- predict(norm_proc_PC3, traindata_PC3)
validatingdata_PC3_norm <- predict(norm_proc_PC3, validatingdata_PC3)

#Release space from unused matrix and reduce confusion about "Training"
# rm(traindata_MCF7, traindata_PC3, validatingdata_MCF7, validatingdata_PC3, 
#      data_MCF7_clean, data_MCF7_clean_log2, data_PC3_clean, data_PC3_clean_log2, 
#       data_train_MCF7, data_train_PC3)

Save processed Data to file

Most important step in the document :)

# Write the normalized training and validating tables for both datasets,
# together with sample_train, to a single RData file.
# NOTE(review): the file name contains "Log2" whatever value used_data has --
# confirm this is intended when the untransformed tables are selected.
save(
  list = c(
    "traindata_MCF7_norm",
    "traindata_PC3_norm",
    "validatingdata_MCF7_norm",
    "validatingdata_PC3_norm",
    "sample_train"
  ),
  file = "GDILI_processed_Data_Log2.RData"
)