GDILI_Preprocess

Introduction

This document records the preprocessing of the G-DILI data, which consists of the following steps:

Remove control probes and log2-transform the data;
Generate random training and testing datasets with a fixed random seed;
Scale and normalize the data.

# Optional: uncomment to start from an empty workspace
# rm(list=ls())
# Restore the objects written by Data_extraction_Process.Rmd. Later chunks use
# data_train_MCF7, data_train_PC3 and sample_train, which are presumably
# provided by this file.
load(file = "GDILI_traindata_mas5.RData")

Remove Control Probes and Log2 Transform

The platform information for the Affymetrix HG-U133A_2 array (GPL571) can be found on GEO.

# Read the GPL571 platform annotation table. The file is tab-separated and its
# first 16 lines are skipped before the header row. quote = "" turns quoting
# off so quote characters inside annotation text are read literally, and
# fill = TRUE pads rows that have fewer fields than the header.
platform_info <- read.table(
  "Data/GPL571-17391.txt",
  skip = 16,
  header = TRUE,
  sep = "\t",
  fill = TRUE,
  quote = ""
)

The general information on the probes of this platform is shown below (first 10 probes):

# Show the first 10 probes (annotation columns 1, 2, 6, 7, 11 and 12) as a
# striped HTML table inside a 300px-high scroll box. escape = FALSE leaves the
# cell contents unescaped.
platform_info[1:10, c(1, 2, 6, 7, 11, 12)] %>%
  as.data.frame() %>%
  kable(format = "html", align = "l", escape = FALSE) %>%
  kable_styling(bootstrap_options = "striped") %>%
  scroll_box(height = "300px")

ID	GB_ACC	Sequence.Type	Sequence.Source	Gene.Symbol	ENTREZ_GENE_ID
1007_s_at	U48705	Exemplar sequence	Affymetrix Proprietary Database	DDR1 /// MIR4640	780 /// 100616237
1053_at	M87338	Exemplar sequence	GenBank	RFC2	5982
117_at	X51757	Exemplar sequence	Affymetrix Proprietary Database	HSPA6	3310
121_at	X69699	Exemplar sequence	GenBank	PAX8	7849
1255_g_at	L36861	Exemplar sequence	Affymetrix Proprietary Database	GUCA1A	2978
1294_at	L13852	Exemplar sequence	GenBank	MIR5193 /// UBA7	7318 /// 100847079
1316_at	X55005	Exemplar sequence	Affymetrix Proprietary Database	THRA	7067
1320_at	X79510	Exemplar sequence	Affymetrix Proprietary Database	PTPN21	11099
1405_i_at	M21121	Exemplar sequence	GenBank	CCL5	6352
1431_at	J02843	Exemplar sequence	Affymetrix Proprietary Database	CYP2E1	1571

Statistics of Sequence type:

# Count the probes in each Sequence.Type category
table(platform_info[["Sequence.Type"]])

## 
## Consensus sequence   Control sequence  Exemplar sequence 
##               8645                 62              13570

Remove the control-sequence probes from the dataset

# IDs of every probe annotated as a control sequence on the platform
Control_probe <- as.matrix(
  with(platform_info, ID[Sequence.Type == "Control sequence"])
)

# Keep only the columns whose names are not control probe IDs
data_MCF7_clean <-
  data_train_MCF7[, !(colnames(data_train_MCF7) %in% Control_probe)]
data_PC3_clean <-
  data_train_PC3[, !(colnames(data_train_PC3) %in% Control_probe)]

# Log2 Transform
# The last column is dropped before the transform and Label is appended again
# afterwards, so this relies on Label being the last column of each table.
Label <- data_MCF7_clean$Label
data_MCF7_clean_log2 <- mutate(
  log2(data_MCF7_clean[, -ncol(data_MCF7_clean)]),
  Label = Label
)

Label <- data_PC3_clean$Label
data_PC3_clean_log2 <- mutate(
  log2(data_PC3_clean[, -ncol(data_PC3_clean)]),
  Label = Label
)

Generate random training and testing dataset with fixed random seed.

Here we use the log2-transformed dataset; the original (untransformed) expression dataset can be used instead.

# InTraining should be generated in Data_extraction_Process document
# If not, generate it via following script, note that random seed used in previous was 2008
# inTraining should already exist if it was generated in the
# Data_extraction_Process document. Otherwise it is rebuilt here; the random
# seed used previously was 2008.
if (!exists("inTraining")) {
  set.seed(2008)
  # Recode the 1/0 labels as Positive/Negative to make them more intuitive
  Label <- factor(
    sample_train$Label,
    levels = c(1, 0),
    labels = c("Positive", "Negative")
  )
  # One partition with 60% of the samples assigned to training;
  # list = FALSE returns the row indices instead of a list
  inTraining <- createDataPartition(Label, p = 0.6, list = FALSE, times = 1)
}

# Choose which tables to split: "Log2" selects the log2-transformed data, any
# other value (e.g. "Intensity") selects the untransformed data.
used_data <- "Log2" # or Intensity
split_source <- if (used_data == "Log2") {
  list(MCF7 = data_MCF7_clean_log2, PC3 = data_PC3_clean_log2)
} else {
  list(MCF7 = data_MCF7_clean, PC3 = data_PC3_clean)
}

# The same row indices are applied to both tables, so MCF7 and PC3 must be
# row-aligned with the samples the partition was built from.
traindata_MCF7 <- split_source$MCF7[inTraining, ]
validatingdata_MCF7 <- split_source$MCF7[-inTraining, ]

traindata_PC3 <- split_source$PC3[inTraining, ]
validatingdata_PC3 <- split_source$PC3[-inTraining, ]

# Drop the temporary handle so it does not keep the source tables referenced
rm(split_source)

Data Scaling and Normalization

# Center and scale the expression data. The scaling parameters are estimated
# on the training split only and then applied to both the training and the
# validating split, so the validating data never influences the normalization.
# NOTE(review): preProcess receives the whole table, Label column included; if
# Label is stored as a number it would be centered/scaled as well - confirm.

# MCF7
norm_proc_MCF7 <- preProcess(traindata_MCF7, method = c("scale", "center"))
traindata_MCF7_norm <- predict(norm_proc_MCF7, traindata_MCF7)
validatingdata_MCF7_norm <- predict(norm_proc_MCF7, validatingdata_MCF7)

# PC3
# Fit on the PC3 training split. This previously reused traindata_MCF7, which
# normalized the PC3 data with MCF7 means and standard deviations.
norm_proc_PC3 <- preProcess(traindata_PC3, method = c("scale", "center"))
traindata_PC3_norm <- predict(norm_proc_PC3, traindata_PC3)
validatingdata_PC3_norm <- predict(norm_proc_PC3, validatingdata_PC3)

# Release space from unused matrix and reduce confusion about "Training"
# rm(traindata_MCF7, traindata_PC3, validatingdata_MCF7, validatingdata_PC3, 
#      data_MCF7_clean, data_MCF7_clean_log2, data_PC3_clean, data_PC3_clean_log2, 
#       data_train_MCF7, data_train_PC3)

Save processed Data to file

The most important step in the document :)

# Write the normalized training and validating tables for MCF7 and PC3,
# together with sample_train, to a single .RData file.
# NOTE(review): the file name contains "Log2" regardless of the value of
# used_data - confirm this is intended when the intensity data is used.
save(
  traindata_MCF7_norm,
  traindata_PC3_norm,
  validatingdata_MCF7_norm,
  validatingdata_PC3_norm,
  sample_train,
  file = "GDILI_processed_Data_Log2.RData"
)