This document records the pre-processing of the G-DILI data, including the following contents:
# rm(list=ls())
# Restore the objects saved by Data_extraction_Process.Rmd into the workspace.
# Later steps expect data_train_MCF7, data_train_PC3 and sample_train to exist;
# presumably they come from this file (verify against the extraction document).
load(file = "GDILI_traindata_mas5.RData")
The platform information for the Affymetrix HG-U133A_2 array (GPL571) can be found on GEO.
# Read the GPL571 platform annotation table (tab-separated).
# The first 16 lines of the file are skipped before the header row, quoting is
# disabled, and rows with fewer fields than the header are padded.
# TRUE is spelled out because the shorthand T is an ordinary, reassignable
# variable.
platform_info <- read.table(
  "Data/GPL571-17391.txt",
  skip = 16,
  header = TRUE,
  sep = "\t",
  fill = TRUE,
  quote = ""
)
General information about the probes on this platform is shown below (first 10 examples):
# Render the first 10 probes, restricted to six annotation columns selected by
# position (1, 2, 6, 7, 11, 12), as a striped HTML table inside a 300px-high
# scroll box. FALSE is spelled out because the shorthand F is reassignable.
as.data.frame(platform_info[1:10, c(1, 2, 6, 7, 11, 12)]) %>%
  kable(format = "html", align = "l", escape = FALSE) %>%
  kable_styling(bootstrap_options = "striped") %>%
  scroll_box(height = "300px")
| ID | GB_ACC | Sequence.Type | Sequence.Source | Gene.Symbol | ENTREZ_GENE_ID |
|---|---|---|---|---|---|
| 1007_s_at | U48705 | Exemplar sequence | Affymetrix Proprietary Database | DDR1 /// MIR4640 | 780 /// 100616237 |
| 1053_at | M87338 | Exemplar sequence | GenBank | RFC2 | 5982 |
| 117_at | X51757 | Exemplar sequence | Affymetrix Proprietary Database | HSPA6 | 3310 |
| 121_at | X69699 | Exemplar sequence | GenBank | PAX8 | 7849 |
| 1255_g_at | L36861 | Exemplar sequence | Affymetrix Proprietary Database | GUCA1A | 2978 |
| 1294_at | L13852 | Exemplar sequence | GenBank | MIR5193 /// UBA7 | 7318 /// 100847079 |
| 1316_at | X55005 | Exemplar sequence | Affymetrix Proprietary Database | THRA | 7067 |
| 1320_at | X79510 | Exemplar sequence | Affymetrix Proprietary Database | PTPN21 | 11099 |
| 1405_i_at | M21121 | Exemplar sequence | GenBank | CCL5 | 6352 |
| 1431_at | J02843 | Exemplar sequence | Affymetrix Proprietary Database | CYP2E1 | 1571 |
Statistics of Sequence type:
# Count how many probes fall into each sequence type.
table(platform_info[["Sequence.Type"]])
##
## Consensus sequence Control sequence Exemplar sequence
## 8645 62 13570
Remove the control-sequence probes from the dataset.
# IDs of the probes annotated as "Control sequence", kept as a one-column matrix.
Control_probe <- as.matrix(
  platform_info$ID[platform_info$Sequence.Type == "Control sequence"]
)
# Keep only the columns whose names do not match a control-probe ID
# (a column is kept when match() finds no hit, i.e. returns NA).
data_MCF7_clean <- data_train_MCF7[, is.na(match(colnames(data_train_MCF7), Control_probe))]
data_PC3_clean <- data_train_PC3[, is.na(match(colnames(data_train_PC3), Control_probe))]
# Log2 Transform
# Every column except the last one is log2-transformed, then the Label column
# is attached again untouched.
# NOTE(review): this assumes Label is the last column of each table - confirm.

# MCF7
Label <- data_MCF7_clean$Label
data_MCF7_clean_log2 <- mutate(
  log2(data_MCF7_clean[, -ncol(data_MCF7_clean)]),
  Label = Label
)

# PC3 (Label is overwritten here and ends up holding the PC3 labels)
Label <- data_PC3_clean$Label
data_PC3_clean_log2 <- mutate(
  log2(data_PC3_clean[, -ncol(data_PC3_clean)]),
  Label = Label
)
Here we use the log2-transformed dataset; you can also use the original expression (intensity) dataset.
# inTraining is normally created in the Data_extraction_Process document.
# When it is missing, rebuild it here; the seed used previously was 2008.
if (!exists("inTraining")) {
  set.seed(2008)
  # Build the class factor with readable level names in one call:
  # 1 becomes "Positive" (first level), 0 becomes "Negative".
  Label <- factor(
    sample_train$Label,
    levels = c(1, 0),
    labels = c("Positive", "Negative")
  )
  # Draw a single partition holding 60% of the samples; the row indices are
  # returned as a matrix rather than a list.
  # NOTE(review): these indices are later applied to both the MCF7 and PC3
  # tables, which presumes their rows follow the order of sample_train - confirm.
  inTraining <- createDataPartition(y = Label, times = 1, p = 0.6, list = FALSE)
}
# Choose which version of the data is split into training and validation sets:
# "Log2" selects the log2-transformed tables; any other value (e.g. "Intensity")
# selects the untransformed tables.
used_data <- "Log2" # or Intensity
if (used_data == "Log2") {
  # Rows listed in inTraining form the training set; the rest are validation.
  traindata_MCF7 <- data_MCF7_clean_log2[inTraining, ]
  validatingdata_MCF7 <- data_MCF7_clean_log2[-inTraining, ]
  traindata_PC3 <- data_PC3_clean_log2[inTraining, ]
  validatingdata_PC3 <- data_PC3_clean_log2[-inTraining, ]
} else {
  traindata_MCF7 <- data_MCF7_clean[inTraining, ]
  validatingdata_MCF7 <- data_MCF7_clean[-inTraining, ]
  traindata_PC3 <- data_PC3_clean[inTraining, ]
  validatingdata_PC3 <- data_PC3_clean[-inTraining, ]
}
# Normalization (center and scale)
# For each cell line, the centering/scaling parameters are estimated on the
# training split only and then applied to both the training and validation
# splits of that same cell line.
# NOTE(review): preProcess transforms every numeric column; if Label is stored
# as a numeric column it will be centered/scaled as well - confirm intended.

#MCF7
norm_proc_MCF7 <- preProcess(traindata_MCF7, method = c("scale", "center"))
traindata_MCF7_norm <- predict(norm_proc_MCF7, traindata_MCF7)
validatingdata_MCF7_norm <- predict(norm_proc_MCF7, validatingdata_MCF7)

#PC3
# The PC3 pre-processor must be fitted on the PC3 training split so that PC3
# data is scaled with its own statistics rather than those of MCF7.
norm_proc_PC3 <- preProcess(traindata_PC3, method = c("scale", "center"))
traindata_PC3_norm <- predict(norm_proc_PC3, traindata_PC3)
validatingdata_PC3_norm <- predict(norm_proc_PC3, validatingdata_PC3)
#Release space from unused matrix and reduce confusion about "Training"
# rm(traindata_MCF7, traindata_PC3, validatingdata_MCF7, validatingdata_PC3,
# data_MCF7_clean, data_MCF7_clean_log2, data_PC3_clean, data_PC3_clean_log2,
# data_train_MCF7, data_train_PC3)
Most important step in the document :)
# Persist the normalized training/validation splits for both cell lines,
# together with the sample table, for use by later documents.
save(
  traindata_MCF7_norm,
  traindata_PC3_norm,
  validatingdata_MCF7_norm,
  validatingdata_PC3_norm,
  sample_train,
  file = "GDILI_processed_Data_Log2.RData"
)