Disclaimer: This document is intended only for the CAMDA G-DILI project. Copy the code into your R console and run it.
- 1.1 Data download
- 1.2 Data Unzip
- 1.3 CAMDA sample information
- 1.4 Cleaning table
- 1.5 Data Summary
- 2.1 Data import and cleaning (MCF7)
- 3.1 Data import and cleaning (PC3)
See the Supplementary Information section for R package requirements.
Raw .CEL files are downloaded from the cmap website. We use the cmap build 02 data and store the archives in the Data folder.
if (!file.exists("Data/cmap_build02.volume1of7.zip")){
dir.create('Data')
data_file = "ftp://ftp.broad.mit.edu/pub/cmap/cmap_build02.volume1of7.zip"
download.file(data_file,destfile = "Data/cmap_build02.volume1of7.zip")
data_file = "ftp://ftp.broad.mit.edu/pub/cmap/cmap_build02.volume2of7.zip"
download.file(data_file,destfile = "Data/cmap_build02.volume2of7.zip")
data_file = "ftp://ftp.broad.mit.edu/pub/cmap/cmap_build02.volume3of7.zip"
download.file(data_file,destfile = "Data/cmap_build02.volume3of7.zip")
data_file = "ftp://ftp.broad.mit.edu/pub/cmap/cmap_build02.volume4of7.zip"
download.file(data_file,destfile = "Data/cmap_build02.volume4of7.zip")
data_file = "ftp://ftp.broad.mit.edu/pub/cmap/cmap_build02.volume5of7.zip"
download.file(data_file,destfile = "Data/cmap_build02.volume5of7.zip")
data_file = "ftp://ftp.broad.mit.edu/pub/cmap/cmap_build02.volume6of7.zip"
download.file(data_file,destfile = "Data/cmap_build02.volume6of7.zip")
data_file = "ftp://ftp.broad.mit.edu/pub/cmap/cmap_build02.volume7of7.zip"
download.file(data_file,destfile = "Data/cmap_build02.volume7of7.zip")
}
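A quick sanity check (a minimal sketch, assuming the seven-volume naming above) that all archives arrived before unzipping:
zips <- sprintf("Data/cmap_build02.volume%dof7.zip", 1:7)
stopifnot(all(file.exists(zips))) # stop early if any volume is missing
file.size(zips) # a truncated download usually shows up as an unusually small file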
Unzip the zip files from Data into the data_raw folder.
if (!file.exists("data_raw")){
unzip("Data/cmap_build02.volume1of7.zip",exdir = "data_raw")
unzip("Data/cmap_build02.volume2of7.zip",exdir = "data_raw")
unzip("Data/cmap_build02.volume3of7.zip",exdir = "data_raw")
unzip("Data/cmap_build02.volume4of7.zip",exdir = "data_raw")
unzip("Data/cmap_build02.volume5of7.zip",exdir = "data_raw")
unzip("Data/cmap_build02.volume6of7.zip",exdir = "data_raw")
unzip("Data/cmap_build02.volume7of7.zip",exdir = "data_raw")
}
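Optionally, confirm the extraction by counting the bzip2-compressed .CEL files now in data_raw (a sanity check only; the expected count is not stated here):
length(list.files("data_raw", pattern = "\\.cel\\.bz2$", ignore.case = TRUE))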
Read the sample information and the associated .CEL file names from the CAMDA website spreadsheet.
sample_info <- read.xlsx("Copy of CAMDA_Challange_dataset_filenames.xlsx", sheetIndex = 1, header=T,startRow = 2)
sample_info <- sample_info[,c(1,2,3,4,6)]
colnames(sample_info)=c("ID","Category","Label","MCF7","PC3")
#head(sample_info)
Two cell lines are included in this study, MCF7 and PC3; they are analyzed separately.
To keep the data partition consistent between the two analyses, we use the same random seed:
sample_info$Category <- gsub("Training ", "Training", sample_info$Category) # trim a stray trailing space
sample_info$MCF7 <- gsub("^'", "", sample_info$MCF7) # strip the leading quote from file names
sample_info$PC3 <- gsub("^'", "", sample_info$PC3)
set.seed(2018) # Fix the random seed for reproducibility.
sample_train <- sample_info[sample_info$Category=="Training",]
Label <- factor(sample_train$Label, levels = c(1,0))
levels(Label) <- c("Positive","Negative") # rename the labels to be more intuitive.
inTraining <- createDataPartition(Label, p=0.6, list=FALSE, times=1) # 60/40 internal split (caret)
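inTraining indexes a 60% internal training split of the CAMDA training samples. A minimal sketch of how it could be applied (internal_train and internal_test are hypothetical names; the downstream modeling is not part of this section):
internal_train <- sample_train[inTraining, ] # hypothetical: 60% used for model fitting
internal_test <- sample_train[-inTraining, ] # hypothetical: 40% held out for internal evaluation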
## Data normalization method:
norm_method = "MAS5" # could also be "RMA"; "MAS5" is relatively slower.
After data cleaning, the final table is formatted as follows:
as.data.frame(sample_info) %>% mutate(
Category = cell_spec(Category, color = "white", background = ifelse(Category =="Training", "Blue", "darkgray")),
Label = cell_spec(Label, color = ifelse(is.na(Label), "darkgray", ifelse(Label==1, "Red","Green")))
) %>% kable("html", align= "c", escape = F) %>%
kable_styling("striped") %>%
scroll_box(height = "300px")
| ID | Category | Label | MCF7 | PC3 |
|---|---|---|---|---|
| 1 | Training | 1 | 5500024030403071907255.C05 | 5500024031723100807775.C05 |
| 2 | Training | 1 | 5500024030403071907253.A09 | 5500024031723100807771.A09 |
| 3 | Training | 0 | 5500024030402071707279.B01 | 5500024031723100807776.B01 |
| 4 | Training | 1 | 5500024030403071907257.G04 | 5500024030700072107992.G04 |
| 5 | Training | 0 | 5500024037496121008324.E08 | 5500024037498121108438.E02 |
| 6 | Training | 0 | 5500024032848101507997.F03 | 5500024035736031208613.F03 |
| 7 | Training | 1 | 5500024030402071707277.F10 | 5500024031723100807771.F10 |
| 8 | Training | 1 | 5500024024213121906562.B07 | 5500024024213121906564.B01 |
| 9 | Training | 1 | 5500024032848101507997.H04 | 5500024035736031208613.H04 |
| 10 | Training | 1 | 5500024032848101507997.D01 | 5500024035736031208613.D01 |
| 11 | Training | 0 | 5500024030402071707279.H02 | 5500024031723100807776.H02 |
| 12 | Training | 0 | 5500024030403071907252.E04 | 5500024034290101707044.E10 |
| 13 | Training | 1 | 5500024024211121606513.D09 | 5500024031723100807771.D09 |
| 14 | Training | 1 | 5500024024211121606513.C12 | 5500024035736031208613.C06 |
| 15 | Training | 1 | 5500024030403071907255.C09 | 5500024031723100807775.C09 |
| 16 | Training | 1 | 5500024024213121906562.F07 | 5500024024213121906564.F01 |
| 17 | Training | 0 | 5500024030402071707277.A10 | 5500024031723100807771.A10 |
| 18 | Training | 0 | 5500024030402071707277.F11 | 5500024031723100807771.F05 |
| 19 | Training | 1 | 5500024024211121606513.H10 | 5500024031723100807771.H04 |
| 20 | Training | 0 | 5500024030700072107988.G09 | 5500024035736031208613.G09 |
| 21 | Training | 1 | 5500024030700072107994.E04 | 5500024034290101707046.E10 |
| 22 | Training | 1 | 5500024024213121906562.F11 | 5500024024213121906564.F05 |
| 23 | Training | 0 | 5500024035101021908499.D02 | 5500024037498121108440.D08 |
| 24 | Training | 1 | 5500024030403071907255.D01 | 5500024031723100807775.D01 |
| 25 | Training | 1 | 5500024028849050407169.D03 | 5500024030700072107986.D03 |
| 26 | Training | 1 | 5500024035736031208618.D09 | 5500024034290101707046.D09 |
| 27 | Training | 1 | EC2004030515AA | EC2004070116AA |
| 28 | Training | 1 | 5500024030760072207031.H08 | 5500024030700072107991.H08 |
| 29 | Training | 0 | 5500024034290101707045.H06 | 5500024024213121906564.H06 |
| 30 | Training | 1 | 5500024030700072107988.G12 | 5500024037498121108438.G06 |
| 31 | Training | 1 | 5500024034290101707045.C06 | 5500024031330081907229.C06 |
| 32 | Training | 1 | 5500024030403071907257.H09 | 5500024030700072107992.H09 |
| 33 | Training | 1 | 5500024030760072207031.B09 | 5500024030700072107991.B09 |
| 34 | Training | 1 | 5500024024214122006606.F01 | 5500024024213121906561.F01 |
| 35 | Training | 1 | 5500024030700072107994.E06 | 5500024031723100807773.E12 |
| 36 | Training | 1 | 5500024030700072107994.H05 | 5500024034290101707046.H05 |
| 37 | Training | 1 | 5500024024213121906562.C01 | 5500024017802120306174.C07 |
| 38 | Training | 1 | 5500024031330081907228.C08 | 5500024030760072207026.C08 |
| 39 | Training | 1 | 5500024030402071707279.E07 | 5500024031723100807776.E07 |
| 40 | Training | 1 | 5500024035101021908499.F08 | 5500024031723100807773.F08 |
| 41 | Training | 1 | 5500024024211121606513.B08 | 610611110806.B08 |
| 42 | Training | 0 | 5500024053146032309538.C05 | 5500024053146032309538.C11 |
| 43 | Training | 0 | 5500024024213121906562.F08 | 5500024024213121906564.F02 |
| 44 | Training | 0 | 5500024028416032807794.E10 | 5500024035736031208613.E04 |
| 45 | Training | 1 | EC2004012211AA | EC2004060209AA |
| 46 | Training | 1 | 5500024030403071907255.D12 | 5500024031723100807775.D12 |
| 47 | Training | 1 | 5500024024211121606513.C02 | 610611110806.C02 |
| 48 | Training | 0 | 5500024030402071707277.H08 | 5500024031723100807776.H08 |
| 49 | Training | 0 | 5500024024214122006606.B04 | 5500024024213121906561.B04 |
| 50 | Training | 1 | 5500024024211121606513.C08 | 610611110806.C08 |
| 51 | Training | 0 | 5500024032848101507997.C10 | 5500024035736031208613.C10 |
| 52 | Training | 0 | 5500024024213121906562.A09 | 5500024024213121906564.A03 |
| 53 | Training | 1 | 5500024037289120508304.A07 | 5500024037498121108438.A01 |
| 54 | Training | 0 | 5500024032848101507997.H05 | 5500024035736031208613.H05 |
| 55 | Training | 1 | 5500024051859013109328.F04 | 5500024035736031208613.F10 |
| 56 | Training | 1 | 5500024030402071707277.G11 | 5500024031723100807771.G05 |
| 57 | Training | 1 | 5500024037498121108437.B04 | 5500024034290101707044.B04 |
| 58 | Training | 0 | 5500024024211121606513.F01 | 5500024030760072207026.F07 |
| 59 | Training | 0 | 5500024034290101707050.C02 | 5500024031723100807771.C02 |
| 60 | Training | 0 | 5500024030402071707279.D08 | 5500024031723100807776.D08 |
| 61 | Training | 1 | 5500024030402071707279.G07 | 610611110806.G07 |
| 62 | Training | 1 | 5500024028416032807794.B12 | 5500024031723100807776.B06 |
| 63 | Training | 1 | 5500024031723100807770.B08 | 5500024031723100807769.B08 |
| 64 | Training | 0 | 5500024031723100807770.H10 | 5500024031723100807769.H10 |
| 65 | Training | 1 | 5500024037289120508304.H11 | 5500024037498121108438.H05 |
| 66 | Training | 1 | 5500024030403071907255.A10 | 5500024031723100807775.A10 |
| 67 | Training | 1 | 5500024030403071907251.G12 | 5500024030700072107986.G12 |
| 68 | Training | 1 | 5500024030700072107994.F05 | 5500024034290101707046.F05 |
| 69 | Training | 0 | 5500024032848101507997.H10 | 5500024035736031208613.H10 |
| 70 | Training | 1 | 5500024028849050407169.F06 | 5500024030700072107991.F06 |
| 71 | Training | 0 | 5500024024213121906558.A03 | 5500024030700072107991.A03 |
| 72 | Training | 1 | 5500024030402071707279.B08 | 5500024031723100807776.B08 |
| 73 | Training | 1 | 5500024030760072207031.E07 | 5500024030700072107991.E07 |
| 74 | Training | 0 | 5500024030403071907252.A05 | 5500024034290101707044.A05 |
| 75 | Training | 1 | 5500024028849050407169.C09 | 5500024034290101707044.C09 |
| 76 | Training | 1 | 5500024024214122006606.C07 | 5500024024213121906561.C07 |
| 77 | Training | 1 | 5500024037289120508304.E07 | 5500024037498121108438.E01 |
| 78 | Training | 1 | 5500024030700072107994.H02 | 5500024034290101707044.H08 |
| 79 | Training | 1 | 5500024030403071907255.H11 | 5500024031723100807775.H11 |
| 80 | Training | 1 | 5500024031723100807772.H03 | 5500024037498121108442.H09 |
| 81 | Training | 1 | 5500024024211121606513.B09 | 5500024031723100807771.B09 |
| 82 | Training | 1 | 5500024030403071907255.D05 | 5500024031723100807775.D05 |
| 83 | Training | 0 | 5500024030402071707279.H03 | 5500024031723100807776.H03 |
| 84 | Training | 1 | 5500024035735011708571.A11 | 5500024034290101707046.A11 |
| 85 | Training | 1 | EC2004041404AA | EC2004070117AA |
| 86 | Training | 1 | 5500024024213121906562.E07 | 5500024024213121906564.E01 |
| 87 | Training | 1 | 5500024024214122006606.A08 | 5500024024213121906561.A08 |
| 88 | Training | 1 | 5500024053146032309538.F02 | 5500024053146032309538.F08 |
| 89 | Training | 1 | 5500024037498121108437.A01 | 5500024034290101707044.A01 |
| 90 | Training | 1 | 5500024051859013109328.B06 | 5500024037498121108438.B12 |
| 91 | Training | 0 | 5500024024211121606513.D02 | 5500024030760072207026.D02 |
| 92 | Training | 0 | 5500024030700072107994.D09 | 5500024031723100807771.D03 |
| 93 | Training | 1 | 5500024030403071907251.D11 | 5500024030700072107986.D11 |
| 94 | Training | 1 | 5500024030700072107994.H06 | 5500024031723100807773.H12 |
| 95 | Training | 1 | 5500024035735011708567.C02 | 5500024037498121108438.C08 |
| 96 | Training | 1 | 5500024031330081907228.F12 | 5500024030760072207026.F12 |
| 97 | Training | 0 | 5500024031723100807770.B12 | 5500024031723100807769.B12 |
| 98 | Training | 1 | 5500024038291011509600.A06 | 5500024038291011509600.A12 |
| 99 | Training | 0 | 5500024024211121606513.C09 | 5500024031723100807771.C09 |
| 100 | Training | 1 | 5500024024213121906558.G05 | 5500024030700072107986.G05 |
| 101 | Training | 0 | 5500024037289120508304.D09 | 5500024037498121108438.D03 |
| 102 | Training | 1 | 5500024024214122006606.H03 | 5500024024213121906561.H03 |
| 103 | Training | 1 | 5500024030403071907257.B03 | 5500024030700072107992.B03 |
| 104 | Training | 1 | EC2004012212AA | EC2005030717AA |
| 105 | Training | 1 | 5500024028849050407172.E12 | 5500024034290101707044.E06 |
| 106 | Training | 1 | 5500024028849050407169.C11 | 5500024034290101707046.C11 |
| 107 | Training | 1 | 5500024030403071907255.E12 | 5500024031723100807775.E12 |
| 108 | Training | 1 | 5500024030403071907253.B08 | 5500024031723100807771.B08 |
| 109 | Training | 1 | 5500024030403071907251.D09 | 5500024030700072107986.D09 |
| 110 | Training | 1 | 5500024035736031208618.A11 | 5500024031723100807773.A11 |
| 111 | Training | 1 | 5500024030403071907257.A04 | 5500024030700072107992.A04 |
| 112 | Training | 0 | 5500024034290101707050.H06 | 5500024031723100807771.H06 |
| 113 | Training | 1 | 5500024030760072207031.B04 | 5500024030700072107991.B04 |
| 114 | Training | 0 | 5500024024214122006606.A05 | 5500024024213121906561.A05 |
| 115 | Training | 1 | 5500024034290101707045.B03 | 5500024031330081907229.B03 |
| 116 | Training | 0 | 5500024031330081907228.B02 | 5500024030760072207026.B02 |
| 117 | Training | 1 | 5500024037289120508304.F10 | 5500024037498121108438.F04 |
| 118 | Training | 0 | 5500024024214122006606.E01 | 5500024034290101707044.E01 |
| 119 | Training | 0 | 5500024024211121606513.H09 | 5500024031723100807771.H09 |
| 120 | Training | 1 | 5500024032848101507997.A07 | 5500024035736031208613.A07 |
| 121 | Training | 0 | 5500024024214122006606.E06 | 5500024024213121906561.E06 |
| 122 | Training | 1 | 5500024030760072207031.H10 | 5500024030700072107991.H10 |
| 123 | Training | 1 | 5500024024213121906562.D09 | 5500024024213121906564.D03 |
| 124 | Training | 1 | 5500024024214122006606.D11 | 5500024024213121906561.D11 |
| 125 | Training | 1 | 5500024037498121108437.F12 | 5500024034290101707044.F12 |
| 126 | Training | 0 | 5500024031330081907228.A04 | 5500024030760072207026.A04 |
| 127 | Training | 0 | 5500024031330081907228.D09 | 5500024030760072207026.D09 |
| 128 | Training | 1 | 5500024030760072207031.G10 | 5500024030700072107991.G10 |
| 129 | Training | 0 | 5500024035101021908499.G01 | 5500024037498121108440.G07 |
| 130 | Training | 1 | 5500024030402071707279.A02 | 5500024031723100807776.A02 |
| 131 | Training | 1 | 5500024035735011708571.H12 | 5500024034290101707046.H12 |
| 132 | Training | 0 | 5500024031723100807770.B10 | 5500024031723100807769.B10 |
| 133 | Training | 1 | 5500024031723100807770.F08 | 5500024031723100807769.F08 |
| 134 | Training | 1 | 5500024034290101707050.B04 | 5500024031723100807771.B04 |
| 135 | Training | 1 | 5500024031723100807770.F06 | 5500024031723100807769.F06 |
| 136 | Training | 0 | 5500024030403071907253.G11 | 5500024031723100807771.G11 |
| 137 | Training | 0 | 5500024024213121906558.B02 | 5500024030700072107991.B02 |
| 138 | Training | 0 | 5500024030403071907252.C01 | 5500024034290101707046.C01 |
| 139 | Training | 0 | 5500024035101021908499.E01 | 5500024037498121108440.E07 |
| 140 | Training | 0 | 5500024024214122006606.C04 | 5500024031723100807773.C10 |
| 141 | Training | 1 | 5500024035100021608459.C04 | 5500024035100021608459.C10 |
| 142 | Training | 1 | 5500024024214122006606.H09 | 5500024024213121906561.H09 |
| 143 | Training | 1 | 5500024034290101707045.D08 | 5500024031330081907229.D08 |
| 144 | Training | 1 | 5500024031723100807770.F12 | 610611110806.F06 |
| 145 | Training | 0 | 5500024030402071707279.A05 | 5500024031723100807776.A05 |
| 146 | Training | 1 | 5500024034290101707050.H02 | 5500024031723100807771.H02 |
| 147 | Training | 1 | 5500024024213121906562.F03 | 5500024031723100807773.F03 |
| 148 | Training | 1 | 5500024030403071907255.H12 | 5500024031723100807775.H12 |
| 149 | Training | 1 | 5500024032848101507997.F09 | 5500024035736031208613.F09 |
| 150 | Training | 1 | 5500024030402071707279.D05 | 5500024031723100807776.D05 |
| 151 | Training | 1 | 5500024030403071907257.F11 | 5500024030700072107992.F11 |
| 152 | Training | 0 | 5500024032848101507997.F11 | 5500024035736031208613.F11 |
| 153 | Training | 0 | 5500024024214122006606.G12 | 5500024024213121906561.G12 |
| 154 | Training | 1 | 5500024031330081907228.F02 | 5500024030760072207026.F02 |
| 155 | Training | 1 | 5500024035735011708571.C10 | 5500024034290101707046.C10 |
| 156 | Training | 1 | 5500024028849050407172.F11 | 5500024034290101707044.F11 |
| 157 | Training | 0 | 5500024030403071907256.E06 | 5500024030760072207026.E06 |
| 158 | Training | 1 | 5500024031723100807772.H05 | 5500024030700072107986.H05 |
| 159 | Training | 1 | 5500024024213121906562.C05 | 5500024017802120306174.C11 |
| 160 | Training | 0 | 5500024024213121906558.F05 | 5500024030700072107986.F05 |
| 161 | Training | 1 | 5500024035735011708571.G07 | 5500024034290101707046.G07 |
| 162 | Training | 0 | 5500024031330081907228.B06 | 5500024030760072207026.B06 |
| 163 | Training | 0 | 5500024030403071907257.H11 | 5500024030700072107992.H11 |
| 164 | Training | 1 | 5500024030403071907253.E04 | 5500024031723100807773.H06 |
| 165 | Training | 1 | 5500024024214122006606.E02 | 5500024024213121906561.E02 |
| 166 | Training | 0 | 5500024030700072107994.B03 | 5500024034290101707044.B03 |
| 167 | Training | 1 | 5500024035736031208618.B09 | 5500024034290101707046.B09 |
| 168 | Training | 1 | 5500024024213121906562.D11 | 5500024024213121906564.D05 |
| 169 | Training | 1 | 5500024024214122006606.H05 | 5500024024213121906561.H05 |
| 170 | Training | 0 | 5500024035736031208618.C08 | 5500024034290101707044.C02 |
| 171 | Training | 1 | 5500024035735011708571.A05 | 5500024034290101707046.A05 |
| 172 | Training | 1 | 5500024030403071907253.B07 | 5500024031723100807771.B07 |
| 173 | Training | 1 | 5500024035735011708571.B03 | 5500024024213121906561.B03 |
| 174 | Training | 1 | 5500024024214122006606.G10 | 5500024024213121906561.G10 |
| 175 | Training | 0 | 5500024030403071907256.H05 | 5500024030760072207026.H11 |
| 176 | Training | 1 | 5500024024213121906562.C02 | 5500024017802120306174.C08 |
| 177 | Training | 1 | 5500024030403071907257.F09 | 5500024030700072107992.F09 |
| 178 | Training | 1 | 5500024031723100807770.C02 | 5500024031723100807769.C02 |
| 179 | Training | 1 | 5500024030700072107988.E06 | 5500024030700072107992.E06 |
| 180 | Training | 1 | 5500024030403071907257.C07 | 5500024030700072107992.C07 |
| 181 | Training | 0 | 5500024024214122006606.A03 | 5500024024213121906561.A03 |
| 182 | Training | 1 | 5500024024214122006606.F10 | 5500024024213121906561.F10 |
| 183 | Training | 1 | 5500024024213121906562.C09 | 5500024024213121906564.C03 |
| 184 | Training | 0 | 5500024030403071907257.B01 | 5500024030700072107992.B01 |
| 185 | Training | 0 | 5500024051859013109328.G01 | 5500024037498121108438.G01 |
| 186 | Training | 1 | 5500024035736031208617.E03 | 5500024035736031208617.B08 |
| 187 | Training | 1 | 5500024031723100807770.A08 | 5500024031723100807769.A08 |
| 188 | Training | 1 | 5500024032848101507997.H06 | 610611110806.H12 |
| 189 | Training | 0 | 5500024030403071907253.E02 | 5500024017802120306174.E08 |
| 190 | Training | 1 | 5500024030402071707279.G12 | 5500024031723100807776.G12 |
| 1 | Validation | NA | 5500024024214122006606.C11 | 5500024024213121906561.C11 |
| 2 | Validation | NA | 5500024024213121906562.E08 | 5500024024213121906564.E02 |
| 3 | Validation | NA | 5500024030700072107994.H08 | 5500024031723100807771.H08 |
| 4 | Validation | NA | 5500024032848101507997.D06 | 5500024035736031208613.D06 |
| 5 | Validation | NA | 5500024028849050407169.F03 | 5500024030700072107991.F03 |
| 6 | Validation | NA | 5500024028849050407172.B04 | 5500024031330081907229.B10 |
| 7 | Validation | NA | 5500024030402071707277.C05 | 5500024030700072107992.C05 |
| 8 | Validation | NA | 5500024031723100807770.D12 | 5500024031723100807769.D12 |
| 9 | Validation | NA | 5500024030403071907255.G06 | 5500024031723100807775.G06 |
| 10 | Validation | NA | 5500024030402071707277.E04 | 5500024030700072107992.E10 |
| 11 | Validation | NA | 5500024028849050407172.A08 | 5500024034290101707046.A02 |
| 12 | Validation | NA | 5500024032848101507997.A05 | 5500024035736031208613.A05 |
| 13 | Validation | NA | 5500024024214122006606.C01 | 5500024024213121906561.C01 |
| 14 | Validation | NA | 5500024034290101707045.F02 | 5500024031330081907229.F02 |
| 15 | Validation | NA | 5500024030700072107988.G03 | 5500024030760072207026.G09 |
| 16 | Validation | NA | 5500024030760072207031.D11 | 5500024030700072107991.D11 |
| 17 | Validation | NA | 5500024030700072107994.B09 | 5500024031723100807771.B03 |
| 18 | Validation | NA | 5500024032848101507997.G05 | 5500024035736031208613.G05 |
| 19 | Validation | NA | 5500024024214122006606.F09 | 5500024024213121906561.F09 |
| 20 | Validation | NA | 5500024030402071707279.E02 | 5500024031723100807776.E02 |
| 21 | Validation | NA | 5500024024214122006606.A09 | 5500024024213121906561.A09 |
| 22 | Validation | NA | 5500024030402071707277.F01 | 5500024031723100807769.F07 |
| 23 | Validation | NA | 5500024024213121906558.A04 | 5500024030700072107991.A04 |
| 24 | Validation | NA | 5500024024213121906562.B12 | 5500024024213121906564.B06 |
| 25 | Validation | NA | 5500024028849050407169.F05 | 5500024030700072107991.F05 |
| 26 | Validation | NA | 5500024024214122006606.G03 | 5500024034290101707046.G03 |
| 27 | Validation | NA | 5500024051859013109328.C01 | 5500024037498121108438.C01 |
| 28 | Validation | NA | 5500024024211121606513.E04 | 5500024030700072107992.E04 |
| 29 | Validation | NA | 5500024030403071907255.D11 | 5500024031723100807775.D11 |
| 30 | Validation | NA | 5500024030402071707279.F03 | 5500024031723100807776.F03 |
| 31 | Validation | NA | 5500024030403071907251.A09 | 5500024030700072107986.A09 |
| 32 | Validation | NA | 5500024030402071707279.G05 | 5500024031723100807776.G05 |
| 33 | Validation | NA | 5500024030403071907255.C02 | 5500024031723100807775.C02 |
| 34 | Validation | NA | 5500024031723100807770.C03 | 5500024031723100807769.C03 |
| 35 | Validation | NA | 5500024030700072107994.G05 | 5500024034290101707046.G05 |
| 36 | Validation | NA | 5500024030403071907251.E11 | 5500024030700072107986.E11 |
| 37 | Validation | NA | 5500024030403071907255.E11 | 5500024031723100807775.E11 |
| 38 | Validation | NA | 5500024030402071707279.H10 | 5500024031723100807776.H10 |
| 39 | Validation | NA | 5500024032848101507997.D11 | 5500024035736031208613.D11 |
| 40 | Validation | NA | 5500024024214122006606.H06 | 5500024024213121906561.H06 |
| 41 | Validation | NA | 5500024028416032807794.D08 | 5500024031723100807771.D02 |
| 42 | Validation | NA | 5500024037496121008324.G07 | 5500024037498121108438.G07 |
| 43 | Validation | NA | 5500024037498121108437.A07 | 5500024034290101707044.A07 |
| 44 | Validation | NA | 5500024031723100807772.H11 | 5500024030402071707280.H05 |
| 45 | Validation | NA | 5500024030403071907255.H03 | 5500024031723100807775.H03 |
| 46 | Validation | NA | 5500024028849050407172.C05 | 5500024031330081907229.C05 |
| 47 | Validation | NA | 5500024030403071907255.B01 | 5500024031723100807775.B01 |
| 48 | Validation | NA | 5500024024213121906562.C11 | 5500024024213121906564.C05 |
| 49 | Validation | NA | 5500024028849050407172.A11 | 5500024034290101707044.A11 |
| 50 | Validation | NA | 5500024024213121906562.D12 | 5500024024213121906564.D06 |
| 51 | Validation | NA | 5500024032848101507997.B06 | 5500024035736031208613.B06 |
| 52 | Validation | NA | 5500024035101021908499.D01 | 5500024037498121108440.D07 |
| 53 | Validation | NA | 5500024030700072107994.D03 | 5500024034290101707044.D03 |
| 54 | Validation | NA | 5500024030402071707279.C12 | 5500024031723100807776.C12 |
| 55 | Validation | NA | 5500024030403071907257.H02 | 5500024030700072107992.H02 |
| 56 | Validation | NA | 5500024037496121008324.A03 | 5500024031723100807776.A09 |
| 57 | Validation | NA | 5500024031723100807772.F02 | 5500024037498121108442.F08 |
| 58 | Validation | NA | 5500024032848101507997.D03 | 5500024035736031208613.D03 |
| 59 | Validation | NA | 5500024035735011708567.H04 | 5500024037498121108438.H10 |
| 60 | Validation | NA | 5500024051859013109328.E06 | 5500024037498121108438.E12 |
| 61 | Validation | NA | 5500024031723100807772.F01 | 5500024030700072107986.F01 |
| 62 | Validation | NA | 5500024030403071907255.G01 | 5500024031723100807775.G01 |
| 63 | Validation | NA | 5500024035735011708571.B07 | 5500024034290101707046.B07 |
| 64 | Validation | NA | 5500024035736031208617.F05 | 5500024035736031208617.C09 |
| 65 | Validation | NA | 5500024024213121906558.C01 | 5500024030700072107991.F11 |
| 66 | Validation | NA | 5500024030760072207031.B10 | 5500024030700072107991.B10 |
| 67 | Validation | NA | 5500024030403071907253.D12 | 5500024031723100807771.D12 |
| 68 | Validation | NA | 5500024034290101707045.A03 | 5500024031330081907229.A03 |
| 69 | Validation | NA | 5500024030402071707277.A02 | 5500024030760072207026.A08 |
| 70 | Validation | NA | 5500024030403071907256.G05 | 5500024030760072207026.G11 |
| 71 | Validation | NA | 5500024030402071707279.B04 | 5500024031723100807776.B04 |
| 72 | Validation | NA | 5500024030403071907255.E01 | 5500024031723100807775.E01 |
| 73 | Validation | NA | 5500024037498121108437.G05 | 5500024034290101707044.G05 |
| 74 | Validation | NA | 5500024028849050407169.B12 | 5500024034290101707046.B06 |
| 75 | Validation | NA | 5500024035101021908499.C04 | 5500024037498121108440.C10 |
| 76 | Validation | NA | 5500024024214122006606.H08 | 5500024024213121906561.H08 |
| 77 | Validation | NA | 5500024035101021908499.E08 | 5500024031723100807773.E08 |
| 78 | Validation | NA | 5500024031723100807770.H06 | 5500024031723100807769.H06 |
| 79 | Validation | NA | 5500024031330081907228.E11 | 5500024030760072207026.E11 |
| 80 | Validation | NA | 5500024034290101707045.E11 | 5500024024213121906564.E05 |
| 81 | Validation | NA | 5500024032848101507997.D09 | 5500024035736031208613.D09 |
| 82 | Validation | NA | 5500024031330081907228.H02 | 5500024030760072207026.H02 |
| 83 | Validation | NA | 5500024031723100807772.F06 | 5500024030700072107986.F06 |
| 84 | Validation | NA | 5500024024213121906562.C04 | 5500024017802120306174.C10 |
| 85 | Validation | NA | 5500024024213121906562.G03 | 5500024017802120306174.G09 |
| 86 | Validation | NA | 5500024053146032309538.H03 | 5500024053146032309538.H09 |
Extract the expression data for each training sample, grouping arrays by platform type.
missing_drug <- NULL # collects the IDs of samples whose .CEL file cannot be found.
Final_matrix_MCF7 <- NULL
Final_matrix_MCF7_type2 <- NULL
# Reserved space for an unclassified dataset; not used in this study so far.
# Unclassify_data <- list(); n <- 1
for (i in 1:nrow(sample_train)) {
tmp_MCF_cel = sample_train$MCF7[i]
# Unzip bzfile if necessary
file_handle = paste("data_raw/", tmp_MCF_cel, ".cel.bz2", sep = "")
if (file.exists(file_handle)) {
# remove any leftover temporary file from a previously interrupted run.
if (file.exists(paste("data_raw/", tmp_MCF_cel, ".cel.tmp", sep = ""))) {
file.remove(paste("data_raw/", tmp_MCF_cel, ".cel.tmp", sep = ""))
}
# Unzip bz file
bunzip2(file_handle)
}
#
file_handle_2 = paste("data_raw/", tmp_MCF_cel, ".cel", sep = "")
if (file.exists(file_handle_2)) {
if (norm_method == "MAS5") {
data <- read.affybatch(file_handle_2)
data_mas5 <- mas5(data, verbose = F)
data_exp <- exprs(data_mas5)
} else if (norm_method == "RMA") {
## If using RMA
data_rma <- just.rma(file_handle_2) # rma normalization
data_exp <- exprs(data_rma)
}
if (is.null(nrow(Final_matrix_MCF7)) || nrow(Final_matrix_MCF7) == nrow(data_exp)) {
Final_matrix_MCF7 <- cbind(Final_matrix_MCF7, data_exp)
} else if (is.null(nrow(Final_matrix_MCF7_type2)) || nrow(Final_matrix_MCF7_type2) ==
nrow(data_exp)) {
Final_matrix_MCF7_type2 <- cbind(Final_matrix_MCF7_type2, data_exp)
} else {
cat("Cannot match any existed ArrayType!\n")
}
# Unclassify_data[[n]] <- data_exp; n <- n + 1
} else {
cat(paste("There is no cel file named ", file_handle_2, "\n", sep = ""))
missing_drug <- cbind(missing_drug, sample_train$ID[i])
}
}
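Before merging, it is worth verifying that the two platform matrices have the expected shapes (a sketch; the probe counts come from the platform description below and assume no .CEL files were missing):
dim(Final_matrix_MCF7) # expected: 22283 rows (HG-U133A)
dim(Final_matrix_MCF7_type2) # expected: 22277 rows (HG-U133A2)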
In total, 190 drug samples are extracted.
Two microarray platforms are used in this study: HG-U133A and HG-U133A2.
HG-U133A has 22,283 probes; HG-U133A2 has 22,277.
Therefore, we combine the two platforms based on their common probes (22,277).
common_probe <- intersect(rownames(Final_matrix_MCF7), rownames(Final_matrix_MCF7_type2))
Final_matrix_MCF7_1 <- Final_matrix_MCF7[common_probe,] # subset both platforms to the shared probes
Final_matrix_MCF7_2 <- Final_matrix_MCF7_type2[common_probe,] # so that the rows align before cbind
data_MCF7 <- cbind(Final_matrix_MCF7_1, Final_matrix_MCF7_2)
# Remove temporary tables
# rm(Final_matrix_MCF7, Final_matrix_MCF7_1, Final_matrix_MCF7_2, Final_matrix_MCF7_type2, data_exp)
# (Optional) Prefix row names with "F_" so probe names do not start with a number.
# rownames(data_MCF7) = sapply(rownames(data_MCF7), function(x) paste("F",x,sep="_"))
The final dataset is generated by combining the expression variables with the endpoint labels.
The size of the final data matrix is:
# Get Label information and generate the Total Training Dataset
data_train_MCF7 <- mutate(as.data.frame(t(data_MCF7)), Label=Label)
dim(data_train_MCF7)
# 190 22278
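An optional check on the class balance of the merged training set (no expected counts are asserted here, since they are not stated in the text):
table(data_train_MCF7$Label) # Positive vs. Negative counts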
Extract the expression data for each training sample, grouping arrays by platform type.
missing_drug <- NULL # collects the IDs of samples whose .CEL file cannot be found.
Final_matrix_PC3 <- NULL
Final_matrix_PC3_type2 <- NULL
# Reserved space for an unclassified dataset; not used in this study so far.
# Unclassify_data <- list(); n <- 1
for (i in 1:nrow(sample_train)) {
tmp_PC3_cel = sample_train$PC3[i]
# Unzip bzfile if necessary
file_handle = paste("data_raw/", tmp_PC3_cel, ".cel.bz2", sep = "")
if (file.exists(file_handle)) {
# remove any leftover temporary file from a previously interrupted run.
if (file.exists(paste("data_raw/", tmp_PC3_cel, ".cel.tmp", sep = ""))) {
file.remove(paste("data_raw/", tmp_PC3_cel, ".cel.tmp", sep = ""))
}
# Unzip bz file
bunzip2(file_handle)
}
#
file_handle_2 = paste("data_raw/", tmp_PC3_cel, ".cel", sep = "")
if (file.exists(file_handle_2)) {
if (norm_method == "MAS5") {
data <- read.affybatch(file_handle_2)
data_mas5 <- mas5(data, verbose = F)
data_exp <- exprs(data_mas5)
} else if (norm_method == "RMA") {
## If using RMA
data_rma <- just.rma(file_handle_2) # rma normalization
data_exp <- exprs(data_rma)
}
if (is.null(nrow(Final_matrix_PC3)) || nrow(Final_matrix_PC3) == nrow(data_exp)) {
Final_matrix_PC3 <- cbind(Final_matrix_PC3, data_exp)
} else if (is.null(nrow(Final_matrix_PC3_type2)) || nrow(Final_matrix_PC3_type2) ==
nrow(data_exp)) {
Final_matrix_PC3_type2 <- cbind(Final_matrix_PC3_type2, data_exp)
} else {
cat("Cannot match any existed ArrayType!\n")
}
# Unclassify_data[[n]] <- data_exp; n <- n + 1
} else {
cat(paste("There is no cel file named ", file_handle_2, "\n", sep = ""))
missing_drug <- cbind(missing_drug, sample_train$ID[i])
}
}
In total, 190 drug samples are extracted.
As with MCF7, we combine the two microarray platforms based on their common probes (22,277).
common_probe <- intersect(rownames(Final_matrix_PC3), rownames(Final_matrix_PC3_type2))
Final_matrix_PC3_1 <- Final_matrix_PC3[common_probe,] # subset both platforms to the shared probes
Final_matrix_PC3_2 <- Final_matrix_PC3_type2[common_probe,] # so that the rows align before cbind
data_PC3 <- cbind(Final_matrix_PC3_1, Final_matrix_PC3_2)
# Remove temporary tables
# rm(Final_matrix_PC3, Final_matrix_PC3_1, Final_matrix_PC3_2, Final_matrix_PC3_type2, data_exp)
# (Optional) Prefix row names with "F_" so probe names do not start with a number.
# rownames(data_PC3) = sapply(rownames(data_PC3), function(x) paste("F",x,sep="_"))
The final dataset is generated by combining the expression variables with the endpoint labels.
The size of the final data matrix is:
# Get Label information and generate the Total Training Dataset
data_train_PC3 <- mutate(as.data.frame(t(data_PC3)), Label=Label)
dim(data_train_PC3)
# 190 22278
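Because both matrices are built by iterating over the same sample_train rows, the two Label columns should agree; a one-line sanity check (valid only if no .CEL files were reported missing for either cell line):
identical(data_train_MCF7$Label, data_train_PC3$Label) # expect TRUE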
Training drugs are extracted. Don't forget to save your data file, or you will have to run the extraction again.
save(file="GDILI_traindata_mas5.RData",data_train_MCF7,data_train_PC3,sample_train)
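To resume later without re-running the lengthy extraction, reload the saved objects (assuming the same working directory):
# load("GDILI_traindata_mas5.RData") # restores data_train_MCF7, data_train_PC3, sample_train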
Here is the list of R packages used in this process. You may need to install them with install.packages() (or from Bioconductor, for limma and affy) before loading them.
library(xlsx)
library(R.utils)
library(randomForest)
library(ggplot2)
library(caret)
library(dplyr)
library(tidyr)
library(kernlab)
library(DMwR)
library(knitr)
library(kableExtra)
# source("https://bioconductor.org/biocLite.R")
# biocLite("limma")
library(limma)
# biocLite("affy")
library(affy)
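Note that biocLite() belongs to older Bioconductor releases; on current releases the equivalent install (an alternative, not the commands used in the original workflow) is:
# if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install(c("limma", "affy"))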