This project uses the GSE15402 series from the Gene Expression Omnibus (GEO) of the National Center for Biotechnology Information (NCBI). It explores autism and has 3 different autism types, using lymphoblastoid cell lines (LCL) from many autistic and control patients. The study is a gene chip study with a published article last updated in 2019, but the data was uploaded to GEO in 2009; you can tell by the GSE ID, which has only 5 digits, whereas the newer series I have worked on, from publications of 2024-2026, have 6 digits. The study discovered 5 genes according to the published article, which I have not read yet but will if I need more information, which is likely. The authors said that androgens seem to be impacted, along with genes related to circadian rhythm, in autistic people. The diagnosis of Autism Spectrum Disorder is made clinically, using a questionnaire together with objective findings in clinic.

I have never worked with MEV files, but it says that they are just tab-delimited and that the read.delim function can open each one. There were 133 separate zipped files, each extracted to its own folder with the file inside. I manually unzipped each one in Windows and then manually copied each file to a folder called ‘files unzipped no folders’ to read from. Let's look at the series text data first.

library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 4.5.3
# Series-level header of the GEO series matrix: only the first 35 lines are
# read (row 32, for example, holds the !Series_platform_id field).
series_36 <- read.table("GSE15402_series_matrix.txt", nrows = 35)

paged_table(series_36)

# Sample-level metadata: skip the first 36 lines, then read the next 9 rows.
# `nrows` is written out in full instead of relying on partial argument
# matching of `nrow`.
series9 <- read.table("GSE15402_series_matrix.txt", skip = 36, nrows = 9)

paged_table(series9)

We know the sample ID and the sample type for our project are in the first 2 rows of the 2nd table, which we named series9.

# Keep the first two rows of the sample metadata and promote the values in
# row 1 to column names.
dataframe <- head(series9, 2)
names(dataframe) <- as.character(series9[1, ])
paged_table(dataframe)

# Row 2 without the leading label column: columns 2 to 117, one per sample.
data <- dataframe[2, 2:117]

paged_table(data)

Let's make our class vector to use later in predicting the class. It's nice that these are ordered, but not necessary, as grep will save the indices under that name. You can enter each name and it gives the indices, or you can just write the name of the grep object in the brackets, where it acts as a placeholder for the columns of that type by index.

# Column positions of each sample group, found by matching the group keyword
# in the column names of `data`.
control <- grep("control", colnames(data))
language <- grep("language", colnames(data))
mild <- grep("mild", colnames(data))
savant <- grep("savant", colnames(data))

# Build the class label vector from the matched positions rather than from
# hard-coded index ranges, so the labels always follow the column names.
class <- rep(NA_character_, ncol(data))
class[control] <- "control"
class[language] <- "language"
class[mild] <- "mild"
class[savant] <- "savant"

# Every sample must belong to exactly one group: no unlabelled columns and
# no column matched by more than one keyword.
stopifnot(
  !anyNA(class),
  anyDuplicated(c(control, language, mild, savant)) == 0L
)

class
##   [1] "control"  "control"  "control"  "control"  "control"  "control" 
##   [7] "control"  "control"  "control"  "control"  "control"  "control" 
##  [13] "control"  "control"  "control"  "control"  "control"  "control" 
##  [19] "control"  "control"  "control"  "control"  "control"  "control" 
##  [25] "control"  "control"  "control"  "control"  "control"  "language"
##  [31] "language" "language" "language" "language" "language" "language"
##  [37] "language" "language" "language" "language" "language" "language"
##  [43] "language" "language" "language" "language" "language" "language"
##  [49] "language" "language" "language" "language" "language" "language"
##  [55] "language" "language" "language" "language" "language" "language"
##  [61] "mild"     "mild"     "mild"     "mild"     "mild"     "mild"    
##  [67] "mild"     "mild"     "mild"     "mild"     "mild"     "mild"    
##  [73] "mild"     "mild"     "mild"     "mild"     "mild"     "mild"    
##  [79] "mild"     "mild"     "mild"     "mild"     "mild"     "mild"    
##  [85] "mild"     "mild"     "savant"   "savant"   "savant"   "savant"  
##  [91] "savant"   "savant"   "savant"   "savant"   "savant"   "savant"  
##  [97] "savant"   "savant"   "savant"   "savant"   "savant"   "savant"  
## [103] "savant"   "savant"   "savant"   "savant"   "savant"   "savant"  
## [109] "savant"   "savant"   "savant"   "savant"   "savant"   "savant"  
## [115] "savant"   "savant"

We don’t have our genes yet. But we do have a folder for it. Lets go into the folder and open up the files one by one.

# Relative path of the folder holding the manually unzipped .mev files.
path <- "files unzipped no folders/"

# List the folder directly instead of changing the working directory: a later
# setwd() on the same relative path fails once the session is already inside
# the folder. The returned file names are the same either way.
list.files(path)
##   [1] "GSM386518.mev" "GSM386519.mev" "GSM386520.mev" "GSM386521.mev"
##   [5] "GSM386522.mev" "GSM386523.mev" "GSM386524.mev" "GSM386525.mev"
##   [9] "GSM386526.mev" "GSM386527.mev" "GSM386528.mev" "GSM386529.mev"
##  [13] "GSM386530.mev" "GSM386531.mev" "GSM386532.mev" "GSM386533.mev"
##  [17] "GSM386534.mev" "GSM386535.mev" "GSM386536.mev" "GSM386537.mev"
##  [21] "GSM386538.mev" "GSM386539.mev" "GSM386540.mev" "GSM386541.mev"
##  [25] "GSM386542.mev" "GSM386543.mev" "GSM386544.mev" "GSM386545.mev"
##  [29] "GSM386546.mev" "GSM386547.mev" "GSM386548.mev" "GSM386549.mev"
##  [33] "GSM386550.mev" "GSM386551.mev" "GSM386552.mev" "GSM386553.mev"
##  [37] "GSM386554.mev" "GSM386555.mev" "GSM386556.mev" "GSM386557.mev"
##  [41] "GSM386558.mev" "GSM386559.mev" "GSM386560.mev" "GSM386561.mev"
##  [45] "GSM386562.mev" "GSM386563.mev" "GSM386564.mev" "GSM386565.mev"
##  [49] "GSM386566.mev" "GSM386567.mev" "GSM386568.mev" "GSM386569.mev"
##  [53] "GSM386570.mev" "GSM386571.mev" "GSM386572.mev" "GSM386573.mev"
##  [57] "GSM386574.mev" "GSM386575.mev" "GSM386576.mev" "GSM386577.mev"
##  [61] "GSM386578.mev" "GSM386579.mev" "GSM386580.mev" "GSM386581.mev"
##  [65] "GSM386582.mev" "GSM386583.mev" "GSM386584.mev" "GSM386585.mev"
##  [69] "GSM386586.mev" "GSM386587.mev" "GSM386588.mev" "GSM386589.mev"
##  [73] "GSM386590.mev" "GSM386591.mev" "GSM386592.mev" "GSM386593.mev"
##  [77] "GSM386594.mev" "GSM386595.mev" "GSM386596.mev" "GSM386597.mev"
##  [81] "GSM386598.mev" "GSM386599.mev" "GSM386600.mev" "GSM386601.mev"
##  [85] "GSM386602.mev" "GSM386603.mev" "GSM386604.mev" "GSM386605.mev"
##  [89] "GSM386606.mev" "GSM386607.mev" "GSM386608.mev" "GSM386609.mev"
##  [93] "GSM386610.mev" "GSM386611.mev" "GSM386612.mev" "GSM386613.mev"
##  [97] "GSM386614.mev" "GSM386615.mev" "GSM386616.mev" "GSM386617.mev"
## [101] "GSM386618.mev" "GSM386619.mev" "GSM386620.mev" "GSM386621.mev"
## [105] "GSM386622.mev" "GSM386623.mev" "GSM386624.mev" "GSM386625.mev"
## [109] "GSM386626.mev" "GSM386627.mev" "GSM386628.mev" "GSM386629.mev"
## [113] "GSM386630.mev" "GSM386631.mev" "GSM386632.mev" "GSM386633.mev"

There are 116 files that are mev type in this folder, each one extracted from a folder that it was unzipped to individually.

# Read the first sample's MEV file. Text after '#' is treated as a comment and
# the first remaining line supplies the column names.
# `path` already ends in "/", so paste0() gives the file location without
# having to change the working directory.
file1 <- read.table(
  paste0(path, "GSM386518.mev"),
  header = TRUE,
  comment.char = "#"
)

dim(file1)
## [1] 41472    18
paged_table(file1[41470:41472, ])

The columns of each file look like microarray chip readings. UID must be the row number, C must be count, and IA and IB are the integrated intensities for Cy3 and Cy5, respectively, in the machine. I looked up the last few on the internet and got all the columns:

Nowhere in this data do I see a gene to go with it. The internet says it's in the Data section, but there was no Data section when I removed the ‘#’ comment tags to read in the file. Each of the 116 files has one to read in.

These UIDs are unique identifiers of the TIGR system, which has a table that translates the UID into the GenBank accession number in the GPL, or gene platform, used; the platform is given in the series information. In series_36 it's line 32.

# Row 32 of the series header holds the platform ID.
series_36[32, , drop = FALSE]
##                     V1      V2
## 32 !Series_platform_id GPL3427

We can download the GPL3427 platform and read it in.

# Header portion of the platform family SOFT file (first 235 rows).
# `nrows` is written out in full instead of relying on partial argument
# matching of `nrow`.
GPL3427 <- read.delim("GPL3427_family.soft", nrows = 235)

paged_table(GPL3427) # 235 X 1

GenBank accession IDs are the nucleotide sequence references: if you enter one into the NCBI search across all databases, the GenBank accession ID takes you to a report of the nucleotide sequence, or to where it is referenced as a locus on a chromosome, similar to an address on the chromosome. The very first hundreds of thousands of rows had empty SPOT_IDs, so I thought it was a problem with the delimiter reading in the commas in the ID. But I checked, and GB_ACC is the correct entry, and further down the 8-million-row platform dataframe there are SPOT_ID entries that are numeric or ‘null’, not just empty. The ID must be the row and column of the array, so a matrix ID by row and column, which is probably why it is separated by a comma.

# Read the table that follows the header lines of the family SOFT file.
# NOTE(review): skip = 235 drops one row too many (the table then starts at
# ID "1,2" instead of "1,1"); the read is repeated with skip = 234 further
# down. It is kept here because the output printed below was produced with it.
platform <- read.delim("GPL3427_family.soft", header = FALSE, skip = 235)

names(platform) <- c("ID", "GB_ACC", "SPOT_ID")

paged_table(platform[1800000:1800010, ]) # 8180580 X 3

And if you look at the one MEV file we opened, the R and C columns, for row and column, match up with the ID; the row number of the platform could be the UID, or unique identifier.

# First five spots of the first sample file.
head(file1, 5)
##   UID     IA     IB R C MR MC SR SC FlagA FlagB SA SF QCscore    QCA    QCB
## 1   1      0      0 1 1  1  1  1  1     X     X 25  0  0.0000 0.0000 0.0000
## 2   2 117298 119918 1 2  1  1  1  2     B     B 36  1  0.9860 0.9860 0.9860
## 3   3  96678 233351 1 3  1  1  1  3     C     C 50  1  0.5967 0.2367 0.9567
## 4   4  61484  64172 1 4  1  1  1  4     B     B 49  1  0.7890 0.7806 0.7974
## 5   5  40932  22366 1 5  1  1  1  5     B     B 44  1  0.7640 0.7731 0.7549
##    BkgA  BkgB
## 1     0     0
## 2 15876 16200
## 3 20100 21850
## 4 18130 22687
## 5 17072 16192
# First five rows of the platform table, for comparison with the sample file.
head(platform, 5)
##    ID   GB_ACC SPOT_ID
## 1 1,2 AA486138        
## 2 1,3   N51018        
## 3 1,4   H65481        
## 4 1,5   T98628        
## 5 1,6   N34799

The row names can be used to see where the row and column are, but the first 5 rows of the file are the first row, going down the columns, excluding the first row and first column entry, which is 0, or no intensity value, in channel 1 (IA) and channel 2 (IB). We would then have to know what the values are; it looks like the Flag columns give the saturation of pixels at that row and column for channels A and B, along with quality control scores. There is a combined score and an individual score per channel. We can go back to the series information to see which channel they used to get gene expression values. I have seen channel 1 used a lot, but there are so many machines.

Lets go back to the series9 dataframe of empty but meta data to each 116 samples with GSM ID.

# Sample-level metadata again: 9 rows X 117 columns.
paged_table(series9)

The Sample_channel_count row shows that 2 channels were used for each sample (each sample is a column). So the gene expression value is probably the QCscore that combines both channels.

The tail of file1, one of the 116 files we read in, should show the number of rows and columns to compare against the platform table.

# Last three spots of the first sample file (rows 41470 to 41472 of 41472).
paged_table(tail(file1, 3))

There are 384 rows and 108 columns, so the platform ID of the last entry will be 384,108. Let's see what the platform looks like at 384,108 in the ID column.

# Keep every platform row whose ID is the last spot on the array
# (row 384, column 108), then print the result.
platform_384_108 <- platform[platform[["ID"]] == "384,108", , drop = FALSE]
platform_384_108
##              ID    GB_ACC     SPOT_ID
## 41471   384,108                 blank
## 82989   384,108                      
## 124507  384,108                      
## 166025  384,108                      
## 207545  384,108                      
## 249065  384,108                      
## 290585  384,108                      
## 332105  384,108                      
## 373625  384,108                      
## 415145  384,108                      
## 456665  384,108                      
## 498185  384,108                      
## 539705  384,108                      
## 581225  384,108                      
## 622745  384,108                      
## 664265  384,108                      
## 705785  384,108                      
## 747305  384,108                      
## 788825  384,108                      
## 830345  384,108                      
## 871869  384,108      null        null
## 913393  384,108      null        null
## 954917  384,108      null        null
## 996441  384,108      null        null
## 1037965 384,108      null        null
## 1079489 384,108      null        null
## 1121013 384,108      null        null
## 1162537 384,108      null        null
## 1204061 384,108      null        null
## 1245585 384,108      null        null
## 1287109 384,108      null        null
## 1328633 384,108      null        null
## 1370157 384,108      null        null
## 1411681 384,108      null        null
## 1453205 384,108      null        null
## 1494729 384,108      null        null
## 1536253 384,108      null        null
## 1577777 384,108 -0.117222 0.117222026
## 1619301 384,108 -0.919426   0.9194264
## 1660825 384,108 -0.550725   0.5507253
## 1702349 384,108      null        null
## 1743873 384,108      null        null
## 1785397 384,108      null        null
## 1826921 384,108      null        null
## 1868445 384,108      null        null
## 1909969 384,108      null        null
## 1951493 384,108      null        null
## 1993017 384,108      null        null
## 2034541 384,108      null        null
## 2076065 384,108      null        null
## 2117589 384,108      null        null
## 2159113 384,108      null        null
## 2200637 384,108      null        null
## 2242161 384,108      null        null
## 2283685 384,108      null        null
## 2325209 384,108      null        null
## 2366733 384,108      null        null
## 2408257 384,108      null        null
## 2449781 384,108      null        null
## 2491305 384,108      null        null
## 2532829 384,108      null        null
## 2574353 384,108      null        null
## 2615877 384,108      null        null
## 2657401 384,108      null        null
## 2698925 384,108      null        null
## 2740449 384,108      null        null
## 2781973 384,108      null        null
## 2823497 384,108      null        null
## 2865021 384,108      null        null
## 2906545 384,108      null        null
## 2948069 384,108      null        null
## 2989593 384,108      null        null
## 3031117 384,108      null        null
## 3072641 384,108      null        null
## 3114165 384,108      null        null
## 3155689 384,108      null        null
## 3197213 384,108      null        null
## 3238737 384,108      null        null
## 3280261 384,108      null        null
## 3321785 384,108      null        null
## 3363309 384,108      null        null
## 3404833 384,108      null        null
## 3446357 384,108      null        null
## 3487881 384,108      null        null
## 3529405 384,108      null        null
## 3570929 384,108      null        null
## 3612453 384,108  -1.10021   1.1002067
## 3653977 384,108      null        null
## 3695501 384,108      null        null
## 3737025 384,108      null        null
## 3778549 384,108      null        null
## 3820073 384,108      null        null
## 3861597 384,108      null        null
## 3903121 384,108      null        null
## 3944645 384,108      null        null
## 3986169 384,108      null        null
## 4027693 384,108      null        null
## 4069217 384,108      null        null
## 4110741 384,108      null        null
## 4152265 384,108      null        null
## 4193789 384,108      null        null
## 4235313 384,108      null        null
## 4276837 384,108      null        null
## 4318361 384,108      null        null
## 4359885 384,108      null        null
## 4401409 384,108      null        null
## 4442933 384,108      null        null
## 4484457 384,108      null        null
## 4525981 384,108      null        null
## 4567505 384,108      null        null
## 4609029 384,108      null        null
## 4650553 384,108      null        null
## 4692077 384,108      null        null
## 4733601 384,108      null        null
## 4775125 384,108      null        null
## 4816649 384,108      null        null
## 4858173 384,108      null        null
## 4899697 384,108      null        null
## 4941221 384,108      null        null
## 4982745 384,108      null        null
## 5024269 384,108      null        null
## 5065793 384,108      null        null
## 5107317 384,108      null        null
## 5148841 384,108      null        null
## 5190365 384,108      null        null
## 5231889 384,108      null        null
## 5273413 384,108      null        null
## 5314937 384,108      null        null
## 5356461 384,108      null        null
## 5397985 384,108      null        null
## 5439509 384,108      null        null
## 5481033 384,108      null        null
## 5522557 384,108      null        null
## 5564081 384,108      null        null
## 5605605 384,108      null        null
## 5647129 384,108      null        null
## 5688655 384,108                      
## 5730182 384,108                      
## 5771709 384,108                      
## 5813236 384,108                      
## 5854763 384,108                      
## 5896290 384,108                      
## 5937816 384,108 -0.380484  0.38048366
## 5979343 384,108                      
## 6020870 384,108                      
## 6062397 384,108                      
## 6103923 384,108                      
## 6145450 384,108                      
## 6186976 384,108                      
## 6228502 384,108                      
## 6270028 384,108                      
## 6311554 384,108                      
## 6353080 384,108                      
## 6394606 384,108                      
## 6436132 384,108                      
## 6477658 384,108                      
## 6519184 384,108                      
## 6560710 384,108                      
## 6602236 384,108                      
## 6643762 384,108                      
## 6685288 384,108                      
## 6726814 384,108                      
## 6768340 384,108                      
## 6809866 384,108                      
## 6851392 384,108                      
## 6892918 384,108                      
## 6934444 384,108                      
## 6975970 384,108                      
## 7017496 384,108                      
## 7059022 384,108                      
## 7100548 384,108                      
## 7142074 384,108                      
## 7183600 384,108                      
## 7225126 384,108                      
## 7266653 384,108      null            
## 7308180 384,108      null            
## 7349707 384,108      null            
## 7391234 384,108      null            
## 7432761 384,108      null            
## 7474288 384,108      null            
## 7515815 384,108      null            
## 7557342 384,108      null            
## 7598869 384,108      null            
## 7640396 384,108      null            
## 7681923 384,108      null            
## 7723450 384,108      null            
## 7764977 384,108      null            
## 7806504 384,108      null            
## 7848031 384,108      null            
## 7889558 384,108      null            
## 7931085 384,108      null            
## 7972612 384,108      null            
## 8014139 384,108      null            
## 8055666 384,108      null            
## 8097193 384,108      null            
## 8138720 384,108      null            
## 8180247 384,108      null

This is not the tail of the platform, because the platform has many entries for the last row and column of the file. But the ID in the platform is the unique ID for this series, as platforms (GPLs) are shared by multiple series, and this is the catalog of its entries when using this platform and identifying the machine values.

The rows are off by 1: it looks like I cut off 1 extra line when reading in the platform file. Let's try it again, since I probably skipped one too many lines. As noted before, the ID is the row and column of the array, a matrix ID, which is probably why it is separated by a comma.

# Re-read the platform table, skipping one line fewer (234 instead of 235) so
# that the first table row is no longer dropped.
platform <- read.delim("GPL3427_family.soft", header = FALSE, skip = 234)

colnames(platform) <- c("ID", "GB_ACC", "SPOT_ID")

# Row 41472 should now be ID 384,108, matching the last row of the sample file.
paged_table(platform[41470:41472, ])

That works! The row number in file1 is now the same as in the platform, as it should be; when I originally read in the platform I skipped too many lines. As we see above, it is corrected, and row 41472 is the ID 384,108, for row 384 and column 108 of the file.

Lets see if at least one more file also has this same amount of rows.

# Read a second sample file the same way to confirm it has the same layout.
# `path` already ends in "/", so paste0() gives the file location without
# having to change the working directory.
file2 <- read.table(
  paste0(path, "GSM386519.mev"),
  header = TRUE,
  comment.char = "#"
)

paged_table(file2[41470:41472, ])

It looks the same.

Now let's remove file1 and file2 and start reading in each file by its file name from this list, and double-check whether the QCscore is the gene expression reading for that ID.

Let's look at all the files we must read in individually. A for loop would be great for this, but I forgot how to do that. Maybe AI has a solution.

# List the sample files without changing the working directory; the returned
# file names are the same as with setwd(path) followed by list.files().
list.files(path)
##   [1] "GSM386518.mev" "GSM386519.mev" "GSM386520.mev" "GSM386521.mev"
##   [5] "GSM386522.mev" "GSM386523.mev" "GSM386524.mev" "GSM386525.mev"
##   [9] "GSM386526.mev" "GSM386527.mev" "GSM386528.mev" "GSM386529.mev"
##  [13] "GSM386530.mev" "GSM386531.mev" "GSM386532.mev" "GSM386533.mev"
##  [17] "GSM386534.mev" "GSM386535.mev" "GSM386536.mev" "GSM386537.mev"
##  [21] "GSM386538.mev" "GSM386539.mev" "GSM386540.mev" "GSM386541.mev"
##  [25] "GSM386542.mev" "GSM386543.mev" "GSM386544.mev" "GSM386545.mev"
##  [29] "GSM386546.mev" "GSM386547.mev" "GSM386548.mev" "GSM386549.mev"
##  [33] "GSM386550.mev" "GSM386551.mev" "GSM386552.mev" "GSM386553.mev"
##  [37] "GSM386554.mev" "GSM386555.mev" "GSM386556.mev" "GSM386557.mev"
##  [41] "GSM386558.mev" "GSM386559.mev" "GSM386560.mev" "GSM386561.mev"
##  [45] "GSM386562.mev" "GSM386563.mev" "GSM386564.mev" "GSM386565.mev"
##  [49] "GSM386566.mev" "GSM386567.mev" "GSM386568.mev" "GSM386569.mev"
##  [53] "GSM386570.mev" "GSM386571.mev" "GSM386572.mev" "GSM386573.mev"
##  [57] "GSM386574.mev" "GSM386575.mev" "GSM386576.mev" "GSM386577.mev"
##  [61] "GSM386578.mev" "GSM386579.mev" "GSM386580.mev" "GSM386581.mev"
##  [65] "GSM386582.mev" "GSM386583.mev" "GSM386584.mev" "GSM386585.mev"
##  [69] "GSM386586.mev" "GSM386587.mev" "GSM386588.mev" "GSM386589.mev"
##  [73] "GSM386590.mev" "GSM386591.mev" "GSM386592.mev" "GSM386593.mev"
##  [77] "GSM386594.mev" "GSM386595.mev" "GSM386596.mev" "GSM386597.mev"
##  [81] "GSM386598.mev" "GSM386599.mev" "GSM386600.mev" "GSM386601.mev"
##  [85] "GSM386602.mev" "GSM386603.mev" "GSM386604.mev" "GSM386605.mev"
##  [89] "GSM386606.mev" "GSM386607.mev" "GSM386608.mev" "GSM386609.mev"
##  [93] "GSM386610.mev" "GSM386611.mev" "GSM386612.mev" "GSM386613.mev"
##  [97] "GSM386614.mev" "GSM386615.mev" "GSM386616.mev" "GSM386617.mev"
## [101] "GSM386618.mev" "GSM386619.mev" "GSM386620.mev" "GSM386621.mev"
## [105] "GSM386622.mev" "GSM386623.mev" "GSM386624.mev" "GSM386625.mev"
## [109] "GSM386626.mev" "GSM386627.mev" "GSM386628.mev" "GSM386629.mev"
## [113] "GSM386630.mev" "GSM386631.mev" "GSM386632.mev" "GSM386633.mev"

We will do that next time. This is part 1. Keep checking in.