Final Project Data Processing

##Load in file

library(GEOquery)

## Loading required package: Biobase

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Setting options('download.file.method.GEOquery'='auto')

## Setting options('GEOquery.inmemory.gpl'=FALSE)

file_path <- "~/Downloads/GSE22148_series_matrix.txt"

gse <- getGEO(filename = file_path)

expr_mat <- exprs(gse)
pheno <- pData(gse)

##Sanity

class(gse)

## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"

dim(expr_mat)

## [1] 54675   143

dim(pheno)

## [1] 143  44

colnames(pheno)

##  [1] "title"                   "geo_accession"          
##  [3] "status"                  "submission_date"        
##  [5] "last_update_date"        "type"                   
##  [7] "channel_count"           "source_name_ch1"        
##  [9] "organism_ch1"            "characteristics_ch1"    
## [11] "characteristics_ch1.1"   "characteristics_ch1.2"  
## [13] "characteristics_ch1.3"   "characteristics_ch1.4"  
## [15] "characteristics_ch1.5"   "characteristics_ch1.6"  
## [17] "molecule_ch1"            "extract_protocol_ch1"   
## [19] "label_ch1"               "label_protocol_ch1"     
## [21] "taxid_ch1"               "hyb_protocol"           
## [23] "scan_protocol"           "description"            
## [25] "data_processing"         "platform_id"            
## [27] "contact_name"            "contact_email"          
## [29] "contact_phone"           "contact_department"     
## [31] "contact_institute"       "contact_address"        
## [33] "contact_city"            "contact_zip/postal_code"
## [35] "contact_country"         "supplementary_file"     
## [37] "data_row_count"          "age:ch1"                
## [39] "batch:ch1"               "gender:ch1"             
## [41] "gold stage:ch1"          "group:ch1"              
## [43] "hyb date:ch1"            "plateid:ch1"

head(pheno[,1:10])

##                                                           title geo_accession
## GSM550785 Sputum from Stage II Moderate, biological replicate 1     GSM550785
## GSM550786 Sputum from Stage II Moderate, biological replicate 2     GSM550786
## GSM550787  Sputum from Stage III Severe, biological replicate 1     GSM550787
## GSM550788 Sputum from Stage II Moderate, biological replicate 3     GSM550788
## GSM550789 Sputum from Stage II Moderate, biological replicate 4     GSM550789
## GSM550790 Sputum from Stage II Moderate, biological replicate 5     GSM550790
##                          status submission_date last_update_date type
## GSM550785 Public on Jun 08 2010     Jun 04 2010      Jun 07 2010  RNA
## GSM550786 Public on Jun 08 2010     Jun 04 2010      Jun 07 2010  RNA
## GSM550787 Public on Jun 08 2010     Jun 04 2010      Jun 07 2010  RNA
## GSM550788 Public on Jun 08 2010     Jun 04 2010      Jun 07 2010  RNA
## GSM550789 Public on Jun 08 2010     Jun 04 2010      Jun 07 2010  RNA
## GSM550790 Public on Jun 08 2010     Jun 04 2010      Jun 07 2010  RNA
##           channel_count                    source_name_ch1 organism_ch1
## GSM550785             1 Sputum from COPD Stage II Moderate Homo sapiens
## GSM550786             1 Sputum from COPD Stage II Moderate Homo sapiens
## GSM550787             1  Sputum from COPD Stage III Severe Homo sapiens
## GSM550788             1 Sputum from COPD Stage II Moderate Homo sapiens
## GSM550789             1 Sputum from COPD Stage II Moderate Homo sapiens
## GSM550790             1 Sputum from COPD Stage II Moderate Homo sapiens
##           characteristics_ch1
## GSM550785             age: 69
## GSM550786             age: 57
## GSM550787             age: 68
## GSM550788             age: 71
## GSM550789             age: 57
## GSM550790             age: 65

summary(as.numeric(expr_mat))

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##  -662.65    18.71    49.70   361.28   238.87 22603.40      429

##NA Removal

expr_mat <- expr_mat[complete.cases(expr_mat), ]
dim(expr_mat)

## [1] 54672   143

##Top 2000 variable genes

gene_var <- apply(expr_mat, 1, var)

top_genes <- names(sort(gene_var, decreasing = TRUE))[1:2000]

expr_top <- expr_mat[top_genes, ]

expr_top_t <- t(expr_top)
dim(expr_top_t)

## [1]  143 2000

Final Project Data Processing

2026-02-26