##Load in file
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
## table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
file_path <- "~/Downloads/GSE22148_series_matrix.txt"
gse <- getGEO(filename = file_path)
expr_mat <- exprs(gse)
pheno <- pData(gse)
##Sanity
class(gse)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
dim(expr_mat)
## [1] 54675 143
dim(pheno)
## [1] 143 44
colnames(pheno)
## [1] "title" "geo_accession"
## [3] "status" "submission_date"
## [5] "last_update_date" "type"
## [7] "channel_count" "source_name_ch1"
## [9] "organism_ch1" "characteristics_ch1"
## [11] "characteristics_ch1.1" "characteristics_ch1.2"
## [13] "characteristics_ch1.3" "characteristics_ch1.4"
## [15] "characteristics_ch1.5" "characteristics_ch1.6"
## [17] "molecule_ch1" "extract_protocol_ch1"
## [19] "label_ch1" "label_protocol_ch1"
## [21] "taxid_ch1" "hyb_protocol"
## [23] "scan_protocol" "description"
## [25] "data_processing" "platform_id"
## [27] "contact_name" "contact_email"
## [29] "contact_phone" "contact_department"
## [31] "contact_institute" "contact_address"
## [33] "contact_city" "contact_zip/postal_code"
## [35] "contact_country" "supplementary_file"
## [37] "data_row_count" "age:ch1"
## [39] "batch:ch1" "gender:ch1"
## [41] "gold stage:ch1" "group:ch1"
## [43] "hyb date:ch1" "plateid:ch1"
head(pheno[,1:10])
## title geo_accession
## GSM550785 Sputum from Stage II Moderate, biological replicate 1 GSM550785
## GSM550786 Sputum from Stage II Moderate, biological replicate 2 GSM550786
## GSM550787 Sputum from Stage III Severe, biological replicate 1 GSM550787
## GSM550788 Sputum from Stage II Moderate, biological replicate 3 GSM550788
## GSM550789 Sputum from Stage II Moderate, biological replicate 4 GSM550789
## GSM550790 Sputum from Stage II Moderate, biological replicate 5 GSM550790
## status submission_date last_update_date type
## GSM550785 Public on Jun 08 2010 Jun 04 2010 Jun 07 2010 RNA
## GSM550786 Public on Jun 08 2010 Jun 04 2010 Jun 07 2010 RNA
## GSM550787 Public on Jun 08 2010 Jun 04 2010 Jun 07 2010 RNA
## GSM550788 Public on Jun 08 2010 Jun 04 2010 Jun 07 2010 RNA
## GSM550789 Public on Jun 08 2010 Jun 04 2010 Jun 07 2010 RNA
## GSM550790 Public on Jun 08 2010 Jun 04 2010 Jun 07 2010 RNA
## channel_count source_name_ch1 organism_ch1
## GSM550785 1 Sputum from COPD Stage II Moderate Homo sapiens
## GSM550786 1 Sputum from COPD Stage II Moderate Homo sapiens
## GSM550787 1 Sputum from COPD Stage III Severe Homo sapiens
## GSM550788 1 Sputum from COPD Stage II Moderate Homo sapiens
## GSM550789 1 Sputum from COPD Stage II Moderate Homo sapiens
## GSM550790 1 Sputum from COPD Stage II Moderate Homo sapiens
## characteristics_ch1
## GSM550785 age: 69
## GSM550786 age: 57
## GSM550787 age: 68
## GSM550788 age: 71
## GSM550789 age: 57
## GSM550790 age: 65
summary(as.numeric(expr_mat))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -662.65 18.71 49.70 361.28 238.87 22603.40 429
##NA Removal
expr_mat <- expr_mat[complete.cases(expr_mat), ]
dim(expr_mat)
## [1] 54672 143
##Top 2000 variable genes
gene_var <- apply(expr_mat, 1, var)
top_genes <- names(sort(gene_var, decreasing = TRUE))[1:2000]
expr_top <- expr_mat[top_genes, ]
expr_top_t <- t(expr_top)
dim(expr_top_t)
## [1] 143 2000