library(Biobase)
library(GEOquery)


# Attempt 1: collect the age at diagnosis of every "discovery" sample in the
# accession range GSM971957..GSM972456.
#
# Ages are stored under the sample's GSM accession. Indexing the list by the
# raw characteristics string (as was done before) appended a new named element
# after the N preallocated NULL slots and let samples sharing the same string
# overwrite each other, so the length was N plus the number of distinct
# strings (reported as 556) instead of 443.
N <- 443 # expected number of discovery samples
GSMNames <- paste0("GSM", 971957:972456)
AgeVec <- vector("list", length(GSMNames))
names(AgeVec) <- GSMNames

for (GSMName in GSMNames) {
  GSM <- getGEO(GSMName, destdir = ".")
  Characteristics <- Meta(GSM)$characteristics_ch1

  # Skip anything not labelled as discovery; isTRUE() also skips a missing
  # or NA label instead of aborting the loop.
  if (!isTRUE(Characteristics[1] == "dataset: discovery")) {
    next
  }

  # Strip everything up to and including "age.at.diagnosis: ", leaving the
  # age as text. A string without that prefix is kept unchanged.
  Ps <- gsub("^.*?age.at.diagnosis: ", "", Characteristics[3])
  AgeVec[[GSMName]] <- Ps
}

# Drop the slots of samples that were skipped, so only collected ages remain
AgeVec <- AgeVec[!vapply(AgeVec, is.null, logical(1))]

length(AgeVec) # should equal N (443)

# Attempt 2: same extraction, but matching the dataset label with grepl() and
# limited to the first three accessions (GSM971957..GSM971959).
#
# The earlier version appeared to return only NULL because the list was
# preallocated with N NULL slots and results were then assigned by the raw
# characteristics string, which appends named elements after those slots;
# AgeVec[1:5] therefore showed untouched placeholders. Results are now stored
# under the sample's GSM accession and unused slots are dropped.
N <- 443 # expected number of discovery samples in the full accession range
GSMNames <- paste0("GSM", 971957:971959)
AgeVec <- vector("list", length(GSMNames))
names(AgeVec) <- GSMNames

for (GSMName in GSMNames) {
  GSM <- getGEO(GSMName, destdir = ".")
  Characteristics <- Meta(GSM)$characteristics_ch1

  # Keep only samples whose first characteristic mentions "discovery";
  # isTRUE() also skips a missing label instead of aborting the loop.
  if (!isTRUE(grepl("discovery", Characteristics[1]))) {
    next
  }

  # Strip everything up to and including "age.at.diagnosis: ", leaving the
  # age as text. A string without that prefix is kept unchanged.
  Ps <- gsub("^.*?age.at.diagnosis: ", "", Characteristics[3])
  AgeVec[[GSMName]] <- Ps
}

# Drop the slots of samples that were skipped, so only collected ages remain
AgeVec <- AgeVec[!vapply(AgeVec, is.null, logical(1))]

# Preview the first few collected entries (head() does not pad a short list)
head(AgeVec, 5)

# Flatten the list to a character vector. unlist() discards NULL slots, and
# as.character() keeps the result a zero-length vector when nothing was
# collected. noquote() only changes how the values print.
AgeVecnoquote <- noquote(as.character(unlist(AgeVec, use.names = FALSE)))

# Consider as numeric; entries that are not numbers become NA (with a warning)
AgeVecNumeric <- as.numeric(AgeVecnoquote)

# Ignore NAs
AgeVecNumeric <- AgeVecNumeric[!is.na(AgeVecNumeric)]

# Eliminate the one large element in vector, 972399. It is removed by value
# rather than by position, so the result does not depend on where it sits or
# on the vector having exactly 112 elements.
AgeVecClean <- AgeVecNumeric[AgeVecNumeric != 972399]

mean(AgeVecClean)
sd(AgeVecClean)
range(AgeVecClean)