Preparación de Features para Análisis de Absorbancia. Análisis de diferencias.

Introducción

Cargamos los paquetes necesarios.

Lectura de espectros

Inicialmente definimos algunas funciones de utilidad para el procesado.

Leemos los csv de los espectros

#
formatea=function(vx,nd=2){
  y = list()
  y$name = vx[1]
  y$cuts = 1
  y$nch  = 1
  y$factors = 1.
  vv     = vx[2:(length(vx)-nd)]
  y$stellarp = t(as.data.frame(c(as.numeric(vx[(length(vx)-nd+1):length(vx)]))))
  colnames(y$stellarp) = names(vx[(length(vx)-nd+1):length(vx)])
  dd     = data.frame(x=rev(as.numeric(gsub('X','',names(vv)))),y=rev(as.numeric(vv)))
  y$data = list()
  y$data[[1]]=dd
  return(y)
}
#
if ( file.exists("~/git/BL_2012/Features_BL3_2012.RData")) {
  load("~/git/BL_2012/Features_BL3_2012.RData")
} else {
  # Lectura de los CSV
  acus<-read.csv(file="~/git/BL_2012/csv2/todo_JLV_svirgen.csv", 
               header=TRUE,stringsAsFactors=FALSE);
  # Se recorta a lambda 550 A-1 porque es donde llega el espectro virgen.
  iwh = which(colnames(acus)=="X550")
  acus=acus[,1:iwh]
  # se carga el espectro virgen
  aref<-read.csv(file="~/git/BL_2012/csv2/AcvirgAeroshell_sep12.csv", 
               header=FALSE,skip=2,stringsAsFactors=FALSE);
  ## Loading the original spectrum for a virgin oil
  datos<-read.csv(file="~/git/BL_2012/csv2/MuestrasSelecionadas.csv", 
                header=TRUE,stringsAsFactors=FALSE,sep=";")
  dacus=acus
  dacus[,-1]=t(apply(acus[,-1],1,FUN="-",as.numeric(aref[,2])))
  #
  dmodel=datos[  datos[,2] %in% dacus[,1],]
  nacus = dacus[dacus[,1] %in% dmodel[,2],]
  data=merge(nacus,dmodel[,c(2:4)],by.x="Name",by.y="Espectro")
  '%ni%' = Negate('%in%')
  fuera= datos[datos[,2] %ni% acus[,1],2]
  rm(acus)
  rm(datos)
  lper=c(0.7,0.8,0.9,1.)
  bp=list()
  for (i in 1:nrow(data)) {
    bp[[i]]=formatea(data[i,])
  }
  nsamp = length(bp)
  idx=sample(1:nsamp,round(nsamp*0.80),replace=FALSE)
  nidx=(1:nsamp)[1:nsamp %ni% idx]
  bp_clean=bp[idx]
  bf_clean=bp[nidx]
  #
  bpy=do.call(rbind,lapply(bp_clean,function(x){return(x$data[[1]][,2])}))  
  YV=do.call(rbind,lapply(bp_clean,function(x){return(x$stellarp[1])}))
  YT=do.call(rbind,lapply(bp_clean,function(x){return(x$stellarp[2])}))
  #
  vf=as.character(bp_clean[[1]]$data[[1]][,1])
  colnames(bpy)=vf
  #
  bfy=do.call(rbind,lapply(bf_clean,function(x){return(x$data[[1]][,2])}))
  colnames(bfy)=vf
  #
  ####################################################################
  #
  siz=10
  bandas=list()
  bfndas=list()
  for (i in seq(1,siz,5)) { # LSB asked for steps of 5 pixels
    idx=paste("s=",i,sep="")
    lvf=vf
    if ( i > 1) {
      lvf=vf[-c(1:(i-1))]
    }
    ff=as.factor(sort(rep(1:ceiling(length(lvf)/siz),siz))[1:length(lvf)])
    st=split(lvf,ff)
    for (j in 1:length(st)) {
      if (length(st[[j]]) < siz)
        st[[j]] = NULL
    }
    lst = length(st)
    bandas[[idx]]=list(names= as.vector(unlist(lapply(st,
                        function(x){return(paste(range(x),collapse="-"))}))),
                      mat=matrix(NA,ncol=lst,nrow=nrow(bpy)))
    bfndas[[idx]]=list(names= as.vector(unlist(lapply(st,
                        function(x){return(paste(range(x),collapse="-"))}))),
                      mat=matrix(NA,ncol=lst,nrow=nrow(bfy)))
    for (j in 1:lst) {
      bandas[[idx]]$mat[,j]=apply(bpy[,st[[j]]],1,area,st[[j]])
      bfndas[[idx]]$mat[,j]=apply(bfy[,st[[j]]],1,area,st[[j]])     
    }
  }
  #
  save(bpy,bfy,bandas,bfndas,vf,st,siz,YT,YV,bp_clean, bf_clean,idx,nidx,
       file="~/git/BL_2012/Features_BL3_2012.RData")
}
#

Distance $ ^2 $ between spectra

Let’s find the closest spectra at different SNR ratio per IPAC spectum.

Closest spectrum for the existing selection

minchid=function(x,lib){
  chid=function(x,y){
    if ( nrow(x$data[[1]]) == nrow(y$data[[1]])) {
      return(sum((x$data[[1]][,2]-y$data[[1]][,2])^2)/nrow(x$data[[1]]))
    } else {
      return (5000) # Enormous distance as it is not valid
    }
  }
  ldist=ldply(lib,chid,x)
  idx = which.min(ldist[,1])[1]
  res = data.frame(V=lib[[idx]]$stellarp[1],T=lib[[idx]]$stellarp[2],
                  name=lib[[idx]]$name, dist=ldist[idx,1])
  return(res)
}
#
chi2d=ldply(bf_clean,minchid,bp_clean)
#

Now we will try with ICA approach

# We prepare the coefs
if ( file.exists("~/git/BL_2012/BL-2012_ICA3.RData")) {
  load("~/git/BL_2012/BL-2012_ICA3.RData")
} else {
  dd2=as.data.frame(do.call(rbind,lapply(bp_clean,function(x){return(x$data[[1]][,2])})))
  jd2=JADE(dd2,10)
  ss2=as.data.frame(t(jd2$W %*% t(dd2-jd2$Xmu)))
  ss2[,(ncol(ss2)+1)]=unlist(lapply(bp_clean,function(x){return(x$stellarp[2])}))
  colnames(ss2)[ncol(ss2)]="TAN"
  #
  folds=5
  repeats=1
  myControl = trainControl(method='cv', number=folds, 
                    repeats=repeats, returnResamp='none', 
                    returnData=FALSE, savePredictions=TRUE, 
                    verboseIter=FALSE, allowParallel=TRUE,
                    index=createMultiFolds(ss2[,ncol(ss2)], 
                          k=folds, times=repeats))
  model2=train(ss2[,-ncol(ss2)], ss2[,ncol(ss2)], method='ppr', 
                         trControl=myControl,tuneGrid=expand.grid(.nterms=3:(ncol(ss2)-1)))
  d2=as.data.frame(do.call(rbind,lapply(bf_clean,
                      function(x){return(ldply(x$data,
                                 function(x){return(x)})[,2])})))
  si2=as.data.frame(t(jd2$W %*% t(d2-jd2$Xmu)))
  T_coef=predict(model2,si2)
  save(T_coef,bp_ica=bp,bf_ica=bf_clean,si2,d2,model2,myControl,
       folds,repeats,ss2,jd2,dd2,file="~/git/BL_2012/BL-2012_ICA3.RData")
}
#
ref=data.frame(Name=unlist(lapply(bf_clean,function(x){return(x$name)})),
               T_ICA=T_coef,T_chi2=chi2d,
               T_real=unlist(lapply(bf_clean,function(x){return(x$stellarp[2])})))
#

Otras features de genéticos que evolucionan construyendo las features (Población 8000 individuos, 1000 evoluciones)

#
#
require(GA)

GA_r Loading required package: GA
GA_r Package 'GA' version 2.2
GA_r Type 'citation("GA")' for citing this R package in publications.

require(doParallel)

GA_r Loading required package: doParallel

require(plyr)
#
#
area=function(v,nv) {
  y=data.frame(x=as.numeric(nv),y=v)
  xz = diff(as.numeric(y[,1]),1)
  yz = as.numeric(y[,2])
  z  = sum(xz*rollmean(yz,2))
  return(z)
}
fitness = function(string) {
  par = decode(string)
  if ( 0 %in% unlist(apply(par,1,sd)) ) {
    return (-1000*NBLK*NBITS)
  }
  if ( length(unique(sort(par[,1]))) < nrow(par) |
         length(unique(sort(par[,2]))) < nrow(par) ) {
    return (-1000*NBLK*NBITS)
  }
  X   = as.data.frame(matrix(0,nrow=nrow(bandas[[cnjts[1]]]$mat),ncol=(nrow(par))))
  colnames(X)[1] ="Intercept"
  for ( i in 1:nrow(par)) {    
    bi=par[i,1] %% NBLK
    bj=par[i,2] %% NBLK
    if ( bi==0 ) bi=NBLK
    if ( bj==0 ) bj=NBLK
    idxi=(par[i,1] %/% NBLK) +1
    idxj=(par[i,2] %/% NBLK) +1    
    wini=-eval(parse(text=bandas[[cnjts[bi]]]$names[idxi]))
    winj=-eval(parse(text=bandas[[cnjts[bj]]]$names[idxj]))    
    X[,(i)]=wini-bandas[[cnjts[bi]]]$mat[,idxi] / (bandas[[cnjts[bj]]]$mat[,idxj]/winj)
    colnames(X)[(i)] = paste(par[i,],collapse="-")
  }
  mod = lm.fit(as.matrix(X), y)
  class(mod) = "lm"
  return( -BIC(mod))
}
#
SelFeatures=function(x,sets=cnjts,dat=bandas,fitness.=fitness,pp=FALSE){
  pos=which.max(as.numeric(unlist(ldply(x@bestSol,fitness.,.parallel=pp))))
  sol=x@bestSol[[pos]]
  par = decode(sol)  
  res = list()
  for ( i in 1:nrow(par)) {    
    bi=par[i,1] %% NBLK
    bj=par[i,2] %% NBLK
    if ( bi==0 ) bi=NBLK
    if ( bj==0 ) bj=NBLK
    idxi=(par[i,1] %/% NBLK) +1
    idxj=(par[i,2] %/% NBLK) +1  
    res[[i]]=list(set1=sets[bi],pos1=idxi,set2=sets[bj],pos2=idxj,par=par[i,],
                  x=dat[[sets[bi]]]$mat[,idxi],
                  x0=dat[[sets[bj]]]$mat[,idxj],
                  name=dat[[sets[bi]]]$nam[idxi],
                  name0=dat[[sets[bj]]]$nam[idxj])
  }
  return(res)
}
#
FeaturesExt=function(x,num,sets=cnjts,dat=bandas,fitness.=fitness,pp=FALSE){
  vals=as.numeric(unlist(ldply(x@bestSol,fitness.,.parallel=pp)))
  fiveval=summary(vals)
  uv    = unique(sort(vals,decreasing=TRUE))
  uvals = uv[1:min(num,length(uv))]
  isols=rep(0,length(uvals))
  for (i in 1:length(uvals)) {
    isols[i] = which(vals==uvals[i])[1]
  }
  res=NA
  if ( length(isols) > 0) {
    par=decode(x@bestSol[[1]])
    res = as.data.frame(matrix(NA,nrow=nrow(par),ncol=(2*length(isols))))
    nam = as.data.frame(matrix(NA,nrow=nrow(par),ncol=(2*length(isols))))    
    for (j in 1:length(isols)) {
      sol=x@bestSol[[isols[j]]]
      par = decode(sol)  
      res[,(2*(j-1)+1):(2*j)]=par
      names(res)[(2*(j-1)+1):(2*j)] = c(paste(j,":s",sep=""),paste(j,":c",sep=""))
      for ( i in 1:nrow(par)) {    
        bi=par[i,1] %% NBLK
        bj=par[i,2] %% NBLK
        if ( bi==0 ) bi=NBLK
        if ( bj==0 ) bj=NBLK
        idxi=(par[i,1] %/% NBLK) +1
        idxj=(par[i,2] %/% NBLK) +1  
        nam[i,(2*(j-1)+1)]=dat[[sets[bi]]]$nam[idxi]
        nam[i,(2*j)]=dat[[sets[bj]]]$nam[idxj]
      }
      names(nam)[(2*(j-1)+1):(2*j)] = c(paste(j,":s",sep=""),paste(j,":c",sep=""))        
    }
  }
  return(list(par=res,name=nam,pos=isols,vpos=vals[isols],fivevals=fiveval))
}
#
decode = function(string) {
  string = gray2binary(string)
  n = binary2decimal(string[1:NBITS])
  c = binary2decimal(string[(NBITS + 1):(2*NBITS)])
  res = data.frame(p1=n%%MAXV,p2=c%%MAXV)
  ini = 2 * NBITS
  if ( NG > 1) {
    for (i in 2:NG) {
      n = binary2decimal(string[(ini+1):(ini+NBITS)])
      c = binary2decimal(string[(ini + NBITS + 1):(ini + 2*NBITS)])
      res = rbind(res, c(n%%MAXV,c%%MAXV))
      ini = ini + 2*NBITS
    }
  }
  return(res)
}
#
encode = function(res,nbits=NBITS) {
  if (! is.data.frame(res)) {
    warning(paste("ENCODE: First parameter is not data.frame:",str(res),sep=" "))
    return(NULL)
  }
  cad=rep(0,ncol(res)*nrow(res)*nbits)
  for (i in 1:nrow(res)) {
    for (j in 1:ncol(res)) {
      k=((i-1)*ncol(res) + (j - 1))*nbits+1
      cad[k:(k+nbits-1)]=decimal2binary(res[i,j],nbits)
    }
  }
  return(binary2gray(cad))
}
#
#
NITER=9
if ( file.exists("~/git/BL_2012/GA_NT31F2_IPAC.RData")) {
  j=1
  load("~/git/BL_2012/GA_NT31F2_IPAC.RData")
  tt50=list()
  GAs50 = list()
  solsT50=list()
  tt10=list()
  GAs10 = list()
  solsT10=list()
  tt=list()
  GAs=list()
  solsT=list()
  while(j <= NITER) {
    cat(paste("Reading Iteration:",j,"<br>",sep=""))
    flush.console()
    load(paste("~/git/BL_2012/GA_T_",sprintf("%02d",j),"_f32_IPAC.RData",sep=""))
    tt[[j]]= t_tot
    GAs[[j]]=GA1
    solsT[[j]]=FeaturesExt(GA1,12)
    j = j+1
  }
} else {
  cnjts = c("s=1","s=6")
  bpy=do.call(rbind,lapply(bp_clean,function(x){return(x$data[[1]][,2])}))
  YV=do.call(rbind,lapply(bp_clean,function(x){return(x$stellarp[1])}))
  YT=do.call(rbind,lapply(bp_clean,function(x){return(x$stellarp[2])}))
  vf=as.character(bp_clean[[1]]$data[[1]][,1])
  colnames(bpy)=vf
  bfy=do.call(rbind,lapply(bf_clean,function(x){return(x$data[[1]][,2])}))
  colnames(bfy)=vf
  #
  siz=10
  #
  y=as.numeric(unlist(YT))
  cnjts = c("s=1","s=6")
  NG=10   # Número de Features
  NBLK = length(cnjts)
  NBITS= ceiling(log(NBLK*ncol(bandas[[cnjts[1]]]$mat),2))
  MAXV = ncol(bandas[[cnjts[1]]]$mat)
  #
  save(bpy,bfy,YT,YV,vf,siz,bandas,bfndas, 
       cnjts,st,NG,NBLK,NBITS,MAXV,y,fitness,encode,decode,
       SelFeatures,file="~/git/BL_2012/GA_NT31F2_IPAC.RData")
  #
  tt=list()
  GAs = list()
  solsT=list()
  j=1
  while(j <= NITER) {
    cat(paste("Starting Iteration:",j,"<br>",sep=""))
    flush.console()
    tt[[j]]= system.time({GA1 = ga("binary", fitness = fitness, nBits = NBITS*2*NG,
        monitor = FALSE,parallel=18,maxiter=1000,popSize=8000,keepBest=TRUE)})
    t_tot=tt[[j]]
    GAs[[j]]=GA1
    save(fitness,encode,decode,SelFeatures,GA1,t_tot,j,
       file=paste("~/git/BL_2012/GA_T_",sprintf("%02d",j),"_f32_IPAC.RData",sep=""))
    solsT[[j]]=FeaturesExt(GA1,12)
    j=j+1
  }
  #
  print(xtable(ldply(solsT,function(x){return(x$vpos[1])})),type="html")
  #
}

Reading Iteration:1
Reading Iteration:2
Reading Iteration:3
Reading Iteration:4
Reading Iteration:5
Reading Iteration:6
Reading Iteration:7
Reading Iteration:8
Reading Iteration:9

#
print(xtable(ldply(solsT,function(x){return(x$vpos[1])})),type="html")

	V1
1	-70.48
2	-93.11
3	-86.63
4	-89.95
5	-72.02
6	-97.74
7	-87.07
8	-82.50
9	-83.24

# ISOL holds the best solution !
isol = which.max(as.numeric(ldply(solsT,function(x){return(x$vpos[1])})[,1])) 
isol

[1] 1

Modelado de TAN con esas Features

#
#
prep_datos=function(res){
  X=matrix(NA,nrow=length(res[[1]]$x),ncol=length(res))
  for (j in 1:length(res)) {
    X[,j]=res[[j]]$x/res[[j]]$x0
  }
  colnames(X)=paste("C",1:length(res),sep="")   
  return(X)
}
#
modelado=function(X,Y) {
  #
  trn=1:nrow(X)
  folds=5
  repeats=1
  #
  # mseeds <- vector(mode = "list", length = (folds+1))
  #for(i in 1:folds) mseeds[[i]] <- sample.int(nrow(X), 10)
  #    mseeds[[(folds+1)]] <- sample.int(20, 1)
  myControl = trainControl(method='cv', number=folds, 
                  repeats=repeats, returnResamp='none', 
                  returnData=FALSE, savePredictions=TRUE, 
                  verboseIter=FALSE, allowParallel=TRUE) # ,seeds=mseeds)
  #Train some models
  model_1.1 = "train(X[trn,], Y[trn], method='gbm', 
                     trControl=myControl,
                tuneGrid=expand.grid(n.trees=seq(200,500,100), n.minobsinnode=c(2,5),
                                     interaction.depth=seq(3,15,2), 
                                     shrinkage = seq(0.01,0.1,0.02)),verbose=FALSE)"
  model_1.2 = "train(X[trn,], Y[trn], method='blackboost', 
                  trControl=myControl, tuneGrid=expand.grid(.mstop=10^(2:4), 
                                     .maxdepth=seq(5,15,3)))"
  model_1.3 = "train(X[trn,], Y[trn], method='rf', 
                     trControl=myControl,
                     tuneGrid=expand.grid(.mtry=(2:ncol(X))),
                     proximity=TRUE)"
  model_1.4 = "train(X[trn,], Y[trn], method='knn', 
                     trControl=myControl, trace=FALSE,
                     tuneGrid=expand.grid(k=(2:min(9,ncol(X)))))"
  model_1.5 = "train(X[trn,], Y[trn], method='ppr', 
                     trControl=myControl,tuneGrid=expand.grid(.nterms=10))"
  model_1.6 = "train(X[trn,], Y[trn], method='earth', 
                     trControl=myControl,tuneGrid=expand.grid(.degree = 3,
                          .nprune = (1:10) * 2), metric = 'RMSE',
                          maximize = FALSE)"
  model_1.7 = "train(X[trn,], Y[trn], method='glm', 
                     trControl=myControl)"
  model_1.8 = "train(X[trn,], Y[trn], method='svmRadial', 
                     trControl=myControl,tuneGrid=expand.grid(.C=10^(-3:3), 
                      .sigma=c(10^(-4:3))))"
  model_1.9 = "train(X[trn,], Y[trn], method='cubist', 
                     trControl=myControl,commiittees=10)"
  model_1.10 = "train(X[trn,], Y[trn], method='kernelpls', 
                     trControl=myControl,tuneGrid=expand.grid(.ncomp=seq(2,ncol(X))))"
  model_1.11 = "train(X[trn,], Y[trn], method='nnet', 
                  trControl=myControl,tuneGrid=expand.grid(.size=seq(2,round(2*ncol(X)),3), 
                      .decay=c(1.e-5,1.e-2)),
                      linout=TRUE, rang = 300, trace=FALSE,
                      maxit=10000, reltol=1.0e-11, abstol=1.0e-6)"
  model_1.12 = "train(X[trn,], Y[trn], method='bagEarth', 
                  trControl=myControl,tuneGrid=expand.grid(.nprune=(1:10)*3,.degree=1:5))"
  model_1.13 = "train(X[trn,], Y[trn], method='cubist', trControl=myControl,
                  tuneGrid=expand.grid(.committees=3:50, .neighbors=1:7))"
  # 
  #Make a list of all the models
  lmodels=c("model_1.1", "model_1.2", "model_1.3", "model_1.4", 
            "model_1.5", "model_1.6", "model_1.7", "model_1.8", 
            "model_1.9", "model_1.10","model_1.11","model_1.12", 
            "model_1.13") 
#  
  all.models_5 =list()
  for (i in lmodels) {
    all.models_5[[length(all.models_5)+1]] = eval(parse(text=gsub('\n','',get(i))))
  }
  names(all.models_5) = sapply(all.models_5, function(x) x$method)
  # 
  ensam_5 <- caretList(X[trn,],Y[trn],methodList=c('svmRadial',
                        'gbm','blackboost','rf','earth','bagEarth'))
  ens <- caretEnsemble(ensam_5)
  #Make a linear regression ensemble
  if (exists("ens")){
    all.models_5[[length(all.models_5)+1]] = get("ens")
    names(all.models_5)[length(all.models_5)]="ENS"
  } 
  return(all.models_5)
}
#
# Function that returns Root Mean Squared Error
rmse <- function(error) {
    sqrt(mean(error^2,na.rm=TRUE))
}
# Function that returns Mean Absolute Error
mae <- function(error) {
    mean(abs(error),na.rm=TRUE)
}
#
#
if ( file.exists("~/git/BL_2012/GA_BL_Models3.RData")) {
  load("~/git/BL_2012/GA_BL_Models3.RData")
} else {
  res00=SelFeatures(GAs[[isol]],cnjts,bandas)   
  #
  X00 = prep_datos(res00)
  md5 = modelado(X00,as.numeric(YT[,1]))
  #
  xf0 = prep_datos(SelFeatures(GAs[[isol]],cnjts,bfndas))
  #
  YAn00=predict(md5[[which(names(md5)=="rf")]],xf0)
  #
  ref_tot0=data.frame(Name=as.character(ref$Name),
                chi2d=ref$T_chi2.T, T_ICA=ref$T_ICA,
                YAn00,T_real=ref$T_real)
  #
  save(bpy,bfy,YT,YV,vf,siz,bandas,bfndas,cnjts, NG,NBLK,NBITS,MAXV,md5,res00, 
       X00,YAn00,xf0,ref_tot0,bp_clean, bf_clean,
       file="~/git/BL_2012/GA_BL_Models3.RData")
}

GA_r3 Loading required package: gbm
GA_r3 Loading required package: survival
GA_r3 
GA_r3 Attaching package: 'survival'
GA_r3 
GA_r3 The following object is masked from 'package:caret':
GA_r3 
GA_r3     cluster
GA_r3 
GA_r3 Loading required package: splines
GA_r3 Loaded gbm 2.1.1
GA_r3 Loading required package: randomForest
GA_r3 randomForest 4.6-10
GA_r3 Type rfNews() to see new features/changes/bug fixes.
GA_r3 Loading required package: earth
GA_r3 Loading required package: plotmo
GA_r3 Loading required package: plotrix
GA_r3 Loading required package: TeachingDemos

Iter TrainDeviance ValidDeviance StepSize Improve 1 0.5227 -nan 0.1000 0.0203 2 0.4988 -nan 0.1000 0.0196 3 0.4844 -nan 0.1000 0.0137 4 0.4674 -nan 0.1000 0.0152 5 0.4467 -nan 0.1000 0.0149 6 0.4304 -nan 0.1000 0.0124 7 0.4173 -nan 0.1000 0.0091 8 0.4090 -nan 0.1000 -0.0094 9 0.4007 -nan 0.1000 0.0019 10 0.3880 -nan 0.1000 0.0023 20 0.3388 -nan 0.1000 0.0003 40 0.2718 -nan 0.1000 -0.0039 50 0.2542 -nan 0.1000 -0.0038

#
YTpknn00=predict(md5[[which(names(md5)=="knn")]],xf0)
save(YTpknn00,xf0,file="~/git/BL_2012/GA_predict_T_31F2.RData")
ref_tot0=data.frame(Name=as.character(ref$Name),chi2d=ref$T_chi2.T, T_ICA=ref$T_ICA,
                YAn00,T_real=ref$T_real)
res00=SelFeatures(GAs[[isol]],cnjts,bandas)
ref_tot2=ref_tot0
#
print(xtable(ldply(res00,function(x){return(c(x$name,x$name0))})),type="html")

	V1	V2
1	1110-1119	2180-2189
2	2225-2234	710-719
3	2165-2174	1380-1389
4	1820-1829	2020-2029
5	815-824	1330-1339
6	1270-1279	715-724
7	1565-1574	745-754
8	1880-1889	1365-1374
9	1015-1024	1470-1479
10	1135-1144	1465-1474

#
ggplot(data=ref_tot2) + 
         geom_point(aes(x=T_real,y=chi2d),size=3) +
         xlab("Theoretical TAN [mg KOH]") + ylab("Chi2 [mg KOH]") +
         theme_bw() + #  scale_colour_brewer(palette="Set1") +
         geom_abline(position="identity", colour="gray") # +

         # xlim(2000,4200) + ylim(2000,4200)   +
         # guides(col=guide_legend(ncol=2))
#
ggplot(data=ref_tot2) + 
         geom_point(aes(x=T_real,y=T_ICA),size=3) +
         xlab("Theoretical TAN [mg KOH]") + ylab("ICA [mg KOH]") +
         theme_bw() + #  scale_colour_brewer(palette="Set1") +
         geom_abline(position="identity", colour="gray")

#
ggplot(data=ref_tot2) + 
         geom_point(aes(x=T_real,y=YAn00),size=3) +
         xlab("Theoretical T [mg KOH]") + 
         ylab("ML predicted RF [mg KOH]") +
         theme_bw() + #  scale_colour_brewer(palette="Set1") +
         geom_abline(position="identity", colour="gray")

#
lmod=c("rf","gbm","svmRadial","nnet","knn","bagEarth","kernelpls","cubist")
nmod=c("RF","GB","SVR","NNR","KNN","MARS","PLS","Rule-Regression")
for(ll in c(1:2)) {
  if (ll == 2) {
    pdf(file="~/git/BL_2012/GA_model_T3.pdf",width=12,heigh=10)
  }
  for (i in 1:length(lmod)) {
    YTm00=predict(md5[[which(names(md5)==lmod[i])[1]]],xf0)
    #
    ref_tot0m=data.frame(Name=as.character(ref$Name),
                chi2d=ref$T_chi2.T, T_ICA=ref$T_ICA,T_real=ref$T_real,YTm00)
    ref_tot2m=ref_tot0m 
    diffs=apply(ref_tot2m[,! colnames(ref_tot2m) %in% c("Name","SpT","T_real")],
              2,FUN="-",ref_tot2m[,"T_real"])
    rownames(diffs)=ref_tot2m[,"Name"]
    #
    cat(paste("T Modelling with ",lmod[i],". Error analysis follows.",sep=""))
    errors=data.frame(drmse = apply(diffs,2,rmse),dmae=apply(diffs,2,mae))
    print(xtable(errors),type="html")
  #
    print(ggplot(data=ref_tot2m) + 
           geom_point(aes(x=T_real,y=YTm00),size=3) +
           xlab("Theoretical TAN [mg KOH]") + 
           ylab(paste(nmod[i]," predicted [mg KOH]","")) +
           theme_bw() + geom_abline(position="identity", colour="red") )
  }
}

T Modelling with rf. Error analysis follows.

	drmse	dmae
chi2d	1.20	0.95
T_ICA	1.11	0.81
YTm00	1.12	0.75

T Modelling with gbm. Error analysis follows.

	drmse	dmae
chi2d	1.20	0.95
T_ICA	1.11	0.81
YTm00	1.14	0.70

T Modelling with svmRadial. Error analysis follows.

	drmse	dmae
chi2d	1.20	0.95
T_ICA	1.11	0.81
YTm00	1.10	0.65

T Modelling with nnet. Error analysis follows.

	drmse	dmae
chi2d	1.20	0.95
T_ICA	1.11	0.81
YTm00	1.07	0.67

T Modelling with knn. Error analysis follows.

	drmse	dmae
chi2d	1.20	0.95
T_ICA	1.11	0.81
YTm00	1.14	0.76

T Modelling with bagEarth. Error analysis follows.

	drmse	dmae
chi2d	1.20	0.95
T_ICA	1.11	0.81
YTm00	1.08	0.65

T Modelling with kernelpls. Error analysis follows.

	drmse	dmae
chi2d	1.20	0.95
T_ICA	1.11	0.81
YTm00	1.20	0.73

T Modelling with cubist. Error analysis follows.

	drmse	dmae
chi2d	1.20	0.95
T_ICA	1.11	0.81
YTm00	1.12	0.66