PreparaciĂ³n de Features para AnĂ¡lisis de Absorbancia. AnĂ¡lisis de diferencias.

IntroducciĂ³n

Cargamos los paquetes necesarios.

Lectura de espectros

Inicialmente definimos algunas funciones de utilidad para el procesado.

Leemos los csv de los espectros

#
formatea=function(vx,nd=2){
  y = list()
  y$name = vx[1]
  y$cuts = 1
  y$nch  = 1
  y$factors = 1.
  vv     = vx[2:(length(vx)-nd)]
  y$stellarp = t(as.data.frame(c(as.numeric(vx[(length(vx)-nd+1):length(vx)]))))
  colnames(y$stellarp) = names(vx[(length(vx)-nd+1):length(vx)])
  dd     = data.frame(x=rev(as.numeric(gsub('X','',names(vv)))),y=rev(as.numeric(vv)))
  y$data = list()
  y$data[[1]]=dd
  return(y)
}
#
if ( file.exists("~/git/BL_2012/Features_BL3_2012.RData")) {
  load("~/git/BL_2012/Features_BL3_2012.RData")
} else {
  # Lectura de los CSV
  acus<-read.csv(file="~/git/BL_2012/csv2/todo_JLV_svirgen.csv", 
               header=TRUE,stringsAsFactors=FALSE);
  # Se recorta a lambda 550 A-1 porque es donde llega el espectro virgen.
  iwh = which(colnames(acus)=="X550")
  acus=acus[,1:iwh]
  # se carga el espectro virgen
  aref<-read.csv(file="~/git/BL_2012/csv2/AcvirgAeroshell_sep12.csv", 
               header=FALSE,skip=2,stringsAsFactors=FALSE);
  ## Loading the original spectrum for a virgin oil
  datos<-read.csv(file="~/git/BL_2012/csv2/MuestrasSelecionadas.csv", 
                header=TRUE,stringsAsFactors=FALSE,sep=";")
  dacus=acus
  dacus[,-1]=t(apply(acus[,-1],1,FUN="-",as.numeric(aref[,2])))
  #
  dmodel=datos[  datos[,2] %in% dacus[,1],]
  nacus = dacus[dacus[,1] %in% dmodel[,2],]
  data=merge(nacus,dmodel[,c(2:4)],by.x="Name",by.y="Espectro")
  '%ni%' = Negate('%in%')
  fuera= datos[datos[,2] %ni% acus[,1],2]
  rm(acus)
  rm(datos)
  lper=c(0.7,0.8,0.9,1.)
  bp=list()
  for (i in 1:nrow(data)) {
    bp[[i]]=formatea(data[i,])
  }
  nsamp = length(bp)
  idx=sample(1:nsamp,round(nsamp*0.80),replace=FALSE)
  nidx=(1:nsamp)[1:nsamp %ni% idx]
  bp_clean=bp[idx]
  bf_clean=bp[nidx]
  #
  bpy=do.call(rbind,lapply(bp_clean,function(x){return(x$data[[1]][,2])}))  
  YV=do.call(rbind,lapply(bp_clean,function(x){return(x$stellarp[1])}))
  YT=do.call(rbind,lapply(bp_clean,function(x){return(x$stellarp[2])}))
  #
  vf=as.character(bp_clean[[1]]$data[[1]][,1])
  colnames(bpy)=vf
  #
  bfy=do.call(rbind,lapply(bf_clean,function(x){return(x$data[[1]][,2])}))
  colnames(bfy)=vf
  #
  ####################################################################
  #
  siz=10
  bandas=list()
  bfndas=list()
  for (i in seq(1,siz,5)) { # LSB asked for steps of 5 pixels
    idx=paste("s=",i,sep="")
    lvf=vf
    if ( i > 1) {
      lvf=vf[-c(1:(i-1))]
    }
    ff=as.factor(sort(rep(1:ceiling(length(lvf)/siz),siz))[1:length(lvf)])
    st=split(lvf,ff)
    for (j in 1:length(st)) {
      if (length(st[[j]]) < siz)
        st[[j]] = NULL
    }
    lst = length(st)
    bandas[[idx]]=list(names= as.vector(unlist(lapply(st,
                        function(x){return(paste(range(x),collapse="-"))}))),
                      mat=matrix(NA,ncol=lst,nrow=nrow(bpy)))
    bfndas[[idx]]=list(names= as.vector(unlist(lapply(st,
                        function(x){return(paste(range(x),collapse="-"))}))),
                      mat=matrix(NA,ncol=lst,nrow=nrow(bfy)))
    for (j in 1:lst) {
      bandas[[idx]]$mat[,j]=apply(bpy[,st[[j]]],1,area,st[[j]])
      bfndas[[idx]]$mat[,j]=apply(bfy[,st[[j]]],1,area,st[[j]])     
    }
  }
  #
  save(bpy,bfy,bandas,bfndas,vf,st,siz,YT,YV,bp_clean, bf_clean,idx,nidx,
       file="~/git/BL_2012/Features_BL3_2012.RData")
}
#

Distance $ ^2 $ between spectra

Let’s find the closest spectra at different SNR ratio per IPAC spectum.

Closest spectrum for the existing selection

minchid=function(x,lib){
  chid=function(x,y){
    if ( nrow(x$data[[1]]) == nrow(y$data[[1]])) {
      return(sum((x$data[[1]][,2]-y$data[[1]][,2])^2)/nrow(x$data[[1]]))
    } else {
      return (5000) # Enormous distance as it is not valid
    }
  }
  ldist=ldply(lib,chid,x)
  idx = which.min(ldist[,1])[1]
  res = data.frame(V=lib[[idx]]$stellarp[1],T=lib[[idx]]$stellarp[2],
                  name=lib[[idx]]$name, dist=ldist[idx,1])
  return(res)
}
#
chi2d=ldply(bf_clean,minchid,bp_clean)
#

Now we will try with ICA approach

# We prepare the coefs
if ( file.exists("~/git/BL_2012/BL-2012_ICA3.RData")) {
  load("~/git/BL_2012/BL-2012_ICA3.RData")
} else {
  dd2=as.data.frame(do.call(rbind,lapply(bp_clean,function(x){return(x$data[[1]][,2])})))
  jd2=JADE(dd2,10)
  ss2=as.data.frame(t(jd2$W %*% t(dd2-jd2$Xmu)))
  ss2[,(ncol(ss2)+1)]=unlist(lapply(bp_clean,function(x){return(x$stellarp[2])}))
  colnames(ss2)[ncol(ss2)]="TAN"
  #
  folds=5
  repeats=1
  myControl = trainControl(method='cv', number=folds, 
                    repeats=repeats, returnResamp='none', 
                    returnData=FALSE, savePredictions=TRUE, 
                    verboseIter=FALSE, allowParallel=TRUE,
                    index=createMultiFolds(ss2[,ncol(ss2)], 
                          k=folds, times=repeats))
  model2=train(ss2[,-ncol(ss2)], ss2[,ncol(ss2)], method='ppr', 
                         trControl=myControl,tuneGrid=expand.grid(.nterms=3:(ncol(ss2)-1)))
  d2=as.data.frame(do.call(rbind,lapply(bf_clean,
                      function(x){return(ldply(x$data,
                                 function(x){return(x)})[,2])})))
  si2=as.data.frame(t(jd2$W %*% t(d2-jd2$Xmu)))
  T_coef=predict(model2,si2)
  save(T_coef,bp_ica=bp,bf_ica=bf_clean,si2,d2,model2,myControl,
       folds,repeats,ss2,jd2,dd2,file="~/git/BL_2012/BL-2012_ICA3.RData")
}
#
ref=data.frame(Name=unlist(lapply(bf_clean,function(x){return(x$name)})),
               T_ICA=T_coef,T_chi2=chi2d,
               T_real=unlist(lapply(bf_clean,function(x){return(x$stellarp[2])})))
#

Otras features de genĂ©ticos que evolucionan construyendo las features (PoblaciĂ³n 8000 individuos, 1000 evoluciones)

#
#
require(GA)
GA_r Loading required package: GA
GA_r Package 'GA' version 2.2
GA_r Type 'citation("GA")' for citing this R package in publications.
require(doParallel)
GA_r Loading required package: doParallel
require(plyr)
#
#
area=function(v,nv) {
  y=data.frame(x=as.numeric(nv),y=v)
  xz = diff(as.numeric(y[,1]),1)
  yz = as.numeric(y[,2])
  z  = sum(xz*rollmean(yz,2))
  return(z)
}
fitness = function(string) {
  par = decode(string)
  if ( 0 %in% unlist(apply(par,1,sd)) ) {
    return (-1000*NBLK*NBITS)
  }
  if ( length(unique(sort(par[,1]))) < nrow(par) |
         length(unique(sort(par[,2]))) < nrow(par) ) {
    return (-1000*NBLK*NBITS)
  }
  X   = as.data.frame(matrix(0,nrow=nrow(bandas[[cnjts[1]]]$mat),ncol=(nrow(par))))
  colnames(X)[1] ="Intercept"
  for ( i in 1:nrow(par)) {    
    bi=par[i,1] %% NBLK
    bj=par[i,2] %% NBLK
    if ( bi==0 ) bi=NBLK
    if ( bj==0 ) bj=NBLK
    idxi=(par[i,1] %/% NBLK) +1
    idxj=(par[i,2] %/% NBLK) +1    
    wini=-eval(parse(text=bandas[[cnjts[bi]]]$names[idxi]))
    winj=-eval(parse(text=bandas[[cnjts[bj]]]$names[idxj]))    
    X[,(i)]=wini-bandas[[cnjts[bi]]]$mat[,idxi] / (bandas[[cnjts[bj]]]$mat[,idxj]/winj)
    colnames(X)[(i)] = paste(par[i,],collapse="-")
  }
  mod = lm.fit(as.matrix(X), y)
  class(mod) = "lm"
  return( -BIC(mod))
}
#
SelFeatures=function(x,sets=cnjts,dat=bandas,fitness.=fitness,pp=FALSE){
  pos=which.max(as.numeric(unlist(ldply(x@bestSol,fitness.,.parallel=pp))))
  sol=x@bestSol[[pos]]
  par = decode(sol)  
  res = list()
  for ( i in 1:nrow(par)) {    
    bi=par[i,1] %% NBLK
    bj=par[i,2] %% NBLK
    if ( bi==0 ) bi=NBLK
    if ( bj==0 ) bj=NBLK
    idxi=(par[i,1] %/% NBLK) +1
    idxj=(par[i,2] %/% NBLK) +1  
    res[[i]]=list(set1=sets[bi],pos1=idxi,set2=sets[bj],pos2=idxj,par=par[i,],
                  x=dat[[sets[bi]]]$mat[,idxi],
                  x0=dat[[sets[bj]]]$mat[,idxj],
                  name=dat[[sets[bi]]]$nam[idxi],
                  name0=dat[[sets[bj]]]$nam[idxj])
  }
  return(res)
}
#
FeaturesExt=function(x,num,sets=cnjts,dat=bandas,fitness.=fitness,pp=FALSE){
  vals=as.numeric(unlist(ldply(x@bestSol,fitness.,.parallel=pp)))
  fiveval=summary(vals)
  uv    = unique(sort(vals,decreasing=TRUE))
  uvals = uv[1:min(num,length(uv))]
  isols=rep(0,length(uvals))
  for (i in 1:length(uvals)) {
    isols[i] = which(vals==uvals[i])[1]
  }
  res=NA
  if ( length(isols) > 0) {
    par=decode(x@bestSol[[1]])
    res = as.data.frame(matrix(NA,nrow=nrow(par),ncol=(2*length(isols))))
    nam = as.data.frame(matrix(NA,nrow=nrow(par),ncol=(2*length(isols))))    
    for (j in 1:length(isols)) {
      sol=x@bestSol[[isols[j]]]
      par = decode(sol)  
      res[,(2*(j-1)+1):(2*j)]=par
      names(res)[(2*(j-1)+1):(2*j)] = c(paste(j,":s",sep=""),paste(j,":c",sep=""))
      for ( i in 1:nrow(par)) {    
        bi=par[i,1] %% NBLK
        bj=par[i,2] %% NBLK
        if ( bi==0 ) bi=NBLK
        if ( bj==0 ) bj=NBLK
        idxi=(par[i,1] %/% NBLK) +1
        idxj=(par[i,2] %/% NBLK) +1  
        nam[i,(2*(j-1)+1)]=dat[[sets[bi]]]$nam[idxi]
        nam[i,(2*j)]=dat[[sets[bj]]]$nam[idxj]
      }
      names(nam)[(2*(j-1)+1):(2*j)] = c(paste(j,":s",sep=""),paste(j,":c",sep=""))        
    }
  }
  return(list(par=res,name=nam,pos=isols,vpos=vals[isols],fivevals=fiveval))
}
#
decode = function(string) {
  string = gray2binary(string)
  n = binary2decimal(string[1:NBITS])
  c = binary2decimal(string[(NBITS + 1):(2*NBITS)])
  res = data.frame(p1=n%%MAXV,p2=c%%MAXV)
  ini = 2 * NBITS
  if ( NG > 1) {
    for (i in 2:NG) {
      n = binary2decimal(string[(ini+1):(ini+NBITS)])
      c = binary2decimal(string[(ini + NBITS + 1):(ini + 2*NBITS)])
      res = rbind(res, c(n%%MAXV,c%%MAXV))
      ini = ini + 2*NBITS
    }
  }
  return(res)
}
#
encode = function(res,nbits=NBITS) {
  if (! is.data.frame(res)) {
    warning(paste("ENCODE: First parameter is not data.frame:",str(res),sep=" "))
    return(NULL)
  }
  cad=rep(0,ncol(res)*nrow(res)*nbits)
  for (i in 1:nrow(res)) {
    for (j in 1:ncol(res)) {
      k=((i-1)*ncol(res) + (j - 1))*nbits+1
      cad[k:(k+nbits-1)]=decimal2binary(res[i,j],nbits)
    }
  }
  return(binary2gray(cad))
}
#
#
NITER=9
if ( file.exists("~/git/BL_2012/GA_NT31F2_IPAC.RData")) {
  j=1
  load("~/git/BL_2012/GA_NT31F2_IPAC.RData")
  tt50=list()
  GAs50 = list()
  solsT50=list()
  tt10=list()
  GAs10 = list()
  solsT10=list()
  tt=list()
  GAs=list()
  solsT=list()
  while(j <= NITER) {
    cat(paste("Reading Iteration:",j,"<br>",sep=""))
    flush.console()
    load(paste("~/git/BL_2012/GA_T_",sprintf("%02d",j),"_f32_IPAC.RData",sep=""))
    tt[[j]]= t_tot
    GAs[[j]]=GA1
    solsT[[j]]=FeaturesExt(GA1,12)
    j = j+1
  }
} else {
  cnjts = c("s=1","s=6")
  bpy=do.call(rbind,lapply(bp_clean,function(x){return(x$data[[1]][,2])}))
  YV=do.call(rbind,lapply(bp_clean,function(x){return(x$stellarp[1])}))
  YT=do.call(rbind,lapply(bp_clean,function(x){return(x$stellarp[2])}))
  vf=as.character(bp_clean[[1]]$data[[1]][,1])
  colnames(bpy)=vf
  bfy=do.call(rbind,lapply(bf_clean,function(x){return(x$data[[1]][,2])}))
  colnames(bfy)=vf
  #
  siz=10
  #
  y=as.numeric(unlist(YT))
  cnjts = c("s=1","s=6")
  NG=10   # NĂºmero de Features
  NBLK = length(cnjts)
  NBITS= ceiling(log(NBLK*ncol(bandas[[cnjts[1]]]$mat),2))
  MAXV = ncol(bandas[[cnjts[1]]]$mat)
  #
  save(bpy,bfy,YT,YV,vf,siz,bandas,bfndas, 
       cnjts,st,NG,NBLK,NBITS,MAXV,y,fitness,encode,decode,
       SelFeatures,file="~/git/BL_2012/GA_NT31F2_IPAC.RData")
  #
  tt=list()
  GAs = list()
  solsT=list()
  j=1
  while(j <= NITER) {
    cat(paste("Starting Iteration:",j,"<br>",sep=""))
    flush.console()
    tt[[j]]= system.time({GA1 = ga("binary", fitness = fitness, nBits = NBITS*2*NG,
        monitor = FALSE,parallel=18,maxiter=1000,popSize=8000,keepBest=TRUE)})
    t_tot=tt[[j]]
    GAs[[j]]=GA1
    save(fitness,encode,decode,SelFeatures,GA1,t_tot,j,
       file=paste("~/git/BL_2012/GA_T_",sprintf("%02d",j),"_f32_IPAC.RData",sep=""))
    solsT[[j]]=FeaturesExt(GA1,12)
    j=j+1
  }
  #
  print(xtable(ldply(solsT,function(x){return(x$vpos[1])})),type="html")
  #
}

Reading Iteration:1
Reading Iteration:2
Reading Iteration:3
Reading Iteration:4
Reading Iteration:5
Reading Iteration:6
Reading Iteration:7
Reading Iteration:8
Reading Iteration:9

#
print(xtable(ldply(solsT,function(x){return(x$vpos[1])})),type="html")
V1
1 -70.48
2 -93.11
3 -86.63
4 -89.95
5 -72.02
6 -97.74
7 -87.07
8 -82.50
9 -83.24
# ISOL holds the best solution !
isol = which.max(as.numeric(ldply(solsT,function(x){return(x$vpos[1])})[,1])) 
isol

[1] 1

#

Modelado de TAN con esas Features

#
#
prep_datos=function(res){
  X=matrix(NA,nrow=length(res[[1]]$x),ncol=length(res))
  for (j in 1:length(res)) {
    X[,j]=res[[j]]$x/res[[j]]$x0
  }
  colnames(X)=paste("C",1:length(res),sep="")   
  return(X)
}
#
modelado=function(X,Y) {
  #
  trn=1:nrow(X)
  folds=5
  repeats=1
  #
  # mseeds <- vector(mode = "list", length = (folds+1))
  #for(i in 1:folds) mseeds[[i]] <- sample.int(nrow(X), 10)
  #    mseeds[[(folds+1)]] <- sample.int(20, 1)
  myControl = trainControl(method='cv', number=folds, 
                  repeats=repeats, returnResamp='none', 
                  returnData=FALSE, savePredictions=TRUE, 
                  verboseIter=FALSE, allowParallel=TRUE) # ,seeds=mseeds)
  #Train some models
  model_1.1 = "train(X[trn,], Y[trn], method='gbm', 
                     trControl=myControl,
                tuneGrid=expand.grid(n.trees=seq(200,500,100), n.minobsinnode=c(2,5),
                                     interaction.depth=seq(3,15,2), 
                                     shrinkage = seq(0.01,0.1,0.02)),verbose=FALSE)"
  model_1.2 = "train(X[trn,], Y[trn], method='blackboost', 
                  trControl=myControl, tuneGrid=expand.grid(.mstop=10^(2:4), 
                                     .maxdepth=seq(5,15,3)))"
  model_1.3 = "train(X[trn,], Y[trn], method='rf', 
                     trControl=myControl,
                     tuneGrid=expand.grid(.mtry=(2:ncol(X))),
                     proximity=TRUE)"
  model_1.4 = "train(X[trn,], Y[trn], method='knn', 
                     trControl=myControl, trace=FALSE,
                     tuneGrid=expand.grid(k=(2:min(9,ncol(X)))))"
  model_1.5 = "train(X[trn,], Y[trn], method='ppr', 
                     trControl=myControl,tuneGrid=expand.grid(.nterms=10))"
  model_1.6 = "train(X[trn,], Y[trn], method='earth', 
                     trControl=myControl,tuneGrid=expand.grid(.degree = 3,
                          .nprune = (1:10) * 2), metric = 'RMSE',
                          maximize = FALSE)"
  model_1.7 = "train(X[trn,], Y[trn], method='glm', 
                     trControl=myControl)"
  model_1.8 = "train(X[trn,], Y[trn], method='svmRadial', 
                     trControl=myControl,tuneGrid=expand.grid(.C=10^(-3:3), 
                      .sigma=c(10^(-4:3))))"
  model_1.9 = "train(X[trn,], Y[trn], method='cubist', 
                     trControl=myControl,commiittees=10)"
  model_1.10 = "train(X[trn,], Y[trn], method='kernelpls', 
                     trControl=myControl,tuneGrid=expand.grid(.ncomp=seq(2,ncol(X))))"
  model_1.11 = "train(X[trn,], Y[trn], method='nnet', 
                  trControl=myControl,tuneGrid=expand.grid(.size=seq(2,round(2*ncol(X)),3), 
                      .decay=c(1.e-5,1.e-2)),
                      linout=TRUE, rang = 300, trace=FALSE,
                      maxit=10000, reltol=1.0e-11, abstol=1.0e-6)"
  model_1.12 = "train(X[trn,], Y[trn], method='bagEarth', 
                  trControl=myControl,tuneGrid=expand.grid(.nprune=(1:10)*3,.degree=1:5))"
  model_1.13 = "train(X[trn,], Y[trn], method='cubist', trControl=myControl,
                  tuneGrid=expand.grid(.committees=3:50, .neighbors=1:7))"
  # 
  #Make a list of all the models
  lmodels=c("model_1.1", "model_1.2", "model_1.3", "model_1.4", 
            "model_1.5", "model_1.6", "model_1.7", "model_1.8", 
            "model_1.9", "model_1.10","model_1.11","model_1.12", 
            "model_1.13") 
#  
  all.models_5 =list()
  for (i in lmodels) {
    all.models_5[[length(all.models_5)+1]] = eval(parse(text=gsub('\n','',get(i))))
  }
  names(all.models_5) = sapply(all.models_5, function(x) x$method)
  # 
  ensam_5 <- caretList(X[trn,],Y[trn],methodList=c('svmRadial',
                        'gbm','blackboost','rf','earth','bagEarth'))
  ens <- caretEnsemble(ensam_5)
  #Make a linear regression ensemble
  if (exists("ens")){
    all.models_5[[length(all.models_5)+1]] = get("ens")
    names(all.models_5)[length(all.models_5)]="ENS"
  } 
  return(all.models_5)
}
#
# Function that returns Root Mean Squared Error
rmse <- function(error) {
    sqrt(mean(error^2,na.rm=TRUE))
}
# Function that returns Mean Absolute Error
mae <- function(error) {
    mean(abs(error),na.rm=TRUE)
}
#
#
if ( file.exists("~/git/BL_2012/GA_BL_Models3.RData")) {
  load("~/git/BL_2012/GA_BL_Models3.RData")
} else {
  res00=SelFeatures(GAs[[isol]],cnjts,bandas)   
  #
  X00 = prep_datos(res00)
  md5 = modelado(X00,as.numeric(YT[,1]))
  #
  xf0 = prep_datos(SelFeatures(GAs[[isol]],cnjts,bfndas))
  #
  YAn00=predict(md5[[which(names(md5)=="rf")]],xf0)
  #
  ref_tot0=data.frame(Name=as.character(ref$Name),
                chi2d=ref$T_chi2.T, T_ICA=ref$T_ICA,
                YAn00,T_real=ref$T_real)
  #
  save(bpy,bfy,YT,YV,vf,siz,bandas,bfndas,cnjts, NG,NBLK,NBITS,MAXV,md5,res00, 
       X00,YAn00,xf0,ref_tot0,bp_clean, bf_clean,
       file="~/git/BL_2012/GA_BL_Models3.RData")
}
GA_r3 Loading required package: gbm
GA_r3 Loading required package: survival
GA_r3 
GA_r3 Attaching package: 'survival'
GA_r3 
GA_r3 The following object is masked from 'package:caret':
GA_r3 
GA_r3     cluster
GA_r3 
GA_r3 Loading required package: splines
GA_r3 Loaded gbm 2.1.1
GA_r3 Loading required package: randomForest
GA_r3 randomForest 4.6-10
GA_r3 Type rfNews() to see new features/changes/bug fixes.
GA_r3 Loading required package: earth
GA_r3 Loading required package: plotmo
GA_r3 Loading required package: plotrix
GA_r3 Loading required package: TeachingDemos

Iter TrainDeviance ValidDeviance StepSize Improve 1 0.5227 -nan 0.1000 0.0203 2 0.4988 -nan 0.1000 0.0196 3 0.4844 -nan 0.1000 0.0137 4 0.4674 -nan 0.1000 0.0152 5 0.4467 -nan 0.1000 0.0149 6 0.4304 -nan 0.1000 0.0124 7 0.4173 -nan 0.1000 0.0091 8 0.4090 -nan 0.1000 -0.0094 9 0.4007 -nan 0.1000 0.0019 10 0.3880 -nan 0.1000 0.0023 20 0.3388 -nan 0.1000 0.0003 40 0.2718 -nan 0.1000 -0.0039 50 0.2542 -nan 0.1000 -0.0038

#
YTpknn00=predict(md5[[which(names(md5)=="knn")]],xf0)
save(YTpknn00,xf0,file="~/git/BL_2012/GA_predict_T_31F2.RData")
ref_tot0=data.frame(Name=as.character(ref$Name),chi2d=ref$T_chi2.T, T_ICA=ref$T_ICA,
                YAn00,T_real=ref$T_real)
res00=SelFeatures(GAs[[isol]],cnjts,bandas)
ref_tot2=ref_tot0
#
print(xtable(ldply(res00,function(x){return(c(x$name,x$name0))})),type="html")
V1 V2
1 1110-1119 2180-2189
2 2225-2234 710-719
3 2165-2174 1380-1389
4 1820-1829 2020-2029
5 815-824 1330-1339
6 1270-1279 715-724
7 1565-1574 745-754
8 1880-1889 1365-1374
9 1015-1024 1470-1479
10 1135-1144 1465-1474
#
ggplot(data=ref_tot2) + 
         geom_point(aes(x=T_real,y=chi2d),size=3) +
         xlab("Theoretical TAN [mg KOH]") + ylab("Chi2 [mg KOH]") +
         theme_bw() + #  scale_colour_brewer(palette="Set1") +
         geom_abline(position="identity", colour="gray") # +

         # xlim(2000,4200) + ylim(2000,4200)   +
         # guides(col=guide_legend(ncol=2))
#
ggplot(data=ref_tot2) + 
         geom_point(aes(x=T_real,y=T_ICA),size=3) +
         xlab("Theoretical TAN [mg KOH]") + ylab("ICA [mg KOH]") +
         theme_bw() + #  scale_colour_brewer(palette="Set1") +
         geom_abline(position="identity", colour="gray") 

#
ggplot(data=ref_tot2) + 
         geom_point(aes(x=T_real,y=YAn00),size=3) +
         xlab("Theoretical T [mg KOH]") + 
         ylab("ML predicted RF [mg KOH]") +
         theme_bw() + #  scale_colour_brewer(palette="Set1") +
         geom_abline(position="identity", colour="gray") 

#
lmod=c("rf","gbm","svmRadial","nnet","knn","bagEarth","kernelpls","cubist")
nmod=c("RF","GB","SVR","NNR","KNN","MARS","PLS","Rule-Regression")
for(ll in c(1:2)) {
  if (ll == 2) {
    pdf(file="~/git/BL_2012/GA_model_T3.pdf",width=12,heigh=10)
  }
  for (i in 1:length(lmod)) {
    YTm00=predict(md5[[which(names(md5)==lmod[i])[1]]],xf0)
    #
    ref_tot0m=data.frame(Name=as.character(ref$Name),
                chi2d=ref$T_chi2.T, T_ICA=ref$T_ICA,T_real=ref$T_real,YTm00)
    ref_tot2m=ref_tot0m 
    diffs=apply(ref_tot2m[,! colnames(ref_tot2m) %in% c("Name","SpT","T_real")],
              2,FUN="-",ref_tot2m[,"T_real"])
    rownames(diffs)=ref_tot2m[,"Name"]
    #
    cat(paste("T Modelling with ",lmod[i],". Error analysis follows.",sep=""))
    errors=data.frame(drmse = apply(diffs,2,rmse),dmae=apply(diffs,2,mae))
    print(xtable(errors),type="html")
  #
    print(ggplot(data=ref_tot2m) + 
           geom_point(aes(x=T_real,y=YTm00),size=3) +
           xlab("Theoretical TAN [mg KOH]") + 
           ylab(paste(nmod[i]," predicted [mg KOH]","")) +
           theme_bw() + geom_abline(position="identity", colour="red") )
  }
}
T Modelling with rf. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.12 0.75
T Modelling with gbm. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.14 0.70
T Modelling with svmRadial. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.10 0.65
T Modelling with nnet. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.07 0.67
T Modelling with knn. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.14 0.76
T Modelling with bagEarth. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.08 0.65
T Modelling with kernelpls. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.20 0.73
T Modelling with cubist. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.12 0.66
T Modelling with rf. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.12 0.75
T Modelling with gbm. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.14 0.70
T Modelling with svmRadial. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.10 0.65
T Modelling with nnet. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.07 0.67
T Modelling with knn. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.14 0.76
T Modelling with bagEarth. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.08 0.65
T Modelling with kernelpls. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.20 0.73
T Modelling with cubist. Error analysis follows.
drmse dmae
chi2d 1.20 0.95
T_ICA 1.11 0.81
YTm00 1.12 0.66
dev.off()

png 2

#

that’s all folks.