Training Models from GA-Selected Features

In previous steps, features were selected with a genetic algorithm (GA) that assessed the suitability of candidate features.

Different GA strategies exist for the fitness evaluation; we have tested the near-centroid and random-forest ones.

Feature selection was prepared according to document paso2.

Feature extraction as defined by the GA

For feature selection the BT-Settl 2012 library from France Allard was used, and a wavelength reduction was performed to make it compatible with the IPAC data.

# Processing of the GA results for T
setwd("~/git/M_sel")
# Data loaded by hand from the temporary file with 50 iterations
load("m_t-settl_5_rf.RData")
datos <- `m_t-settl_5_rf`
attributes(datos)
## $names
##  [1] "Class."             "Libraries."         "bb"                
##  [4] "bestChromosomes"    "bestFitness"        "call"              
##  [7] "callBackFunc"       "callEnhancerFunc"   "callPreFunc"       
## [10] "classes"            "collectMode"        "count"             
## [13] "countDivisor"       "countFilter"        "countLastColumn"   
## [16] "countValues"        "counted"            "data"              
## [19] "elapsedRun"         "elapsedTime"        "evolvedChromosomes"
## [22] "evolvedFitnesses"   "galgo"              "galgos"            
## [25] "gcCalls"            "gcFrequency"        "geneNames"         
## [28] "generation"         "iclasses"           "id"                
## [31] "leftTime"           "levels"             "main"              
## [34] "maxBigBangs"        "maxCounts"          "maxFitnesses"      
## [37] "maxSolutions"       "nClasses"           "onlySolutions"     
## [40] "running"            "sampleNames"        "saveFile"          
## [43] "saveFrequency"      "saveGeneBreaks"     "saveMode"          
## [46] "saveVariableName"   "solution"           "solutions"         
## [49] "startedTime"        "timing"             "userCancelled"     
## [52] "verbose"
# 
features <- list()
features$T <- c("45:52_55:62", "53:62_43:52", "44:51_52:61", "141:148_109:120", 
    "105:119_43:57", "150:159_139:146", "139:146_148:159")
# 
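
Each entry in features$T encodes two pixel-index ranges separated by an underscore: the signal band before the underscore and the continuum ("noise") band after it. A minimal illustration of the decoding applied below:

# "45:52_55:62"  ->  signal pixels 45:52, continuum pixels 55:62
parts <- strsplit("45:52_55:62", "_")[[1]]
parts[1]  # signal band, "45:52"
parts[2]  # continuum band, "55:62"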

Now we load the original data, bp_clean and bq_clean, and process the features according to the feature list.

In order to generate the features we use the formula \(\int_{\lambda_{1}}^{\lambda_{2}}\left(1-\frac{F_{line}}{F_{cont}}\right) d\lambda\), as given in formula (1) of the reference above.
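
In the implementation below this integral is approximated with trapezoidal sums over the selected pixels, treating the continuum as a constant mean level over its band:

\[ \mathrm{feature} \approx \sum_i \Delta\lambda_i \;-\; \frac{1}{\bar{F}_{cont}} \sum_i \Delta\lambda_i\,\bar{F}_i, \qquad \bar{F}_{cont} = \frac{\sum_j \Delta\lambda_j\,\bar{F}_j}{\sum_j \Delta\lambda_j}, \]

where the sums over \(i\) run over the signal band, the sums over \(j\) over the continuum band, \(\Delta\lambda_i\) are the pixel widths and \(\bar{F}_i\) the mean flux of consecutive pixel pairs; this is what the int_spec and feature_extr functions below compute.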

The band ranges are defined above, but \(F_{cont}\) is not fixed a priori. For that reason the genetic algorithm also tested different candidate continuum regions, looking for the combination that best explains the spectral parameters through the previous feature functions.

Once the signal points and continuum regions were identified, models were set up and assessed with a cross-validation procedure. Ideas for the implementation were taken from http://moderntoolmaking.blogspot.com.es/2013/03/caretensemble-classification-example.html

# Load bp_clean (BT-Settl) & bq_clean (IPAC)
load("~/git/M_prep/M_prep_cleanip_BT-Settl.RData")
# Extract the selected characteristics from the training set for T
signal <- unlist(lapply(features$T, function(x) {
    a <- strsplit(x, "_")
    return(a[[1]][1])
}))
noise <- unlist(lapply(features$T, function(x) {
    a <- strsplit(x, "_")
    return(a[[1]][2])
}))
sn <- cbind(signal, noise)
library(zoo)  # rollmean, used in the trapezoidal integration below
library(stringr)  # str_replace, used when renaming the feature columns
# Integrate a spectrum over the pixel range given by idx (e.g. "45:52"):
#   norm > 0 returns the integral of d(lambda) (the band width),
#   norm = 0 returns the trapezoidal integral of F d(lambda).
int_spec <- function(x, idx, norm = 0) {
    y <- x$data[[1]][eval(parse(text = idx)), ]
    xz <- diff(as.numeric(y[, 1]), 1)  # pixel widths (delta lambda)
    yz <- as.numeric(y[, 2])  # flux values
    if (norm > 0) {
        z <- sum(xz)
    } else {
        z <- sum(xz * rollmean(yz, 2))
    }
    return(z)
}
# 
# Compute one feature for every spectrum in bp: Fcont is the mean continuum
# flux over the continuum band, and the feature is the discretised version of
# formula (1): integral of d(lambda) - integral of F d(lambda) / Fcont.
feature_extr <- function(sn, bp) {
    sig <- sn[1]  # signal band indices
    noi <- sn[2]  # continuum band indices
    Fcont <- unlist(lapply(bp, int_spec, noi, 0))/unlist(lapply(bp, int_spec, 
        noi, 1))
    fea <- unlist(lapply(bp, int_spec, sig, 1)) - unlist(lapply(bp, int_spec, 
        sig, 0))/Fcont
    return(fea)
}
xx <- apply(sn, 1, feature_extr, bp_clean)
colnames(xx) <- as.character(sn[, 1])
colnames(xx) <- str_replace(paste("X", colnames(xx), sep = ""), ":", ".")
xx <- cbind(xx, unlist(lapply(bp_clean, function(x) {
    return(x$stellarp[1])
})))
colnames(xx)[ncol(xx)] <- "T"

# Report the wavelength ranges covered by the selected features
newfea <- cbind(t(apply(sn, 1, function(x) {
    return(range(bp_clean[[1]]$data[[1]][eval(parse(text = x[1])), 1]))
})), t(apply(sn, 1, function(x) {
    return(range(bp_clean[[1]]$data[[1]][eval(parse(text = x[2])), 1]))
})))
colnames(newfea) <- c("Signal_from", "Signal_To", "Cont_From", "Cont_To")

print(xtable(newfea), type = "html")
Signal_from Signal_To Cont_From Cont_To
1 8619.40 8644.60 8655.40 8680.60
2 8648.20 8680.60 8612.20 8644.60
3 8615.80 8641.00 8644.60 8677.00
4 8965.00 8990.20 8849.80 8889.40
5 8835.40 8885.80 8612.20 8662.60
6 8997.40 9029.80 8957.80 8983.00
7 8957.80 8983.00 8990.20 9029.80
# 

Regression and modeling

Let’s build several models and compare their performance. We randomly split the data into a learning and a test set, train every model with ten-fold cross-validation on the learning set, then evaluate the models on the unseen test data, and also try ensembles of the models.

# Setup
gc(reset = TRUE)
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells    771336   41.2    1265230   67.6    771336   41.2
## Vcells 512714595 3911.8  720315638 5495.6 512714595 3911.8
set.seed(42)  #From random.org

# Libraries
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:galgo':
## 
##     best, confusionMatrix
library(devtools)
## 
## Attaching package: 'devtools'
## 
## The following objects are masked from 'package:R.oo':
## 
##     check, unload
# Only once: install_github('caretEnsemble', 'zachmayer')  # Install Zach Mayer's
# caretEnsemble package. Code adapted from the author's post.
library(caretEnsemble)

# Data
library(mlbench)
xx <- as.data.frame(xx)
X <- xx[, -(ncol(xx))]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx[, ncol(xx)]

# Split train/test
train <- runif(nrow(X)) <= 0.66

# Setup CV Folds returnData=FALSE saves some space
folds = 10
repeats = 1
myControl <- trainControl(method = "cv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, verboseIter = FALSE, 
    allowParallel = TRUE, index = createMultiFolds(Y[train], k = folds, times = repeats))
# Train some models
model1 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl, 
    tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01), 
    verbose = FALSE)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: splines
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: parallel
## Loaded gbm 2.1
model2 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
## Loading required package: party
## Loading required package: grid
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## 
## The following objects are masked from 'package:R.oo':
## 
##     clone, dimension
## 
## The following object is masked from 'package:plyr':
## 
##     empty
## 
## Loading required package: mboost
## This is mboost 2.2-3. See 'package?mboost' and the NEWS file
## for a complete list of changes.
## Note: The default for the computation of the degrees of freedom has changed.
##       For details see section 'Global Options' of '?bols'.
## 
## Attaching package: 'mboost'
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
model3 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
## Loading required package: randomForest
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
model4 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl, 
    trace = FALSE)
## Loading required package: RSNNS
## Loading required package: Rcpp
## 
## Attaching package: 'RSNNS'
## 
## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, train
## 
## The following object is masked from 'package:galgo':
## 
##     confusionMatrix
model5 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
## Loading required package: earth
## Loading required package: plotmo
## Loading required package: plotrix
model7 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
## Loading required package: kernlab
## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:modeltools':
## 
##     prior
## 
## The following object is masked from 'package:galgo':
## 
##     scaling
model9 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.7-28. For overview type 'help("mgcv-package")'.
## 
## Attaching package: 'mgcv'
## 
## The following object is masked from 'package:magic':
## 
##     magic
model10 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 1.9-5
model11 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl, 
    trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)
## Loading required package: nnet

# Make a list of all the models
all.models <- list(model1, model2, model3, model4, model5, model6, model7, model8, 
    model9, model10, model11)
names(all.models) <- sapply(all.models, function(x) x$method)
sort(sapply(all.models, function(x) min(x$results$RMSE)))
##            ppr             rf          earth            gbm     blackboost 
##          64.08          71.37          74.94          75.77          93.99 
##            gam      svmRadial            glm         glmnet mlpWeightDecay 
##          97.53         101.08         126.78         127.05         478.13 
##           nnet 
##        2978.11

# Make a greedy ensemble - currently can only use RMSE
greedy <- caretEnsemble(all.models, iter = 1000L)
## Loading required package: pbapply
sort(greedy$weights, decreasing = TRUE)
##       ppr     earth        rf svmRadial       gam 
##     0.496     0.192     0.139     0.091     0.082
greedy$error
##  RMSE 
## 62.37
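
For reference, greedy weights of this kind typically come from Caruana-style ensemble selection: models are repeatedly added to the ensemble, with replacement, whenever they lower the cross-validated RMSE, and the final weights are the selection frequencies. Below is a minimal sketch of that idea, not the caretEnsemble implementation itself, assuming a hypothetical matrix P of cross-validated predictions (one column per model) and the observed target y:

# Caruana-style greedy ensemble selection (sketch); P and y are hypothetical inputs
greedy_select <- function(P, y, iter = 1000L) {
    rmse <- function(p) sqrt(mean((p - y)^2))
    counts <- setNames(rep(0L, ncol(P)), colnames(P))
    ens_sum <- rep(0, nrow(P))  # running sum of the selected prediction columns
    for (i in seq_len(iter)) {
        # RMSE of the ensemble if each model were added once more
        cand <- apply(P, 2, function(p) rmse((ens_sum + p)/i))
        best <- which.min(cand)
        counts[best] <- counts[best] + 1L
        ens_sum <- ens_sum + P[, best]
    }
    counts/iter  # weights = selection frequencies
}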

# Make a linear regression ensemble
linear <- caretStack(all.models, method = "glm", trControl = trainControl(method = "cv"))
summary(linear$ens_model$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -334.9   -21.7    -1.0    25.0   211.3  
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -3.3778    53.1730   -0.06  0.94941    
## gbm             -0.0639     0.1405   -0.46  0.64942    
## blackboost      -0.2494     0.0770   -3.24  0.00138 ** 
## rf               0.3468     0.1205    2.88  0.00440 ** 
## mlpWeightDecay  -0.0228     0.0142   -1.60  0.11155    
## ppr              0.4719     0.1000    4.72  4.2e-06 ***
## earth            0.2418     0.0782    3.09  0.00223 ** 
## glm             -0.5273     0.3735   -1.41  0.15950    
## svmRadial        0.2003     0.0573    3.50  0.00057 ***
## gam              0.0776     0.0458    1.69  0.09151 .  
## glmnet           0.5249     0.3753    1.40  0.16337    
## nnet                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3696)
## 
##     Null deviance: 42844307  on 230  degrees of freedom
## Residual deviance:   813028  on 220  degrees of freedom
## AIC: 2566
## 
## Number of Fisher Scoring iterations: 2
linear$error
##   parameter  RMSE Rsquared RMSESD RsquaredSD
## 1      none 63.28    0.977  21.54    0.01676

# Predict for test set:
preds <- data.frame(sapply(all.models, predict, newdata = X[!train, ]))
preds$ENS_greedy <- predict(greedy, newdata = X[!train, ])
preds$ENS_linear <- predict(linear, newdata = X[!train, ])
sort(sqrt(colMeans((preds - Y[!train])^2)))
##     ENS_greedy     ENS_linear          earth            ppr            gbm 
##          77.37          77.98          82.85          90.26          95.91 
##             rf     blackboost      svmRadial            gam         glmnet 
##          97.98         105.82         133.10         135.25         169.20 
##            glm mlpWeightDecay           nnet 
##         170.04         503.46        2877.07

plot(Y[!train], preds$ENS_greedy)
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee02.1: ENS_greedy predictions vs. observed T on the test set, with the 1:1 line]

Let’s look at the distribution of T in the learning dataset:

# T histogram
hist(d_bt[, 2], breaks = 20, main = "T histogram for learning dataset")
[plot of chunk lee02.2: histogram of T for the learning dataset]

IPAC validation

With the models and their ensembles built, it is time to predict on the IPAC dataset. We start by preparing the data:

# Compute the features for bq_clean (IPAC)
yy <- apply(sn, 1, feature_extr, bq_clean)
yy <- as.data.frame(yy)
colnames(yy) <- as.character(sn[, 1])
colnames(yy) <- str_replace(paste("X", colnames(yy), sep = ""), ":", ".")
# Predict for new dataset:
predf <- data.frame(sapply(all.models, predict, newdata = yy))
predf$ENS_greedy <- predict(greedy, newdata = yy)
predf$ENS_linear <- predict(linear, newdata = yy)

Neighbourhood Analysis

To assess how close or isolated the two sets (BT-SETTL and IPAC) are, a PCA is performed:

zz <- rbind(X, yy)
pcaz <- prcomp(zz)
plot(pcaz$x[, 1], pcaz$x[, 2], pch = ".")
points(pcaz$x[(nrow(X) + 1):nrow(zz), 1], pcaz$x[(nrow(X) + 1):nrow(zz), 2], 
    pch = "x", col = 3)
points(pcaz$x[1:nrow(X), 1], pcaz$x[1:nrow(X), 2], pch = "+", col = 2)
[plot of chunk lee04: PCA of the combined feature sets, BT-SETTL marked '+' in red and IPAC marked 'x' in green]

# BT-SETTL (RED): 2 4 8 17 18 19 21 22 24 25 31 32 33 34 35 36 37 39 40 49 53
# 61
smpl_b = c(2, 4, 8, 17, 18, 19, 21, 22, 24, 25, 31, 32, 33, 34, 35, 36, 37, 
    39, 40, 49, 53, 61)
for (i in smpl_b) {
    plot(bp_clean[[i]]$data[[1]], type = "l", main = bp_clean[[i]]$name)
}

[plots of chunk lee04: spectra of the 22 selected BT-SETTL objects]

# IPAC (GREEN): c(567, 716, 722, 791) - nrow(X) = 221 370 376 445
smpl_i = c(567, 716, 722, 791) - nrow(X)
for (i in smpl_i) {
    plot(bq_clean[[i]]$data[[1]], type = "l", main = bq_clean[[i]]$name)
}

[plots of chunk lee04: spectra of the 4 selected IPAC objects]

Estimation Comparison against ICA procedure

The predictions obtained here will be compared with those obtained by projecting the full spectra by means of ICA/JADE. That work was carried out by Dr Prendes-Gero at http://innova.uned.es, so we simply load her prediction datasets and compare them.

# Load the estimation (IPAC)
load("~/git/M_sel/IPAC_Belen_T.RData")
load("~/git/M_sel/belen_resul_T.RData")
rownames(dT) = rownames(dd)
lnm <- unlist(lapply(bq_clean, function(x) {
    return(x$name)
}))
idx = apply(data.frame(rownames(dT)), 1, function(x, y) {
    return(which(x == y))
}, lnm)
XY = cbind(dT[, ncol(dT)], predf$ENS_greedy[idx])
plot(XY)
[plot of chunk lee05: ENS_greedy predictions vs. the Prendes-Gero estimates]

plot(XY, xlim = c(0, 5000), ylim = c(0, 5000))
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee05: the same comparison with restricted axes and the 1:1 line]

hist((XY[, 1] - XY[, 2])/sd(XY[, 1] - XY[, 2]), breaks = 20)
[plot of chunk lee05: histogram of the standardized residuals]

mer1 = mean(XY[, 1] - XY[, 2])
sder1 = sd(XY[, 1] - XY[, 2])

The residuals thus have a mean of 301.1111 and a standard deviation of 370.895.

Estimation Comparison against the Temperature Inferred from the Spectral Subtype

Let’s compare the predictions of the current procedure with those provided by Dr Sarro Baro, derived from the likelihood associated with each star’s spectral subtype.

# Load the effective temperatures (Teffs.RData)
load("~/git/M_sel/Teffs.RData")
# 
# Look up an object in the vector of spectrum names y, trying up to l alternative
# identifiers from x; names are normalised (spaces -> '_', '+' -> 'p') before the
# search. Returns the matching position, or -1 if no unique match is found.
buscar = function(x, y, l = 4) {
    for (i in x[1:l]) {
        stg = gsub("+", "p", gsub(" ", "_", i), fixed = TRUE)
        if (nchar(stg) > 0) {
            pos = grep(stg, y)
            if (length(pos) == 1) {
                return(pos)
            }
        }
    }
    return(-1)
}
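# A quick illustration of the name normalisation performed inside buscar
# (the identifier below is purely hypothetical):
gsub("+", "p", gsub(" ", "_", "2MASS J12345+6789"), fixed = TRUE)
## [1] "2MASS_J12345p6789"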
# 
idx = apply(ipac_names[, -5], 1, buscar, lnm, 5)
XY = cbind(teff$fit[idx > 0], predf$ENS_greedy[idx[idx > 0]])
plot(XY[!is.na(XY[, 1]), ])
[plot of chunk lee06: ENS_greedy predictions vs. T from Dr Sarro]

# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY[!is.na(XY[, 1]), ], xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro", 
    ylab = "Greedy predicted T", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee06: the same comparison with restricted axes and the 1:1 line]

hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]), 
    1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)
[plot of chunk lee06: histogram of the standardized residuals]

mer2 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder2 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])

The residuals thus have a mean of 14.1379 and a standard deviation of 1064.5538.

Let’s compare Dr Prendes-Gero’s predictions with Prof. Sarro’s.

# Look up the names used in Belen's dataset ...
idbl = apply(ipac_names[, -5], 1, buscar, rownames(dT), 5)
XY = cbind(teff$fit[idbl[idbl > 0]], dT[idbl > 0, ncol(dT)])
plot(XY)
[plot of chunk lee06.01: T from Dr Prendes-Gero vs. T from Dr Sarro]

# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY, xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro", 
    ylab = "T from Mrs Prendes-Gero", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee06.01: the same comparison with restricted axes and the 1:1 line]

hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]), 
    1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)
## Error: character(0)
mer21 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder21 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])

The residuals thus have a mean of NA and a standard deviation of NA.

What if we test the data windows proposed by the paper?

As described in “The Infrared Telescope Facility (IRTF) spectral library: Spectral diagnostics for cool stars”, A&A 549, A129 (2013), the authors M. Cesetti, A. Pizzella, V. D. Ivanov, L. Morelli, E. M. Corsini and E. Dalla Bontà propose a set of ranges for the signal and for the continuum. It makes sense to test those same features on BT-SETTL instead of IRTF, at least for the ranges that fall inside the IPAC wavelength coverage. For this reason the last two rows of the paper’s I-band table do not make sense in the IPAC case and are not presented.

Just as a reminder, those values are:

# 
require(xtable)
print(xtable(bi[, 1:7]), "html")
Element From To ContFrom_1 ContTo_1 ContFrom_2 ContTo_2
1 Pa1 8461 8474 8474 8484 8563 8577
2 Ca1 8484 8513 8474 8484 8563 8577
3 Ca2 8522 8562 8474 8484 8563 8577
4 Pa2 8577 8619 8563 8577 8619 8642
5 Ca3 8642 8682 8619 8642 8700 8725
6 Pa3 8730 8772 8700 8725 8776 8792
7 Mg 8802 8811 8776 8792 8815 8850
8 Pa4 8850 8890 8815 8850 8890 8900
9 Pa5 9000 9030 8983 8998 9040 9050
# 
lgth <- 3.6  # wavelength step per pixel
org <- 8461 - lgth  # origin chosen so that 8461 maps to pixel 1
# Convert the paper's wavelength ranges (From/To, ContFrom_1/ContTo_1) to pixel indices
signal2 <- paste(round((bi[, 2] - org)/lgth), round((bi[, 3] - org)/lgth), sep = ":")
noise2 <- paste(round((bi[, 4] - org)/lgth), round((bi[, 5] - org)/lgth), sep = ":")
sn2 <- cbind(signal2, noise2)
xx2 <- apply(sn2, 1, feature_extr, bp_clean)
colnames(xx2) <- as.character(sn2[, 1])
colnames(xx2) <- str_replace(paste("X", colnames(xx2), sep = ""), ":", ".")
xx2 <- cbind(xx2, unlist(lapply(bp_clean, function(x) {
    return(x$stellarp[1])
})))
colnames(xx2)[ncol(xx2)] <- "T"

Now we perform the regression analysis

Regression analysis against features defined by the paper

# Data
library(mlbench)
xx2 <- as.data.frame(xx2)
X <- xx2[, -ncol(xx2)]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx2$T

# Train some models
model1.2 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl, 
    tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01), 
    verbose = FALSE)
model2.2 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
model3.2 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
model4.2 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl, 
    trace = FALSE)
model5.2 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6.2 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
model7.2 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8.2 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
model9.2 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
model10.2 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
model11.2 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl, 
    trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)

# Make a list of all the models
all.models.2 <- list(model1.2, model2.2, model3.2, model4.2, model5.2, model6.2, 
    model7.2, model8.2, model9.2, model10.2, model11.2)
names(all.models.2) <- sapply(all.models.2, function(x) x$method)
sort(sapply(all.models.2, function(x) min(x$results$RMSE)))
##          earth            gbm             rf            ppr            gam 
##          65.76          74.76          74.94          82.12          84.52 
##     blackboost            glm         glmnet      svmRadial mlpWeightDecay 
##          85.81         106.24         106.51         111.06         504.97 
##           nnet 
##        2978.11

# Make a greedy ensemble - currently can only use RMSE
greedy2 <- caretEnsemble(all.models.2, iter = 1000L)
sort(greedy2$weights, decreasing = TRUE)
##     earth       gbm       ppr svmRadial 
##     0.553     0.186     0.159     0.102
greedy2$error
## RMSE 
##   NA

# Make a linear regression ensemble
linear2 <- caretStack(all.models.2, method = "glm", trControl = trainControl(method = "cv"))
summary(linear2$ens_model$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -343.1   -22.8     5.3    28.8   190.9  
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -58.06409   39.08139   -1.49  0.13896    
## gbm              0.51797    0.15070    3.44  0.00072 ***
## blackboost      -0.08531    0.13012   -0.66  0.51286    
## rf              -0.28102    0.15430   -1.82  0.07009 .  
## mlpWeightDecay   0.00364    0.00925    0.39  0.69466    
## ppr              0.15557    0.06604    2.36  0.01948 *  
## earth            0.55749    0.08559    6.51    6e-10 ***
## glm              0.95023    0.44507    2.14  0.03400 *  
## svmRadial        0.13341    0.04867    2.74  0.00669 ** 
## gam              0.05041    0.06219    0.81  0.41855    
## glmnet          -0.98373    0.44701   -2.20  0.02893 *  
## nnet                  NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3528)
## 
##     Null deviance: 38063792  on 206  degrees of freedom
## Residual deviance:   691462  on 196  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 2291
## 
## Number of Fisher Scoring iterations: 2
linear2$error
##   parameter  RMSE Rsquared RMSESD RsquaredSD
## 1      none 64.82   0.9756  20.95    0.02026

# Predict for test set:
preds2 <- data.frame(sapply(all.models.2, predict, newdata = X[!train, ]))
preds2$ENS_greedy <- predict(greedy2, newdata = X[!train, ])
preds2$ENS_linear <- predict(linear2, newdata = X[!train, ])
sort(sqrt(colMeans((preds2 - Y[!train])^2)))
##     ENS_greedy            gbm     ENS_linear             rf          earth 
##          84.66          84.84          87.60          89.68          99.70 
##     blackboost            gam            ppr      svmRadial         glmnet 
##         100.03         108.91         110.42         122.93         126.91 
##            glm mlpWeightDecay           nnet 
##         128.37        1646.61        2877.07

Let’s compare the IPAC predictions against Dr Sarro’s estimations.

# Compute the paper-based features for bq_clean (IPAC)
yy2 <- apply(sn2, 1, feature_extr, bq_clean)
yy2 <- as.data.frame(yy2)
colnames(yy2) <- as.character(sn2[, 1])
colnames(yy2) <- str_replace(paste("X", colnames(yy2), sep = ""), ":", ".")
# Predict for new dataset:
predf2 <- data.frame(sapply(all.models.2, predict, newdata = yy2))
predf2$ENS_greedy <- predict(greedy2, newdata = yy2)
predf2$ENS_linear <- predict(linear2, newdata = yy2)
# 
idx = apply(ipac_names[, -5], 1, buscar, lnm, 5)
XY = cbind(teff$fit[idx > 0], predf2$ENS_greedy[idx[idx > 0]])
plot(XY[!is.na(XY[, 1]), ])
[plot of chunk lee10.01: ENS_greedy predictions (paper features) vs. T from Dr Sarro]

# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY[!is.na(XY[, 1]), ], xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro", 
    ylab = "Greedy predicted T", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee10.01: the same comparison with restricted axes and the 1:1 line]

hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]), 
    1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)
[plot of chunk lee10.01: histogram of the standardized residuals]

mer2 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder2 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])

The residuals thus have a mean of 252.7606 and a standard deviation of 1097.9708.

Now we can test the paper’s second proposal for the continuum ranges. Just as a reminder, those values are:

# 
require(xtable)
print(xtable(bi[, 1:7]), "html")
Element From To ContFrom_1 ContTo_1 ContFrom_2 ContTo_2
1 Pa1 8461 8474 8474 8484 8563 8577
2 Ca1 8484 8513 8474 8484 8563 8577
3 Ca2 8522 8562 8474 8484 8563 8577
4 Pa2 8577 8619 8563 8577 8619 8642
5 Ca3 8642 8682 8619 8642 8700 8725
6 Pa3 8730 8772 8700 8725 8776 8792
7 Mg 8802 8811 8776 8792 8815 8850
8 Pa4 8850 8890 8815 8850 8890 8900
9 Pa5 9000 9030 8983 8998 9040 9050
# 
lgth <- 3.6  # wavelength step per pixel, as above
org <- 8461 - lgth
# Same conversion as before, now using the second continuum ranges
# (ContFrom_2/ContTo_2) and only the first eight bands
signal3 <- paste(round((bi[1:8, 2] - org)/lgth), round((bi[1:8, 3] - org)/lgth), 
    sep = ":")
noise3 <- paste(round((bi[1:8, 6] - org)/lgth), round((bi[1:8, 7] - org)/lgth), 
    sep = ":")
sn3 <- cbind(signal3, noise3)
xx3 <- apply(sn3, 1, feature_extr, bp_clean)
colnames(xx3) <- as.character(sn3[, 1])
colnames(xx3) <- str_replace(paste("X", colnames(xx3), sep = ""), ":", ".")
xx3 <- cbind(xx3, unlist(lapply(bp_clean, function(x) {
    return(x$stellarp[1])
})))
colnames(xx3)[ncol(xx3)] <- "T"

Now we perform the regression analysis

Regression analysis against features defined by the paper

# Data
xx3 <- as.data.frame(xx3)
X <- xx3[, -ncol(xx3)]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx3$T

# Train some models
model1.3 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl, 
    tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01), 
    verbose = FALSE)
model2.3 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
model3.3 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
model4.3 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl, 
    trace = FALSE)
model5.3 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6.3 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
model7.3 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8.3 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
model9.3 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
model10.3 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
model11.3 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl, 
    trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)

# Make a list of all the models
all.models.3 <- list(model1.3, model2.3, model3.3, model4.3, model5.3, model6.3, 
    model7.3, model8.3, model9.3, model10.3, model11.3)
names(all.models.3) <- sapply(all.models.3, function(x) x$method)
sort(sapply(all.models.3, function(x) min(x$results$RMSE)))
##            ppr             rf            gbm          earth      svmRadial 
##          93.22         106.44         110.72         113.92         115.67 
##            gam     blackboost         glmnet            glm mlpWeightDecay 
##         120.10         126.68         135.13         135.13         609.73 
##           nnet 
##        2978.11

# Make a greedy ensemble - currently can only use RMSE
greedy3 <- caretEnsemble(all.models.3, iter = 1000L)
sort(greedy3$weights, decreasing = TRUE)
##       ppr svmRadial       gam       gbm     earth        rf      nnet 
##     0.512     0.160     0.100     0.097     0.088     0.042     0.001
greedy2$error  # NOTE: this probably should be greedy3$error
## RMSE 
##   NA

# Make a linear regression ensemble
linear3 <- caretStack(all.models.3, method = "glm", trControl = trainControl(method = "cv"))
summary(linear3$ens_model$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -477.0   -33.4     2.0    43.8   429.5  
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -59.7198    56.4001   -1.06    0.291    
## gbm              0.1361     0.1473    0.92    0.356    
## blackboost       0.0156     0.1183    0.13    0.895    
## rf               0.1360     0.1585    0.86    0.392    
## mlpWeightDecay  -0.0197     0.0104   -1.89    0.060 .  
## ppr              0.5936     0.0844    7.03  2.6e-11 ***
## earth            0.1030     0.0788    1.31    0.193    
## glm              1.1047     0.6765    1.63    0.104    
## svmRadial        0.1534     0.0701    2.19    0.030 *  
## gam              0.1025     0.0641    1.60    0.112    
## glmnet          -1.3086     0.6824   -1.92    0.056 .  
## nnet                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 7222)
## 
##     Null deviance: 42844307  on 230  degrees of freedom
## Residual deviance:  1588931  on 220  degrees of freedom
## AIC: 2721
## 
## Number of Fisher Scoring iterations: 2
linear3$error
##   parameter  RMSE Rsquared RMSESD RsquaredSD
## 1      none 84.78    0.959  38.25    0.03691

# Predict for test set:
preds3 <- data.frame(sapply(all.models.3, predict, newdata = X[!train, ]))
preds3$ENS_greedy <- predict(greedy3, newdata = X[!train, ])
preds3$ENS_linear <- predict(linear3, newdata = X[!train, ])
sort(sqrt(colMeans((preds3 - Y[!train])^2)))
##     ENS_greedy     ENS_linear             rf            gbm            ppr 
##          100.7          101.2          116.3          116.7          117.7 
##     blackboost            gam         glmnet            glm          earth 
##          133.8          140.5          156.8          156.8          163.1 
##      svmRadial mlpWeightDecay           nnet 
##          163.1          754.3         2877.1

Let’s compare the IPAC predictions against Dr Sarro’s estimations.

# 
yy3 <- apply(sn3, 1, feature_extr, bq_clean)
yy3 <- as.data.frame(yy3)
colnames(yy3) <- as.character(sn3[, 1])
colnames(yy3) <- str_replace(paste("X", colnames(yy3), sep = ""), ":", ".")
# Predict for new dataset:
predf3 <- data.frame(sapply(all.models.3, predict, newdata = yy3))
predf3$ENS_greedy <- predict(greedy3, newdata = yy3)
predf3$ENS_linear <- predict(linear3, newdata = yy3)
# 
idx = apply(ipac_names[, -5], 1, buscar, lnm, 5)
XY = cbind(teff$fit[idx > 0], predf3$ENS_greedy[idx[idx > 0]])
plot(XY[!is.na(XY[, 1]), ])
[plot of chunk lee15: ENS_greedy predictions (second continuum proposal) vs. T from Dr Sarro]

# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY[!is.na(XY[, 1]), ], xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro", 
    ylab = "Greedy predicted T", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee15: the same comparison with restricted axes and the 1:1 line]

hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]), 
    1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)
[plot of chunk lee15: histogram of the standardized residuals]

mer3 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder3 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])

The residuals thus have a mean of 219.8236 and a standard deviation of 1120.2989.

save.image(file = "paso3_rf_v1.RData")