Training Models from GA-Selected Features

In previous steps, features were selected with a genetic algorithm (GA) that assessed the suitability of candidate features.

Different GA strategies exist for the fitness evaluation; we have tested the near-centroid and random-forest ones.

Feature selection was prepared according to document paso2.

Feature extraction as defined by the GA

For feature selection the BT-Settl 2012 library from France Allard was used, and a wavelength reduction was performed to make it compatible with the IPAC data.

# Processing of the GA results for T
setwd("~/git/M_sel")
# Data loaded by hand from the temporary file with 50 iterations
load("m_t-settl_5_rf.RData")
datos <- `m_t-settl_5_rf`
attributes(datos)
## $names
##  [1] "Class."             "Libraries."         "bb"                
##  [4] "bestChromosomes"    "bestFitness"        "call"              
##  [7] "callBackFunc"       "callEnhancerFunc"   "callPreFunc"       
## [10] "classes"            "collectMode"        "count"             
## [13] "countDivisor"       "countFilter"        "countLastColumn"   
## [16] "countValues"        "counted"            "data"              
## [19] "elapsedRun"         "elapsedTime"        "evolvedChromosomes"
## [22] "evolvedFitnesses"   "galgo"              "galgos"            
## [25] "gcCalls"            "gcFrequency"        "geneNames"         
## [28] "generation"         "iclasses"           "id"                
## [31] "leftTime"           "levels"             "main"              
## [34] "maxBigBangs"        "maxCounts"          "maxFitnesses"      
## [37] "maxSolutions"       "nClasses"           "onlySolutions"     
## [40] "running"            "sampleNames"        "saveFile"          
## [43] "saveFrequency"      "saveGeneBreaks"     "saveMode"          
## [46] "saveVariableName"   "solution"           "solutions"         
## [49] "startedTime"        "timing"             "userCancelled"     
## [52] "verbose"
# 
features <- list()
features$T <- c("45:52_55:62", "53:62_43:52", "44:51_52:61", "141:148_109:120", 
    "105:119_43:57", "150:159_139:146", "139:146_148:159")
# 
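
Each entry in features$T encodes two pixel-index ranges separated by an underscore: the signal band before the underscore and the continuum ("noise") band after it. A minimal illustration of the decoding applied below:

# "45:52_55:62"  ->  signal pixels 45:52, continuum pixels 55:62
parts <- strsplit("45:52_55:62", "_")[[1]]
parts[1]  # signal band, "45:52"
parts[2]  # continuum band, "55:62"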

Now we load the original data, bp_clean and bq_clean, and process the features according to the feature list.

In order to generate the features we use the formula \(\int_{\lambda_{1}}^{\lambda_{2}}\left(1-\frac{F_{line}}{F_{cont}}\right) d\lambda\), as given in formula (1) of the reference above.
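
In the implementation below this integral is approximated with trapezoidal sums over the selected pixels, treating the continuum as a constant mean level over its band:

\[ \mathrm{feature} \approx \sum_i \Delta\lambda_i \;-\; \frac{1}{\bar{F}_{cont}} \sum_i \Delta\lambda_i\,\bar{F}_i, \qquad \bar{F}_{cont} = \frac{\sum_j \Delta\lambda_j\,\bar{F}_j}{\sum_j \Delta\lambda_j}, \]

where the sums over \(i\) run over the signal band, the sums over \(j\) over the continuum band, \(\Delta\lambda_i\) are the pixel widths and \(\bar{F}_i\) the mean flux of consecutive pixel pairs; this is what the int_spec and feature_extr functions below compute.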

The band ranges are defined above, but \(F_{cont}\) is not fixed a priori. For that reason the genetic algorithm also tested different candidate continuum regions, looking for the combination that best explains the spectral parameters through the previous feature functions.

Once the signal points and continuum regions were identified, models were set up and assessed with a cross-validation procedure. Ideas for the implementation were taken from http://moderntoolmaking.blogspot.com.es/2013/03/caretensemble-classification-example.html

# Load bp_clean (BT-Settl) & bq_clean (IPAC)
load("~/git/M_prep/M_prep_cleanip_BT-Settl.RData")
# Extract the selected characteristics from the training set for T
signal <- unlist(lapply(features$T, function(x) {
    a <- strsplit(x, "_")
    return(a[[1]][1])
}))
noise <- unlist(lapply(features$T, function(x) {
    a <- strsplit(x, "_")
    return(a[[1]][2])
}))
sn <- cbind(signal, noise)
library(zoo)  # rollmean, used in the trapezoidal integration below
library(stringr)  # str_replace, used when renaming the feature columns
# Integrate a spectrum over the pixel range given by idx (e.g. "45:52"):
#   norm > 0 returns the integral of d(lambda) (the band width),
#   norm = 0 returns the trapezoidal integral of F d(lambda).
int_spec <- function(x, idx, norm = 0) {
    y <- x$data[[1]][eval(parse(text = idx)), ]
    xz <- diff(as.numeric(y[, 1]), 1)  # pixel widths (delta lambda)
    yz <- as.numeric(y[, 2])  # flux values
    if (norm > 0) {
        z <- sum(xz)
    } else {
        z <- sum(xz * rollmean(yz, 2))
    }
    return(z)
}
# 
# Compute one feature for every spectrum in bp: Fcont is the mean continuum
# flux over the continuum band, and the feature is the discretised version of
# formula (1): integral of d(lambda) - integral of F d(lambda) / Fcont.
feature_extr <- function(sn, bp) {
    sig <- sn[1]  # signal band indices
    noi <- sn[2]  # continuum band indices
    Fcont <- unlist(lapply(bp, int_spec, noi, 0))/unlist(lapply(bp, int_spec, 
        noi, 1))
    fea <- unlist(lapply(bp, int_spec, sig, 1)) - unlist(lapply(bp, int_spec, 
        sig, 0))/Fcont
    return(fea)
}
xx <- apply(sn, 1, feature_extr, bp_clean)
colnames(xx) <- as.character(sn[, 1])
colnames(xx) <- str_replace(paste("X", colnames(xx), sep = ""), ":", ".")
xx <- cbind(xx, unlist(lapply(bp_clean, function(x) {
    return(x$stellarp[1])
})))
colnames(xx)[ncol(xx)] <- "T"

# Report the wavelength ranges covered by the selected features
newfea <- cbind(t(apply(sn, 1, function(x) {
    return(range(bp_clean[[1]]$data[[1]][eval(parse(text = x[1])), 1]))
})), t(apply(sn, 1, function(x) {
    return(range(bp_clean[[1]]$data[[1]][eval(parse(text = x[2])), 1]))
})))
colnames(newfea) <- c("Signal_from", "Signal_To", "Cont_From", "Cont_To")

print(xtable(newfea), type = "html")
Signal_from Signal_To Cont_From Cont_To
1 8619.40 8644.60 8655.40 8680.60
2 8648.20 8680.60 8612.20 8644.60
3 8615.80 8641.00 8644.60 8677.00
4 8965.00 8990.20 8849.80 8889.40
5 8835.40 8885.80 8612.20 8662.60
6 8997.40 9029.80 8957.80 8983.00
7 8957.80 8983.00 8990.20 9029.80
# 

Regression and modeling

Let’s build several models and compare their performance. We randomly split the data into a learning and a test set, train every model with ten-fold cross-validation on the learning set, then evaluate the models on the unseen test data, and also try ensembles of the models.

# Setup
gc(reset = TRUE)
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells    771336   41.2    1265230   67.6    771336   41.2
## Vcells 512714595 3911.8  720315638 5495.6 512714595 3911.8
set.seed(42)  #From random.org

# Libraries
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:galgo':
## 
##     best, confusionMatrix
library(devtools)
## 
## Attaching package: 'devtools'
## 
## The following objects are masked from 'package:R.oo':
## 
##     check, unload
# Only once: install_github('caretEnsemble', 'zachmayer')  # Install Zach Mayer's
# caretEnsemble package. Code adapted from the author's post.
library(caretEnsemble)

# Data
library(mlbench)
xx <- as.data.frame(xx)
X <- xx[, -(ncol(xx))]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx[, ncol(xx)]

# Split train/test
train <- runif(nrow(X)) <= 0.66

# Setup CV Folds returnData=FALSE saves some space
folds = 10
repeats = 1
myControl <- trainControl(method = "cv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, verboseIter = FALSE, 
    allowParallel = TRUE, index = createMultiFolds(Y[train], k = folds, times = repeats))
# Train some models
model1 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl, 
    tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01), 
    verbose = FALSE)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: splines
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: parallel
## Loaded gbm 2.1
model2 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
## Loading required package: party
## Loading required package: grid
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## 
## The following objects are masked from 'package:R.oo':
## 
##     clone, dimension
## 
## The following object is masked from 'package:plyr':
## 
##     empty
## 
## Loading required package: mboost
## This is mboost 2.2-3. See 'package?mboost' and the NEWS file
## for a complete list of changes.
## Note: The default for the computation of the degrees of freedom has changed.
##       For details see section 'Global Options' of '?bols'.
## 
## Attaching package: 'mboost'
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
model3 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
## Loading required package: randomForest
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
model4 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl, 
    trace = FALSE)
## Loading required package: RSNNS
## Loading required package: Rcpp
## 
## Attaching package: 'RSNNS'
## 
## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, train
## 
## The following object is masked from 'package:galgo':
## 
##     confusionMatrix
model5 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
## Loading required package: earth
## Loading required package: plotmo
## Loading required package: plotrix
model7 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
## Loading required package: kernlab
## 
## Attaching package: 'kernlab'
## 
## The following object is masked from 'package:modeltools':
## 
##     prior
## 
## The following object is masked from 'package:galgo':
## 
##     scaling
model9 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.7-28. For overview type 'help("mgcv-package")'.
## 
## Attaching package: 'mgcv'
## 
## The following object is masked from 'package:magic':
## 
##     magic
model10 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 1.9-5
model11 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl, 
    trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)
## Loading required package: nnet

# Make a list of all the models
all.models <- list(model1, model2, model3, model4, model5, model6, model7, model8, 
    model9, model10, model11)
names(all.models) <- sapply(all.models, function(x) x$method)
sort(sapply(all.models, function(x) min(x$results$RMSE)))
##            ppr             rf          earth            gbm     blackboost 
##          64.08          71.37          74.94          75.77          93.99 
##            gam      svmRadial            glm         glmnet mlpWeightDecay 
##          97.53         101.08         126.78         127.05         478.13 
##           nnet 
##        2978.11

# Make a greedy ensemble - currently can only use RMSE
greedy <- caretEnsemble(all.models, iter = 1000L)
## Loading required package: pbapply
sort(greedy$weights, decreasing = TRUE)
##       ppr     earth        rf svmRadial       gam 
##     0.496     0.192     0.139     0.091     0.082
greedy$error
##  RMSE 
## 62.37
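
For reference, greedy weights of this kind typically come from Caruana-style ensemble selection: models are repeatedly added to the ensemble, with replacement, whenever they lower the cross-validated RMSE, and the final weights are the selection frequencies. Below is a minimal sketch of that idea, not the caretEnsemble implementation itself, assuming a hypothetical matrix P of cross-validated predictions (one column per model) and the observed target y:

# Caruana-style greedy ensemble selection (sketch); P and y are hypothetical inputs
greedy_select <- function(P, y, iter = 1000L) {
    rmse <- function(p) sqrt(mean((p - y)^2))
    counts <- setNames(rep(0L, ncol(P)), colnames(P))
    ens_sum <- rep(0, nrow(P))  # running sum of the selected prediction columns
    for (i in seq_len(iter)) {
        # RMSE of the ensemble if each model were added once more
        cand <- apply(P, 2, function(p) rmse((ens_sum + p)/i))
        best <- which.min(cand)
        counts[best] <- counts[best] + 1L
        ens_sum <- ens_sum + P[, best]
    }
    counts/iter  # weights = selection frequencies
}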

# Make a linear regression ensemble
linear <- caretStack(all.models, method = "glm", trControl = trainControl(method = "cv"))
summary(linear$ens_model$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -334.9   -21.7    -1.0    25.0   211.3  
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -3.3778    53.1730   -0.06  0.94941    
## gbm             -0.0639     0.1405   -0.46  0.64942    
## blackboost      -0.2494     0.0770   -3.24  0.00138 ** 
## rf               0.3468     0.1205    2.88  0.00440 ** 
## mlpWeightDecay  -0.0228     0.0142   -1.60  0.11155    
## ppr              0.4719     0.1000    4.72  4.2e-06 ***
## earth            0.2418     0.0782    3.09  0.00223 ** 
## glm             -0.5273     0.3735   -1.41  0.15950    
## svmRadial        0.2003     0.0573    3.50  0.00057 ***
## gam              0.0776     0.0458    1.69  0.09151 .  
## glmnet           0.5249     0.3753    1.40  0.16337    
## nnet                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3696)
## 
##     Null deviance: 42844307  on 230  degrees of freedom
## Residual deviance:   813028  on 220  degrees of freedom
## AIC: 2566
## 
## Number of Fisher Scoring iterations: 2
linear$error
##   parameter  RMSE Rsquared RMSESD RsquaredSD
## 1      none 63.28    0.977  21.54    0.01676

# Predict for test set:
preds <- data.frame(sapply(all.models, predict, newdata = X[!train, ]))
preds$ENS_greedy <- predict(greedy, newdata = X[!train, ])
preds$ENS_linear <- predict(linear, newdata = X[!train, ])
sort(sqrt(colMeans((preds - Y[!train])^2)))
##     ENS_greedy     ENS_linear          earth            ppr            gbm 
##          77.37          77.98          82.85          90.26          95.91 
##             rf     blackboost      svmRadial            gam         glmnet 
##          97.98         105.82         133.10         135.25         169.20 
##            glm mlpWeightDecay           nnet 
##         170.04         503.46        2877.07

plot(Y[!train], preds$ENS_greedy)
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee02.1: ENS_greedy predictions vs. observed T on the test set, with the 1:1 line]

Let’s look at the distribution of T in the learning dataset:

# T histogram
hist(d_bt[, 2], breaks = 20, main = "T histogram for learning dataset")
[plot of chunk lee02.2: histogram of T for the learning dataset]

IPAC validation

With the models and their ensembles built, it is time to predict on the IPAC dataset. We start by preparing the data:

# Compute the features for bq_clean (IPAC)
yy <- apply(sn, 1, feature_extr, bq_clean)
yy <- as.data.frame(yy)
colnames(yy) <- as.character(sn[, 1])
colnames(yy) <- str_replace(paste("X", colnames(yy), sep = ""), ":", ".")
# Predict for new dataset:
predf <- data.frame(sapply(all.models, predict, newdata = yy))
predf$ENS_greedy <- predict(greedy, newdata = yy)
predf$ENS_linear <- predict(linear, newdata = yy)

Neighbourhood Analysis

To assess how close or isolated the two sets (BT-SETTL and IPAC) are, a PCA is performed:

zz <- rbind(X, yy)
pcaz <- prcomp(zz)
plot(pcaz$x[, 1], pcaz$x[, 2], pch = ".")
points(pcaz$x[(nrow(X) + 1):nrow(zz), 1], pcaz$x[(nrow(X) + 1):nrow(zz), 2], 
    pch = "x", col = 3)
points(pcaz$x[1:nrow(X), 1], pcaz$x[1:nrow(X), 2], pch = "+", col = 2)
[plot of chunk lee04: PCA of the combined feature sets, BT-SETTL marked '+' in red and IPAC marked 'x' in green]

# BT-SETTL (RED): 2 4 8 17 18 19 21 22 24 25 31 32 33 34 35 36 37 39 40 49 53
# 61
smpl_b = c(2, 4, 8, 17, 18, 19, 21, 22, 24, 25, 31, 32, 33, 34, 35, 36, 37, 
    39, 40, 49, 53, 61)
for (i in smpl_b) {
    plot(bp_clean[[i]]$data[[1]], type = "l", main = bp_clean[[i]]$name)
}

[plots of chunk lee04: spectra of the 22 selected BT-SETTL objects]

# IPAC (GREEN): c(567, 716, 722, 791) - nrow(X) = 221 370 376 445
smpl_i = c(567, 716, 722, 791) - nrow(X)
for (i in smpl_i) {
    plot(bq_clean[[i]]$data[[1]], type = "l", main = bq_clean[[i]]$name)
}

[plots of chunk lee04: spectra of the 4 selected IPAC objects]

Estimation Comparison against ICA procedure

The predictions obtained here will be compared with those obtained by projecting the full spectra by means of ICA/JADE. That work was carried out by Dr Prendes-Gero at http://innova.uned.es, so we simply load her prediction datasets and compare them.

# Load the estimation (IPAC)
load("~/git/M_sel/IPAC_Belen_T.RData")
load("~/git/M_sel/belen_resul_T.RData")
rownames(dT) = rownames(dd)
lnm <- unlist(lapply(bq_clean, function(x) {
    return(x$name)
}))
idx = apply(data.frame(rownames(dT)), 1, function(x, y) {
    return(which(x == y))
}, lnm)
XY = cbind(dT[, ncol(dT)], predf$ENS_greedy[idx])
plot(XY)
[plot of chunk lee05: ENS_greedy predictions vs. the Prendes-Gero estimates]

plot(XY, xlim = c(0, 5000), ylim = c(0, 5000))
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee05: the same comparison with restricted axes and the 1:1 line]

hist((XY[, 1] - XY[, 2])/sd(XY[, 1] - XY[, 2]), breaks = 20)
[plot of chunk lee05: histogram of the standardized residuals]

mer1 = mean(XY[, 1] - XY[, 2])
sder1 = sd(XY[, 1] - XY[, 2])

The residuals thus have a mean of 301.1111 and a standard deviation of 370.895.

Estimation Comparison against the Temperature Inferred from the Spectral Subtype

Let’s compare the predictions of the current procedure with those provided by Dr Sarro Baro, derived from the likelihood associated with each star’s spectral subtype.

# Load the effective temperatures (Teffs.RData)
load("~/git/M_sel/Teffs.RData")
# 
# Look up an object in the vector of spectrum names y, trying up to l alternative
# identifiers from x; names are normalised (spaces -> '_', '+' -> 'p') before the
# search. Returns the matching position, or -1 if no unique match is found.
buscar = function(x, y, l = 4) {
    for (i in x[1:l]) {
        stg = gsub("+", "p", gsub(" ", "_", i), fixed = TRUE)
        if (nchar(stg) > 0) {
            pos = grep(stg, y)
            if (length(pos) == 1) {
                return(pos)
            }
        }
    }
    return(-1)
}
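# A quick illustration of the name normalisation performed inside buscar
# (the identifier below is purely hypothetical):
gsub("+", "p", gsub(" ", "_", "2MASS J12345+6789"), fixed = TRUE)
## [1] "2MASS_J12345p6789"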
# 
idx = apply(ipac_names[, -5], 1, buscar, lnm, 5)
XY = cbind(teff$fit[idx > 0], predf$ENS_greedy[idx[idx > 0]])
plot(XY[!is.na(XY[, 1]), ])
[plot of chunk lee06: ENS_greedy predictions vs. T from Dr Sarro]

# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY[!is.na(XY[, 1]), ], xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro", 
    ylab = "Greedy predicted T", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee06: the same comparison with restricted axes and the 1:1 line]

hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]), 
    1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)
[plot of chunk lee06: histogram of the standardized residuals]

mer2 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder2 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])

The residuals thus have a mean of 14.1379 and a standard deviation of 1064.5538.

Let’s compare Dr Prendes-Gero’s predictions with Prof. Sarro’s.

# Look up the names used in Belen's dataset ...
idbl = apply(ipac_names[, -5], 1, buscar, rownames(dT), 5)
XY = cbind(teff$fit[idbl[idbl > 0]], dT[idbl > 0, ncol(dT)])
plot(XY)
[plot of chunk lee06.01: T from Dr Prendes-Gero vs. T from Dr Sarro]

# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY, xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro", 
    ylab = "T from Mrs Prendes-Gero", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee06.01: the same comparison with restricted axes and the 1:1 line]

hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]), 
    1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)
## Error: character(0)
mer21 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder21 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])

The residuals thus have a mean of NA and a standard deviation of NA.

What if we test the data windows proposed by the paper?

As described in “The Infrared Telescope Facility (IRTF) spectral library: Spectral diagnostics for cool stars”, A&A 549, A129 (2013), the authors M. Cesetti, A. Pizzella, V. D. Ivanov, L. Morelli, E. M. Corsini and E. Dalla Bontà propose a set of ranges for the signal and for the continuum. It makes sense to test those same features on BT-SETTL instead of IRTF, at least for the ranges that fall inside the IPAC wavelength coverage. For this reason the last two rows of the paper’s I-band table do not make sense in the IPAC case and are not presented.

Just as a reminder, those values are:

# 
require(xtable)
print(xtable(bi[, 1:7]), "html")
Element From To ContFrom_1 ContTo_1 ContFrom_2 ContTo_2
1 Pa1 8461 8474 8474 8484 8563 8577
2 Ca1 8484 8513 8474 8484 8563 8577
3 Ca2 8522 8562 8474 8484 8563 8577
4 Pa2 8577 8619 8563 8577 8619 8642
5 Ca3 8642 8682 8619 8642 8700 8725
6 Pa3 8730 8772 8700 8725 8776 8792
7 Mg 8802 8811 8776 8792 8815 8850
8 Pa4 8850 8890 8815 8850 8890 8900
9 Pa5 9000 9030 8983 8998 9040 9050
# 
lgth <- 3.6  # wavelength step per pixel
org <- 8461 - lgth  # origin chosen so that 8461 maps to pixel 1
# Convert the paper's wavelength ranges (From/To, ContFrom_1/ContTo_1) to pixel indices
signal2 <- paste(round((bi[, 2] - org)/lgth), round((bi[, 3] - org)/lgth), sep = ":")
noise2 <- paste(round((bi[, 4] - org)/lgth), round((bi[, 5] - org)/lgth), sep = ":")
sn2 <- cbind(signal2, noise2)
xx2 <- apply(sn2, 1, feature_extr, bp_clean)
colnames(xx2) <- as.character(sn2[, 1])
colnames(xx2) <- str_replace(paste("X", colnames(xx2), sep = ""), ":", ".")
xx2 <- cbind(xx2, unlist(lapply(bp_clean, function(x) {
    return(x$stellarp[1])
})))
colnames(xx2)[ncol(xx2)] <- "T"

Now we perform the regression analysis

Regression analysis against features defined by the paper

# Data
library(mlbench)
xx2 <- as.data.frame(xx2)
X <- xx2[, -ncol(xx2)]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx2$T

# Train some models
model1.2 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl, 
    tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01), 
    verbose = FALSE)
model2.2 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
model3.2 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
model4.2 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl, 
    trace = FALSE)
model5.2 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6.2 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
model7.2 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8.2 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
model9.2 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
model10.2 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
model11.2 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl, 
    trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)

# Make a list of all the models
all.models.2 <- list(model1.2, model2.2, model3.2, model4.2, model5.2, model6.2, 
    model7.2, model8.2, model9.2, model10.2, model11.2)
names(all.models.2) <- sapply(all.models.2, function(x) x$method)
sort(sapply(all.models.2, function(x) min(x$results$RMSE)))
##          earth            gbm             rf            ppr            gam 
##          65.76          74.76          74.94          82.12          84.52 
##     blackboost            glm         glmnet      svmRadial mlpWeightDecay 
##          85.81         106.24         106.51         111.06         504.97 
##           nnet 
##        2978.11

# Make a greedy ensemble - currently can only use RMSE
greedy2 <- caretEnsemble(all.models.2, iter = 1000L)
sort(greedy2$weights, decreasing = TRUE)
##     earth       gbm       ppr svmRadial 
##     0.553     0.186     0.159     0.102
greedy2$error
## RMSE 
##   NA

# Make a linear regression ensemble
linear2 <- caretStack(all.models.2, method = "glm", trControl = trainControl(method = "cv"))
summary(linear2$ens_model$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -343.1   -22.8     5.3    28.8   190.9  
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -58.06409   39.08139   -1.49  0.13896    
## gbm              0.51797    0.15070    3.44  0.00072 ***
## blackboost      -0.08531    0.13012   -0.66  0.51286    
## rf              -0.28102    0.15430   -1.82  0.07009 .  
## mlpWeightDecay   0.00364    0.00925    0.39  0.69466    
## ppr              0.15557    0.06604    2.36  0.01948 *  
## earth            0.55749    0.08559    6.51    6e-10 ***
## glm              0.95023    0.44507    2.14  0.03400 *  
## svmRadial        0.13341    0.04867    2.74  0.00669 ** 
## gam              0.05041    0.06219    0.81  0.41855    
## glmnet          -0.98373    0.44701   -2.20  0.02893 *  
## nnet                  NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3528)
## 
##     Null deviance: 38063792  on 206  degrees of freedom
## Residual deviance:   691462  on 196  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 2291
## 
## Number of Fisher Scoring iterations: 2
linear2$error
##   parameter  RMSE Rsquared RMSESD RsquaredSD
## 1      none 64.82   0.9756  20.95    0.02026

# Predict for test set:
preds2 <- data.frame(sapply(all.models.2, predict, newdata = X[!train, ]))
preds2$ENS_greedy <- predict(greedy2, newdata = X[!train, ])
preds2$ENS_linear <- predict(linear2, newdata = X[!train, ])
sort(sqrt(colMeans((preds2 - Y[!train])^2)))
##     ENS_greedy            gbm     ENS_linear             rf          earth 
##          84.66          84.84          87.60          89.68          99.70 
##     blackboost            gam            ppr      svmRadial         glmnet 
##         100.03         108.91         110.42         122.93         126.91 
##            glm mlpWeightDecay           nnet 
##         128.37        1646.61        2877.07

Let’s compare the IPAC predictions against Dr Sarro’s estimations.

# Compute the paper-based features for bq_clean (IPAC)
yy2 <- apply(sn2, 1, feature_extr, bq_clean)
yy2 <- as.data.frame(yy2)
colnames(yy2) <- as.character(sn2[, 1])
colnames(yy2) <- str_replace(paste("X", colnames(yy2), sep = ""), ":", ".")
# Predict for new dataset:
predf2 <- data.frame(sapply(all.models.2, predict, newdata = yy2))
predf2$ENS_greedy <- predict(greedy2, newdata = yy2)
predf2$ENS_linear <- predict(linear2, newdata = yy2)
# 
idx = apply(ipac_names[, -5], 1, buscar, lnm, 5)
XY = cbind(teff$fit[idx > 0], predf2$ENS_greedy[idx[idx > 0]])
plot(XY[!is.na(XY[, 1]), ])
[plot of chunk lee10.01: ENS_greedy predictions (paper features) vs. T from Dr Sarro]

# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY[!is.na(XY[, 1]), ], xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro", 
    ylab = "Greedy predicted T", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee10.01: the same comparison with restricted axes and the 1:1 line]

hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]), 
    1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)
[plot of chunk lee10.01: histogram of the standardized residuals]

mer2 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder2 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])

The residuals thus have a mean of 252.7606 and a standard deviation of 1097.9708.

Now we can test the paper’s second proposal for the continuum ranges. Just as a reminder, those values are:

# 
require(xtable)
print(xtable(bi[, 1:7]), "html")
Element From To ContFrom_1 ContTo_1 ContFrom_2 ContTo_2
1 Pa1 8461 8474 8474 8484 8563 8577
2 Ca1 8484 8513 8474 8484 8563 8577
3 Ca2 8522 8562 8474 8484 8563 8577
4 Pa2 8577 8619 8563 8577 8619 8642
5 Ca3 8642 8682 8619 8642 8700 8725
6 Pa3 8730 8772 8700 8725 8776 8792
7 Mg 8802 8811 8776 8792 8815 8850
8 Pa4 8850 8890 8815 8850 8890 8900
9 Pa5 9000 9030 8983 8998 9040 9050
# 
lgth <- 3.6  # wavelength step per pixel, as above
org <- 8461 - lgth
# Same conversion as before, now using the second continuum ranges
# (ContFrom_2/ContTo_2) and only the first eight bands
signal3 <- paste(round((bi[1:8, 2] - org)/lgth), round((bi[1:8, 3] - org)/lgth), 
    sep = ":")
noise3 <- paste(round((bi[1:8, 6] - org)/lgth), round((bi[1:8, 7] - org)/lgth), 
    sep = ":")
sn3 <- cbind(signal3, noise3)
xx3 <- apply(sn3, 1, feature_extr, bp_clean)
colnames(xx3) <- as.character(sn3[, 1])
colnames(xx3) <- str_replace(paste("X", colnames(xx3), sep = ""), ":", ".")
xx3 <- cbind(xx3, unlist(lapply(bp_clean, function(x) {
    return(x$stellarp[1])
})))
colnames(xx3)[ncol(xx3)] <- "T"

Now we perform the regression analysis

Regression analysis against features defined by the paper

# Data
xx3 <- as.data.frame(xx3)
X <- xx3[, -ncol(xx3)]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx3$T

# Train some models
model1.3 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl, 
    tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01), 
    verbose = FALSE)
model2.3 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
model3.3 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
model4.3 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl, 
    trace = FALSE)
model5.3 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6.3 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
model7.3 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8.3 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
model9.3 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
model10.3 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
model11.3 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl, 
    trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)

# Make a list of all the models
all.models.3 <- list(model1.3, model2.3, model3.3, model4.3, model5.3, model6.3, 
    model7.3, model8.3, model9.3, model10.3, model11.3)
names(all.models.3) <- sapply(all.models.3, function(x) x$method)
sort(sapply(all.models.3, function(x) min(x$results$RMSE)))
##            ppr             rf            gbm          earth      svmRadial 
##          93.22         106.44         110.72         113.92         115.67 
##            gam     blackboost         glmnet            glm mlpWeightDecay 
##         120.10         126.68         135.13         135.13         609.73 
##           nnet 
##        2978.11

# Make a greedy ensemble - currently can only use RMSE
greedy3 <- caretEnsemble(all.models.3, iter = 1000L)
sort(greedy3$weights, decreasing = TRUE)
##       ppr svmRadial       gam       gbm     earth        rf      nnet 
##     0.512     0.160     0.100     0.097     0.088     0.042     0.001
greedy2$error  # NOTE: this probably should be greedy3$error
## RMSE 
##   NA

# Make a linear regression ensemble
linear3 <- caretStack(all.models.3, method = "glm", trControl = trainControl(method = "cv"))
summary(linear3$ens_model$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -477.0   -33.4     2.0    43.8   429.5  
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -59.7198    56.4001   -1.06    0.291    
## gbm              0.1361     0.1473    0.92    0.356    
## blackboost       0.0156     0.1183    0.13    0.895    
## rf               0.1360     0.1585    0.86    0.392    
## mlpWeightDecay  -0.0197     0.0104   -1.89    0.060 .  
## ppr              0.5936     0.0844    7.03  2.6e-11 ***
## earth            0.1030     0.0788    1.31    0.193    
## glm              1.1047     0.6765    1.63    0.104    
## svmRadial        0.1534     0.0701    2.19    0.030 *  
## gam              0.1025     0.0641    1.60    0.112    
## glmnet          -1.3086     0.6824   -1.92    0.056 .  
## nnet                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 7222)
## 
##     Null deviance: 42844307  on 230  degrees of freedom
## Residual deviance:  1588931  on 220  degrees of freedom
## AIC: 2721
## 
## Number of Fisher Scoring iterations: 2
linear3$error
##   parameter  RMSE Rsquared RMSESD RsquaredSD
## 1      none 84.78    0.959  38.25    0.03691

# Predict for test set:
preds3 <- data.frame(sapply(all.models.3, predict, newdata = X[!train, ]))
preds3$ENS_greedy <- predict(greedy3, newdata = X[!train, ])
preds3$ENS_linear <- predict(linear3, newdata = X[!train, ])
sort(sqrt(colMeans((preds3 - Y[!train])^2)))
##     ENS_greedy     ENS_linear             rf            gbm            ppr 
##          100.7          101.2          116.3          116.7          117.7 
##     blackboost            gam         glmnet            glm          earth 
##          133.8          140.5          156.8          156.8          163.1 
##      svmRadial mlpWeightDecay           nnet 
##          163.1          754.3         2877.1

Let’s compare the IPAC predictions against Dr Sarro’s estimations.

# 
yy3 <- apply(sn3, 1, feature_extr, bq_clean)
yy3 <- as.data.frame(yy3)
colnames(yy3) <- as.character(sn3[, 1])
colnames(yy3) <- str_replace(paste("X", colnames(yy3), sep = ""), ":", ".")
# Predict for new dataset:
predf3 <- data.frame(sapply(all.models.3, predict, newdata = yy3))
predf3$ENS_greedy <- predict(greedy3, newdata = yy3)
predf3$ENS_linear <- predict(linear3, newdata = yy3)
# 
idx = apply(ipac_names[, -5], 1, buscar, lnm, 5)
XY = cbind(teff$fit[idx > 0], predf3$ENS_greedy[idx[idx > 0]])
plot(XY[!is.na(XY[, 1]), ])
[plot of chunk lee15: ENS_greedy predictions (second continuum proposal) vs. T from Dr Sarro]

# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY[!is.na(XY[, 1]), ], xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro", 
    ylab = "Greedy predicted T", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)
[plot of chunk lee15: the same comparison with restricted axes and the 1:1 line]

hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]), 
    1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)
[plot of chunk lee15: histogram of the standardized residuals]

mer3 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder3 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])

The residuals thus have a mean of 219.8236 and a standard deviation of 1120.2989.

save.image(file = "paso3_rf_v1.RData")