In previous steps, features were selected with a genetic algorithm (GA) that assessed their suitability.
Different GA strategies for fitness evaluation were tried; we tested nearest-centroid and random-forest fitness functions.
Feature selection was carried out as described in document paso2.
For feature selection the BT-Settl 2012 library by France Allard was chosen, and its wavelength grid was reduced to be compatible with the data coming from IPAC.
# Processing of the Teff genetic-algorithm results
setwd("~/git/M_sel")
# Data loaded by hand from the temporary file with 50 iterations
load("m_t-settl_5_rf.RData")
datos <- `m_t-settl_5_rf`
attributes(datos)
## $names
## [1] "Class." "Libraries." "bb"
## [4] "bestChromosomes" "bestFitness" "call"
## [7] "callBackFunc" "callEnhancerFunc" "callPreFunc"
## [10] "classes" "collectMode" "count"
## [13] "countDivisor" "countFilter" "countLastColumn"
## [16] "countValues" "counted" "data"
## [19] "elapsedRun" "elapsedTime" "evolvedChromosomes"
## [22] "evolvedFitnesses" "galgo" "galgos"
## [25] "gcCalls" "gcFrequency" "geneNames"
## [28] "generation" "iclasses" "id"
## [31] "leftTime" "levels" "main"
## [34] "maxBigBangs" "maxCounts" "maxFitnesses"
## [37] "maxSolutions" "nClasses" "onlySolutions"
## [40] "running" "sampleNames" "saveFile"
## [43] "saveFrequency" "saveGeneBreaks" "saveMode"
## [46] "saveVariableName" "solution" "solutions"
## [49] "startedTime" "timing" "userCancelled"
## [52] "verbose"
#
features <- list()
features$T <- c("45:52_55:62", "53:62_43:52", "44:51_52:61", "141:148_109:120",
"105:119_43:57", "150:159_139:146", "139:146_148:159")
#
Now we load the original data, bp_clean and bq_clean, and process the features according to the feature list.
To generate the features we use the formula \(\int_{\lambda_1}^{\lambda_2}\left(1-\frac{F_{line}}{F_{cont}}\right)\,d\lambda\), as depicted in formula (1) of the reference above.
The band wavelength ranges are defined in the previous table, but \(F_{cont}\) is not. Therefore a genetic algorithm was run to test different candidate continuum regions, looking for the ones that best explain the spectral parameters through the previous feature functions.
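For the implementation below it helps to expand the integral; this small derivation (consistent with the feature_extr code defined later) shows the two quantities actually computed:
\[
\int_{\lambda_1}^{\lambda_2}\left(1-\frac{F_{line}}{F_{cont}}\right) d\lambda
  = (\lambda_2-\lambda_1) - \frac{1}{F_{cont}}\int_{\lambda_1}^{\lambda_2} F_{line}\,d\lambda,
\qquad
F_{cont} = \frac{\int_{\mu_1}^{\mu_2} F\,d\lambda}{\mu_2-\mu_1},
\]
where \((\mu_1,\mu_2)\) is the continuum band; both integrals are evaluated numerically with the trapezoidal rule.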
Once the signal and continuum regions were identified, models were set up and assessed by a cross-validation procedure. Ideas for the implementation were taken from http://moderntoolmaking.blogspot.com.es/2013/03/caretensemble-classification-example.html
# Load bp_clean (BT-Settl) & bq_clean (IPAC)
load("~/git/M_prep/M_prep_cleanip_BT-Settl.RData")
# Look up the features for T to extract from the training set
library(zoo)  # provides rollmean(), used by int_spec below
library(stringr)  # provides str_replace(), used on the column names
library(xtable)  # used to print the feature tables
# Each feature is encoded as 'signal_range_continuum_range', e.g. '45:52_55:62'
signal <- unlist(lapply(features$T, function(x) {
    a <- strsplit(x, "_")
    return(a[[1]][1])  # pixel range of the signal band
}))
noise <- unlist(lapply(features$T, function(x) {
    a <- strsplit(x, "_")
    return(a[[1]][2])  # pixel range of the continuum band
}))
sn <- cbind(signal, noise)
int_spec <- function(x, idx, norm = 0) {
    # Integrate the spectrum of x over the pixel range encoded in idx (e.g. '45:52');
    # column 1 of x$data[[1]] is wavelength, column 2 is flux
    y <- x$data[[1]][eval(parse(text = idx)), ]
    xz <- diff(as.numeric(y[, 1]), 1)
    yz <- as.numeric(y[, 2])
    if (norm > 0) {
        z <- sum(xz)  # band width only: lambda_2 - lambda_1
    } else {
        z <- sum(xz * rollmean(yz, 2))  # trapezoidal rule: integral of F dlambda
    }
    return(z)
}
#
feature_extr <- function(sn, bp) {
    sig <- sn[1]  # signal band
    noi <- sn[2]  # continuum band
    # Mean continuum flux: integral of F dlambda over the continuum band,
    # divided by the band width
    Fcont <- unlist(lapply(bp, int_spec, noi, 0))/unlist(lapply(bp, int_spec,
        noi, 1))
    # Feature: (lambda_2 - lambda_1) - (integral of F dlambda)/Fcont,
    # the discrete form of formula (1)
    fea <- unlist(lapply(bp, int_spec, sig, 1)) - unlist(lapply(bp, int_spec,
        sig, 0))/Fcont
    return(fea)
}
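As a quick sanity check of the trapezoidal rule used by int_spec, here is a toy constant-flux grid (illustrative values, not real spectra):
library(zoo)  # rollmean
lambda <- seq(8461, 8497, by = 3.6)  # toy wavelength grid with 3.6 Å sampling
flux <- rep(2, length(lambda))  # constant flux of 2
sum(diff(lambda) * rollmean(flux, 2))  # integral of F dlambda: 2 * 36 = 72
sum(diff(lambda))  # the norm > 0 branch: band width = 36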
xx <- apply(sn, 1, feature_extr, bp_clean)
colnames(xx) <- as.character(sn[, 1])
colnames(xx) <- str_replace(paste("X", colnames(xx), sep = ""), ":", ".")
xx <- cbind(xx, unlist(lapply(bp_clean, function(x) {
return(x$stellarp[1])
})))
colnames(xx)[ncol(xx)] <- "T"
# Report the wavelength ranges (Å) covered by the features
newfea <- cbind(t(apply(sn, 1, function(x) {
return(range(bp_clean[[1]]$data[[1]][eval(parse(text = x[1])), 1]))
})), t(apply(sn, 1, function(x) {
return(range(bp_clean[[1]]$data[[1]][eval(parse(text = x[2])), 1]))
})))
colnames(newfea) <- c("Signal_from", "Signal_To", "Cont_From", "Cont_To")
print(xtable(newfea), type = "html")
| | Signal_from | Signal_To | Cont_From | Cont_To |
|---|---|---|---|---|
| 1 | 8619.40 | 8644.60 | 8655.40 | 8680.60 |
| 2 | 8648.20 | 8680.60 | 8612.20 | 8644.60 |
| 3 | 8615.80 | 8641.00 | 8644.60 | 8677.00 |
| 4 | 8965.00 | 8990.20 | 8849.80 | 8889.40 |
| 5 | 8835.40 | 8885.80 | 8612.20 | 8662.60 |
| 6 | 8997.40 | 9029.80 | 8957.80 | 8983.00 |
| 7 | 8957.80 | 8983.00 | 8990.20 | 9029.80 |
#
Let’s build several models and compare their performance. We split the learning set randomly, train with ten-fold cross-validation, then test the models against the unseen data; we will even try ensemble techniques.
# Setup
gc(reset = TRUE)
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 771336 41.2 1265230 67.6 771336 41.2
## Vcells 512714595 3911.8 720315638 5495.6 512714595 3911.8
set.seed(42) #From random.org
# Libraries
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:galgo':
##
## best, confusionMatrix
library(devtools)
##
## Attaching package: 'devtools'
##
## The following objects are masked from 'package:R.oo':
##
## check, unload
# Only once: install_github('caretEnsemble', 'zachmayer')  # install Zach Mayer's
# caretEnsemble package; code adapted from the author's post
library(caretEnsemble)
# Data
library(mlbench)
xx <- as.data.frame(xx)
X <- xx[, -(ncol(xx))]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx[, ncol(xx)]
# Split train/test
train <- runif(nrow(X)) <= 0.66
# Setup CV Folds returnData=FALSE saves some space
folds = 10
repeats = 1
myControl <- trainControl(method = "cv", number = folds, repeats = repeats,
returnResamp = "none", returnData = FALSE, savePredictions = TRUE, verboseIter = FALSE,
allowParallel = TRUE, index = createMultiFolds(Y[train], k = folds, times = repeats))
# Train some models
model1 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl,
tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01),
verbose = FALSE)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: splines
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: parallel
## Loaded gbm 2.1
model2 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
## Loading required package: party
## Loading required package: grid
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
##
## The following objects are masked from 'package:R.oo':
##
## clone, dimension
##
## The following object is masked from 'package:plyr':
##
## empty
##
## Loading required package: mboost
## This is mboost 2.2-3. See 'package?mboost' and the NEWS file
## for a complete list of changes.
## Note: The default for the computation of the degrees of freedom has changed.
## For details see section 'Global Options' of '?bols'.
##
## Attaching package: 'mboost'
##
## The following object is masked from 'package:ggplot2':
##
## %+%
model3 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
## Loading required package: randomForest
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
model4 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl,
trace = FALSE)
## Loading required package: RSNNS
## Loading required package: Rcpp
##
## Attaching package: 'RSNNS'
##
## The following objects are masked from 'package:caret':
##
## confusionMatrix, train
##
## The following object is masked from 'package:galgo':
##
## confusionMatrix
model5 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
## Loading required package: earth
## Loading required package: plotmo
## Loading required package: plotrix
model7 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
## Loading required package: kernlab
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:modeltools':
##
## prior
##
## The following object is masked from 'package:galgo':
##
## scaling
model9 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.7-28. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'mgcv'
##
## The following object is masked from 'package:magic':
##
## magic
model10 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 1.9-5
model11 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl,
trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)
## Loading required package: nnet
# Make a list of all the models
all.models <- list(model1, model2, model3, model4, model5, model6, model7, model8,
model9, model10, model11)
names(all.models) <- sapply(all.models, function(x) x$method)
sort(sapply(all.models, function(x) min(x$results$RMSE)))
## ppr rf earth gbm blackboost
## 64.08 71.37 74.94 75.77 93.99
## gam svmRadial glm glmnet mlpWeightDecay
## 97.53 101.08 126.78 127.05 478.13
## nnet
## 2978.11
# Make a greedy ensemble - currently can only use RMSE
greedy <- caretEnsemble(all.models, iter = 1000L)
## Loading required package: pbapply
sort(greedy$weights, decreasing = TRUE)
## ppr earth rf svmRadial gam
## 0.496 0.192 0.139 0.091 0.082
greedy$error
## RMSE
## 62.37
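For intuition, the greedy step appears to be Caruana-style forward selection with replacement over the base models' cross-validated predictions. Below is a minimal sketch of that idea, not caretEnsemble's actual API; P (a matrix with one column of CV predictions per model) and y (the observed targets) are hypothetical inputs:
greedy_weights <- function(P, y, iter = 1000L) {
    rmse <- function(e) sqrt(mean(e^2))
    counts <- rep(0L, ncol(P))
    ens <- rep(0, nrow(P))
    for (k in seq_len(iter)) {
        # try adding each model to the running average and keep the best
        scores <- apply(P, 2, function(p) rmse((ens * (k - 1) + p)/k - y))
        j <- which.min(scores)
        counts[j] <- counts[j] + 1L
        ens <- (ens * (k - 1) + P[, j])/k
    }
    counts/iter  # selection frequencies act as the ensemble weights
}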
# Make a linear regression ensemble
linear <- caretStack(all.models, method = "glm", trControl = trainControl(method = "cv"))
summary(linear$ens_model$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -334.9 -21.7 -1.0 25.0 211.3
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.3778 53.1730 -0.06 0.94941
## gbm -0.0639 0.1405 -0.46 0.64942
## blackboost -0.2494 0.0770 -3.24 0.00138 **
## rf 0.3468 0.1205 2.88 0.00440 **
## mlpWeightDecay -0.0228 0.0142 -1.60 0.11155
## ppr 0.4719 0.1000 4.72 4.2e-06 ***
## earth 0.2418 0.0782 3.09 0.00223 **
## glm -0.5273 0.3735 -1.41 0.15950
## svmRadial 0.2003 0.0573 3.50 0.00057 ***
## gam 0.0776 0.0458 1.69 0.09151 .
## glmnet 0.5249 0.3753 1.40 0.16337
## nnet NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 3696)
##
## Null deviance: 42844307 on 230 degrees of freedom
## Residual deviance: 813028 on 220 degrees of freedom
## AIC: 2566
##
## Number of Fisher Scoring iterations: 2
linear$error
## parameter RMSE Rsquared RMSESD RsquaredSD
## 1 none 63.28 0.977 21.54 0.01676
# Predict for test set:
preds <- data.frame(sapply(all.models, predict, newdata = X[!train, ]))
preds$ENS_greedy <- predict(greedy, newdata = X[!train, ])
preds$ENS_linear <- predict(linear, newdata = X[!train, ])
sort(sqrt(colMeans((preds - Y[!train])^2)))
## ENS_greedy ENS_linear earth ppr gbm
## 77.37 77.98 82.85 90.26 95.91
## rf blackboost svmRadial gam glmnet
## 97.98 105.82 133.10 135.25 169.20
## glm mlpWeightDecay nnet
## 170.04 503.46 2877.07
plot(Y[!train], preds$ENS_greedy)
lines(c(0, 5000), c(0, 5000), col = 2)

[Figure (chunk lee02.1): observed vs. ENS_greedy predicted T on the test set, identity line in red]
Let’s have a look at the distribution of T in the learning dataset:
# T histogram (d_bt is assumed to hold the BT-Settl stellar parameters
# loaded with the workspace above)
hist(d_bt[, 2], breaks = 20, main = "T histogram for learning dataset")

[Figure (chunk lee02.2): T histogram for the learning dataset]
After building the models as well as their ensembles, it is time to predict for the IPAC dataset. We start by preparing the data.
# Extract the same features from bq_clean (IPAC)
yy <- apply(sn, 1, feature_extr, bq_clean)
yy <- as.data.frame(yy)
colnames(yy) <- as.character(sn[, 1])
colnames(yy) <- str_replace(paste("X", colnames(yy), sep = ""), ":", ".")
# Predict for new dataset:
predf <- data.frame(sapply(all.models, predict, newdata = yy))
predf$ENS_greedy <- predict(greedy, newdata = yy)
predf$ENS_linear <- predict(linear, newdata = yy)
To see how close or isolated the two sets (BT-SETTL and IPAC) are, a PCA analysis is performed:
zz <- rbind(X, yy)
pcaz <- prcomp(zz)
plot(pcaz$x[, 1], pcaz$x[, 2], pch = ".")
points(pcaz$x[(nrow(X) + 1):nrow(zz), 1], pcaz$x[(nrow(X) + 1):nrow(zz), 2],
pch = "x", col = 3)
points(pcaz$x[1:nrow(X), 1], pcaz$x[1:nrow(X), 2], pch = "+", col = 2)

[Figure (chunk lee04): first two principal components; BT-Settl marked '+' in red, IPAC marked 'x' in green]
# BT-SETTL, RED: 2 4 8 17 18 19 21 22 24 25 31 32 33 34 35 36 37 39 40 49 53
# 61
smpl_b = c(2, 4, 8, 17, 18, 19, 21, 22, 24, 25, 31, 32, 33, 34, 35, 36, 37,
39, 40, 49, 53, 61)
for (i in smpl_b) {
plot(bp_clean[[i]]$data[[1]], type = "l", main = bp_clean[[i]]$name)
}
[Figures: spectra of the selected BT-Settl stars (red '+' points in the PCA plot)]
# IPAC, GREEN: c(567, 716, 722, 791) - nrow(X) = 221 370 376 445
smpl_i = c(567, 716, 722, 791) - nrow(X)
for (i in smpl_i) {
plot(bq_clean[[i]]$data[[1]], type = "l", main = bq_clean[[i]]$name)
}
[Figures: spectra of the selected IPAC stars (green 'x' points in the PCA plot)]
The obtained predictions will now be compared against those produced by full-spectrum projection using ICA/JADE. That work was carried out by Dr Prendes-Gero at http://innova.uned.es, so we download her prediction datasets and compare them.
# Load the estimates (IPAC)
load("~/git/M_sel/IPAC_Belen_T.RData")
load("~/git/M_sel/belen_resul_T.RData")
rownames(dT) = rownames(dd)
# Names of the IPAC spectra
lnm <- unlist(lapply(bq_clean, function(x) {
    return(x$name)
}))
# Match Dr Prendes-Gero's row names to the IPAC spectrum names
idx = apply(data.frame(rownames(dT)), 1, function(x, y) {
    return(which(x == y))
}, lnm)
XY = cbind(dT[, ncol(dT)], predf$ENS_greedy[idx])
plot(XY)

[Figure (chunk lee05): ENS_greedy predictions vs. Dr Prendes-Gero's ICA/JADE estimates]
plot(XY, xlim = c(0, 5000), ylim = c(0, 5000))
lines(c(0, 5000), c(0, 5000), col = 2)

[Figure (chunk lee05): the same comparison on a common 0-5000 scale, identity line in red]
hist((XY[, 1] - XY[, 2])/sd(XY[, 1] - XY[, 2]), breaks = 20)

[Figure (chunk lee05): histogram of the standardized residuals]
mer1 = mean(XY[, 1] - XY[, 2])
sder1 = sd(XY[, 1] - XY[, 2])
The residuals then have a mean of 301.1111 and a standard deviation of 370.895.
Let’s compare the predictions of the current procedure with those provided by Dr Sarro Baro, derived from the likelihood given each star’s spectral subtype.
# Load the Teff estimates (IPAC)
load("~/git/M_sel/Teffs.RData")
#
buscar = function(x, y, l = 4) {
    # Look through the first l aliases in x for one that matches exactly one
    # entry of y; spaces become '_' and '+' becomes 'p' to mirror the file names
    for (i in x[1:l]) {
        stg = gsub("+", "p", gsub(" ", "_", i), fixed = TRUE)
        if (nchar(stg) > 0) {
            pos = grep(stg, y)
            if (length(pos) == 1) {
                return(pos)
            }
        }
    }
    return(-1)  # no unambiguous match found
}
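A toy illustration of how buscar resolves names (hypothetical alias list and catalogue):
aliases <- c("HD 1234", "GJ 581", "", "LP 944-20")  # hypothetical aliases for one star
catalogue <- c("spec_GJ_581_opt", "spec_LP_944-20_ir")  # hypothetical spectrum names
buscar(aliases, catalogue, l = 4)  # 1: 'GJ_581' matches exactly one catalogue entry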
#
idx = apply(ipac_names[, -5], 1, buscar, lnm, 5)
XY = cbind(teff$fit[idx > 0], predf$ENS_greedy[idx[idx > 0]])
plot(XY[!is.na(XY[, 1]), ])

[Figure (chunk lee06): ENS_greedy predictions vs. Dr Sarro's Teff estimates]
# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY[!is.na(XY[, 1]), ], xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro",
ylab = "Greedy predicted T", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)

[Figure (chunk lee06): the same comparison on a common 1000-5000 scale, identity line in red]
hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]),
1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)

[Figure (chunk lee06): histogram of the standardized residuals]
mer2 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder2 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
The residuals then have a mean of 14.1379 and a standard deviation of 1064.5538.
Let’s compare Dr Prendes-Gero’s predictions with Prof. Sarro’s.
# Look up Belen's names ...
idbl = apply(ipac_names[, -5], 1, buscar, rownames(dT), 5)
XY = cbind(teff$fit[idbl[idbl > 0]], dT[idbl > 0, ncol(dT)])
plot(XY)

[Figure (chunk lee06.01): Dr Prendes-Gero's estimates vs. Dr Sarro's]
# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY, xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro",
ylab = "T from Mrs Prendes-Gero", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)

[Figure (chunk lee06.01): the same comparison on a common 1000-5000 scale, identity line in red]
hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]),
1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)
## Error: character(0)
mer21 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder21 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
The residuals then have a mean of NA and a standard deviation of NA: the matched pairs contain missing values, which is also why the histogram call above fails.
As depicted in “The Infrared Telescope Facility (IRTF) spectral library: Spectral diagnostics for cool stars”, A&A 549, A129 (2013), the authors M. Cesetti, A. Pizzella, V. D. Ivanov, L. Morelli, E. M. Corsini and E. Dalla Bontà proposed a set of signal and continuum ranges inside the I band. It makes sense to test the same features on BT-SETTL instead of the IRTF library they used, especially where the data fall inside the wavelength range of IPAC. This means that the last two rows of the I band do not make sense in the IPAC case, so they are not presented.
Just as a reminder, those values (wavelengths in Å) are:
#
require(xtable)
print(xtable(bi[, 1:7]), "html")
| | Element | From | To | ContFrom_1 | ContTo_1 | ContFrom_2 | ContTo_2 |
|---|---|---|---|---|---|---|---|
| 1 | Pa1 | 8461 | 8474 | 8474 | 8484 | 8563 | 8577 |
| 2 | Ca1 | 8484 | 8513 | 8474 | 8484 | 8563 | 8577 |
| 3 | Ca2 | 8522 | 8562 | 8474 | 8484 | 8563 | 8577 |
| 4 | Pa2 | 8577 | 8619 | 8563 | 8577 | 8619 | 8642 |
| 5 | Ca3 | 8642 | 8682 | 8619 | 8642 | 8700 | 8725 |
| 6 | Pa3 | 8730 | 8772 | 8700 | 8725 | 8776 | 8792 |
| 7 | Mg | 8802 | 8811 | 8776 | 8792 | 8815 | 8850 |
| 8 | Pa4 | 8850 | 8890 | 8815 | 8850 | 8890 | 8900 |
| 9 | Pa5 | 9000 | 9030 | 8983 | 8998 | 9040 | 9050 |
#
# Convert wavelengths (Å) to pixel indices, assuming the uniform 3.6 Å
# sampling step of the reduced spectra, with the first pixel centred at 8461 Å;
# the first continuum band is in columns 4:5
lgth <- 3.6
org <- 8461 - lgth
signal2 <- paste(round((bi[, 2] - org)/lgth), round((bi[, 3] - org)/lgth), sep = ":")
noise2 <- paste(round((bi[, 4] - org)/lgth), round((bi[, 5] - org)/lgth), sep = ":")
sn2 <- cbind(signal2, noise2)
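A quick check of this mapping (assuming, as above, the 3.6 Å step starting at 8461 Å):
round((8461 - org)/lgth)  # 1: the first tabulated wavelength maps to pixel 1
round((9030 - org)/lgth)  # 159: consistent with the highest index in the GA features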
xx2 <- apply(sn2, 1, feature_extr, bp_clean)
colnames(xx2) <- as.character(sn2[, 1])
colnames(xx2) <- str_replace(paste("X", colnames(xx2), sep = ""), ":", ".")
xx2 <- cbind(xx2, unlist(lapply(bp_clean, function(x) {
return(x$stellarp[1])
})))
colnames(xx2)[ncol(xx2)] <- "T"
Now we perform the regression analysis:
# Data
library(mlbench)
xx2 <- as.data.frame(xx2)
X <- xx2[, -ncol(xx2)]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx2$T
# Train some models
model1.2 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl,
tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01),
verbose = FALSE)
model2.2 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
model3.2 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
model4.2 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl,
trace = FALSE)
model5.2 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6.2 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
model7.2 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8.2 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
model9.2 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
model10.2 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
model11.2 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl,
trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)
# Make a list of all the models
all.models.2 <- list(model1.2, model2.2, model3.2, model4.2, model5.2, model6.2,
model7.2, model8.2, model9.2, model10.2, model11.2)
names(all.models.2) <- sapply(all.models.2, function(x) x$method)
sort(sapply(all.models.2, function(x) min(x$results$RMSE)))
## earth gbm rf ppr gam
## 65.76 74.76 74.94 82.12 84.52
## blackboost glm glmnet svmRadial mlpWeightDecay
## 85.81 106.24 106.51 111.06 504.97
## nnet
## 2978.11
# Make a greedy ensemble - currently can only use RMSE
greedy2 <- caretEnsemble(all.models.2, iter = 1000L)
sort(greedy2$weights, decreasing = TRUE)
## earth gbm ppr svmRadial
## 0.553 0.186 0.159 0.102
greedy2$error
## RMSE
## NA
# Make a linear regression ensemble
linear2 <- caretStack(all.models.2, method = "glm", trControl = trainControl(method = "cv"))
summary(linear2$ens_model$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -343.1 -22.8 5.3 28.8 190.9
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -58.06409 39.08139 -1.49 0.13896
## gbm 0.51797 0.15070 3.44 0.00072 ***
## blackboost -0.08531 0.13012 -0.66 0.51286
## rf -0.28102 0.15430 -1.82 0.07009 .
## mlpWeightDecay 0.00364 0.00925 0.39 0.69466
## ppr 0.15557 0.06604 2.36 0.01948 *
## earth 0.55749 0.08559 6.51 6e-10 ***
## glm 0.95023 0.44507 2.14 0.03400 *
## svmRadial 0.13341 0.04867 2.74 0.00669 **
## gam 0.05041 0.06219 0.81 0.41855
## glmnet -0.98373 0.44701 -2.20 0.02893 *
## nnet NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 3528)
##
## Null deviance: 38063792 on 206 degrees of freedom
## Residual deviance: 691462 on 196 degrees of freedom
## (24 observations deleted due to missingness)
## AIC: 2291
##
## Number of Fisher Scoring iterations: 2
linear2$error
## parameter RMSE Rsquared RMSESD RsquaredSD
## 1 none 64.82 0.9756 20.95 0.02026
# Predict for test set:
preds2 <- data.frame(sapply(all.models.2, predict, newdata = X[!train, ]))
preds2$ENS_greedy <- predict(greedy2, newdata = X[!train, ])
preds2$ENS_linear <- predict(linear2, newdata = X[!train, ])
sort(sqrt(colMeans((preds2 - Y[!train])^2)))
## ENS_greedy gbm ENS_linear rf earth
## 84.66 84.84 87.60 89.68 99.70
## blackboost gam ppr svmRadial glmnet
## 100.03 108.91 110.42 122.93 126.91
## glm mlpWeightDecay nnet
## 128.37 1646.61 2877.07
Let’s compare the IPAC predictions against Dr Sarro’s estimates.
# Extract the same features from bq_clean (IPAC)
yy2 <- apply(sn2, 1, feature_extr, bq_clean)
yy2 <- as.data.frame(yy2)
colnames(yy2) <- as.character(sn2[, 1])
colnames(yy2) <- str_replace(paste("X", colnames(yy2), sep = ""), ":", ".")
# Predict for new dataset:
predf2 <- data.frame(sapply(all.models.2, predict, newdata = yy2))
predf2$ENS_greedy <- predict(greedy2, newdata = yy2)
predf2$ENS_linear <- predict(linear2, newdata = yy2)
#
idx = apply(ipac_names[, -5], 1, buscar, lnm, 5)
XY = cbind(teff$fit[idx > 0], predf2$ENS_greedy[idx[idx > 0]])
plot(XY[!is.na(XY[, 1]), ])

[Figure (chunk lee10.01): ENS_greedy predictions (Cesetti features) vs. Dr Sarro's Teff estimates]
# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY[!is.na(XY[, 1]), ], xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro",
ylab = "Greedy predicted T", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)

[Figure (chunk lee10.01): the same comparison on a common 1000-5000 scale, identity line in red]
hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]),
1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)

[Figure (chunk lee10.01): histogram of the standardized residuals]
mer2 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder2 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
The residuals then have a mean of 252.7606 and a standard deviation of 1097.9708.
Now we can test the paper’s second proposal for the continuum ranges. Just as a reminder, those values (wavelengths in Å) are:
#
require(xtable)
print(xtable(bi[, 1:7]), "html")
| | Element | From | To | ContFrom_1 | ContTo_1 | ContFrom_2 | ContTo_2 |
|---|---|---|---|---|---|---|---|
| 1 | Pa1 | 8461 | 8474 | 8474 | 8484 | 8563 | 8577 |
| 2 | Ca1 | 8484 | 8513 | 8474 | 8484 | 8563 | 8577 |
| 3 | Ca2 | 8522 | 8562 | 8474 | 8484 | 8563 | 8577 |
| 4 | Pa2 | 8577 | 8619 | 8563 | 8577 | 8619 | 8642 |
| 5 | Ca3 | 8642 | 8682 | 8619 | 8642 | 8700 | 8725 |
| 6 | Pa3 | 8730 | 8772 | 8700 | 8725 | 8776 | 8792 |
| 7 | Mg | 8802 | 8811 | 8776 | 8792 | 8815 | 8850 |
| 8 | Pa4 | 8850 | 8890 | 8815 | 8850 | 8890 | 8900 |
| 9 | Pa5 | 9000 | 9030 | 8983 | 8998 | 9040 | 9050 |
#
lgth <- 3.6
org <- 8461 - lgth
# Same wavelength-to-pixel conversion, now using the second continuum band
# (columns 6:7); row 9 (Pa5) is dropped, apparently because its second
# continuum band falls beyond the covered wavelength range
signal3 <- paste(round((bi[1:8, 2] - org)/lgth), round((bi[1:8, 3] - org)/lgth),
    sep = ":")
noise3 <- paste(round((bi[1:8, 6] - org)/lgth), round((bi[1:8, 7] - org)/lgth),
    sep = ":")
sn3 <- cbind(signal3, noise3)
xx3 <- apply(sn3, 1, feature_extr, bp_clean)
colnames(xx3) <- as.character(sn3[, 1])
colnames(xx3) <- str_replace(paste("X", colnames(xx3), sep = ""), ":", ".")
xx3 <- cbind(xx3, unlist(lapply(bp_clean, function(x) {
return(x$stellarp[1])
})))
colnames(xx3)[ncol(xx3)] <- "T"
Now we perform the regression analysis:
# Data
xx3 <- as.data.frame(xx3)
X <- xx3[, -ncol(xx3)]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx3$T
# Train some models
model1.3 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl,
tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01),
verbose = FALSE)
model2.3 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
model3.3 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
model4.3 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl,
trace = FALSE)
model5.3 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6.3 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
model7.3 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8.3 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
model9.3 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
model10.3 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
model11.3 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl,
trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)
# Make a list of all the models
all.models.3 <- list(model1.3, model2.3, model3.3, model4.3, model5.3, model6.3,
model7.3, model8.3, model9.3, model10.3, model11.3)
names(all.models.3) <- sapply(all.models.3, function(x) x$method)
sort(sapply(all.models.3, function(x) min(x$results$RMSE)))
## ppr rf gbm earth svmRadial
## 93.22 106.44 110.72 113.92 115.67
## gam blackboost glmnet glm mlpWeightDecay
## 120.10 126.68 135.13 135.13 609.73
## nnet
## 2978.11
# Make a greedy ensemble - currently can only use RMSE
greedy3 <- caretEnsemble(all.models.3, iter = 1000L)
sort(greedy3$weights, decreasing = TRUE)
## ppr svmRadial gam gbm earth rf nnet
## 0.512 0.160 0.100 0.097 0.088 0.042 0.001
greedy2$error  # (sic: greedy3$error was probably intended here)
## RMSE
## NA
# Make a linear regression ensemble
linear3 <- caretStack(all.models.3, method = "glm", trControl = trainControl(method = "cv"))
summary(linear3$ens_model$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -477.0 -33.4 2.0 43.8 429.5
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -59.7198 56.4001 -1.06 0.291
## gbm 0.1361 0.1473 0.92 0.356
## blackboost 0.0156 0.1183 0.13 0.895
## rf 0.1360 0.1585 0.86 0.392
## mlpWeightDecay -0.0197 0.0104 -1.89 0.060 .
## ppr 0.5936 0.0844 7.03 2.6e-11 ***
## earth 0.1030 0.0788 1.31 0.193
## glm 1.1047 0.6765 1.63 0.104
## svmRadial 0.1534 0.0701 2.19 0.030 *
## gam 0.1025 0.0641 1.60 0.112
## glmnet -1.3086 0.6824 -1.92 0.056 .
## nnet NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 7222)
##
## Null deviance: 42844307 on 230 degrees of freedom
## Residual deviance: 1588931 on 220 degrees of freedom
## AIC: 2721
##
## Number of Fisher Scoring iterations: 2
linear3$error
## parameter RMSE Rsquared RMSESD RsquaredSD
## 1 none 84.78 0.959 38.25 0.03691
# Predict for test set:
preds3 <- data.frame(sapply(all.models.3, predict, newdata = X[!train, ]))
preds3$ENS_greedy <- predict(greedy3, newdata = X[!train, ])
preds3$ENS_linear <- predict(linear3, newdata = X[!train, ])
sort(sqrt(colMeans((preds3 - Y[!train])^2)))
## ENS_greedy ENS_linear rf gbm ppr
## 100.7 101.2 116.3 116.7 117.7
## blackboost gam glmnet glm earth
## 133.8 140.5 156.8 156.8 163.1
## svmRadial mlpWeightDecay nnet
## 163.1 754.3 2877.1
Let’s compare the IPAC predictions against Dr Sarro’s estimates.
#
yy3 <- apply(sn3, 1, feature_extr, bq_clean)
yy3 <- as.data.frame(yy3)
colnames(yy3) <- as.character(sn3[, 1])
colnames(yy3) <- str_replace(paste("X", colnames(yy3), sep = ""), ":", ".")
# Predict for new dataset:
predf3 <- data.frame(sapply(all.models.3, predict, newdata = yy3))
predf3$ENS_greedy <- predict(greedy3, newdata = yy3)
predf3$ENS_linear <- predict(linear3, newdata = yy3)
#
idx = apply(ipac_names[, -5], 1, buscar, lnm, 5)
XY = cbind(teff$fit[idx > 0], predf3$ENS_greedy[idx[idx > 0]])
plot(XY[!is.na(XY[, 1]), ])

[Figure (chunk lee15): ENS_greedy predictions (second continuum proposal) vs. Dr Sarro's Teff estimates]
# plot(dd[,1],predf$ENS_greedy[idx], xlim=c(1500,3500),ylim=c(1500,3500))
plot(XY[!is.na(XY[, 1]), ], xlim = c(1000, 5000), ylim = c(1000, 5000), xlab = "T from Dr Sarro",
ylab = "Greedy predicted T", main = "IPAC Predicted Temperature Comparison ")
lines(c(0, 5000), c(0, 5000), col = 2)

[Figure (chunk lee15): the same comparison on a common 1000-5000 scale, identity line in red]
hist((XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])/sd(XY[!is.na(XY[, 1]),
1] - XY[!is.na(XY[, 1]), 2]), breaks = 20)

[Figure (chunk lee15): histogram of the standardized residuals]
mer3 = mean(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
sder3 = sd(XY[!is.na(XY[, 1]), 1] - XY[!is.na(XY[, 1]), 2])
The residuals then have a mean of 219.8236 and a standard deviation of 1120.2989.
save.image(file = "paso3_rf_v1.RData")