In previous steps, features were selected using a genetic algorithm (GA) to assess their suitability. Different GA strategies for fitness evaluation were tested, namely the nearcenter and randomforest methods. Feature selection was prepared according to the paso2 document.
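For reference, a search of this kind is configured in galgo roughly as follows. This is an illustrative sketch only: the input file name and parameter values are placeholders, not the exact settings behind nearcenter_ALL_m_5_900.RData.
library(galgo)
# Illustrative only: BigBang variable selection with a nearest-centroid
# fitness function and chromosomes of 5 features
bb.nc.3 <- configBB.VarSel(file = "spectral_features.txt",  # placeholder input
    classification.method = "nearcent",  # or "randomforest"
    chromosomeSize = 5, maxSolutions = 300, goalFitness = 0.9,
    saveVariable = "bb.nc.3", saveFrequency = 50,
    saveFile = "nearcenter_ALL_m_5_900.RData")
blast(bb.nc.3)  # run the evolutionary search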
suppressPackageStartupMessages(library(googleVis))
suppressPackageStartupMessages(library(xtable))
suppressPackageStartupMessages(library(Peaks))
suppressPackageStartupMessages(library(magic))
suppressPackageStartupMessages(library(segmented))
suppressPackageStartupMessages(library(fftw))
suppressPackageStartupMessages(library(FITSio))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(utils))
suppressPackageStartupMessages(library(e1071))
suppressPackageStartupMessages(library(quantmod))
suppressPackageStartupMessages(library(JADE))
suppressPackageStartupMessages(library(zoo))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(doMC))
suppressPackageStartupMessages(library(multicore))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(foreach))
suppressPackageStartupMessages(library(compiler))
suppressPackageStartupMessages(library(galgo))
## Feature extraction as defined by the GA
For feature selection, the BT-Settl 2012 library by France Allard was selected, and a wavelength reduction was performed to make it compatible with the data coming from IPAC.
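Purely as an illustration of that reduction step (the actual processing lives in the paso2 document; the function and argument names here are hypothetical), resampling a high-resolution synthetic spectrum onto a coarser target grid can be done by interpolation:
# Hypothetical helper: linearly resample (wl, flux) onto target_wl
resample_spectrum <- function(wl, flux, target_wl) {
    approx(x = wl, y = flux, xout = target_wl)$y
}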
# Processing of the temperature (Teff) GA results
setwd("~/git/M_sel")
load("nearcenter_ALL_m_5_900.RData")
plot(bb.nc.3, type = "fitness")
plot(bb.nc.3, type = "fitness", filter = "nosolutions")
plot(bb.nc.3, type = "confusion")
## Computing confusion from class prediction...
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70% 75% 80% 85% 90% 95% 100%
cpm <- classPredictionMatrix(bb.nc.3)
cm <- confusionMatrix(bb.nc.3, cpm)
sec <- sensitivityClass(bb.nc.3, cm)
spc <- specificityClass(bb.nc.3, cm)
plot(bb.nc.3, type = "confusion", set = c(1, 0), splits = 1)
## Computing confusion from class prediction...
## 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70% 75% 80% 85% 90% 95% 100%
plot(bb.nc.3, type = "confusion", set = c(1, 0), splits = 1, chromosomes = list(bb.nc.3$bestChromosomes[[1]]))
## Computing confusion from class prediction...
plot(bb.nc.3, type = "generankstability")
rchr <- lapply(bb.nc.3$bestChromosomes[1:300], robustGeneBackwardElimination,
bb.nc.3, result = "shortest")
fsm <- forwardSelectionModels(bb.nc.3, plot = FALSE)
## 4% 8% 12% 16% 20% 24% 29% 33% 37% 41% 45% 49% 53% 57% 61% 65% 69% 73% 78% 82% 86% 90% 94% 98%
fsm$models
## [[1]]
## [1] 27674 43391
##
## [[2]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531
##
## [[3]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330
##
## [[4]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890
##
## [[5]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890 15276
##
## [[6]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890 15276 16717 2806
##
## [[7]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890 15276 16717 2806 7547 9595 2239
##
## [[8]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890 15276 16717 2806 7547 9595 2239
## [45] 69715
##
## [[9]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890 15276 16717 2806 7547 9595 2239
## [45] 69715 10179
##
## [[10]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890 15276 16717 2806 7547 9595 2239
## [45] 69715 10179 17864
##
## [[11]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890 15276 16717 2806 7547 9595 2239
## [45] 69715 10179 17864 65212
##
## [[12]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890 15276 16717 2806 7547 9595 2239
## [45] 69715 10179 17864 65212 17153
##
## [[13]]
## [1] 27674 43391 14555 77120 7253 4736 17002 50196 2372 8133 57171
## [12] 9738 6949 889 895 3109 15275 77119 20750 35778 83871 49075
## [23] 70267 3402 50333 50334 63733 3403 57036 12072 35783 76709 69714
## [34] 6662 36065 43531 83330 890 15276 16717 2806 7547 9595 2239
## [45] 69715 10179 17864 65212 17153 70129
rownames(ALL)[fsm$models[[3]]]
## [1] "145:148_130:144" "19:24_28:37" "13:16_1:12"
## [4] "19:26_28:37" "156:159_148:155" "93:96_97:104"
## [7] "48:51_52:63" "19:24_25:36" "60:63_49:56"
## [10] "28:31_16:25" "20:25_28:42" "60:63_49:58"
## [13] "131:134_142:149" "27:30_19:26" "33:36_19:26"
## [16] "72:75_64:71" "28:31_16:27" "18:25_28:37"
## [19] "145:148_130:141" "27:32_19:26" "19:26_28:39"
## [22] "13:18_1:12" "19:26_31:38" "64:67_70:77"
## [25] "19:24_28:39" "20:25_28:39" "20:25_28:44"
## [28] "65:68_70:77" "19:24_25:39" "93:96_97:106"
## [31] "32:37_19:26" "30:37_19:28" "29:36_19:26"
## [34] "145:148_136:143" "19:24_25:32" "20:25_31:40"
## [37] "29:36_16:27"
#
features <- list()
features$M <- c("19:26_28:37", "48:51_52:63", "93:96_97:104", "145:148_130:141",
"156:159_148:155")
output <- 7
#
Now we load the original data sets bp_clean and bq_clean and extract the features defined in the feature list.
In order to generate the features we use the formula \( \int_{\lambda_{1}}^{\lambda_{2}} \left( 1-\frac{F_{line}}{F_{cont}} \right) d\lambda \), as depicted in formula (1) of the above reference.
The band wavelength ranges are defined in the previous table, but \( F_{cont} \) is not defined there. Therefore a genetic algorithm testing different candidate continuum regions was run, looking for the combination that best explains the spectral parameters through the feature functions above.
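Since \( F_{cont} \) is a constant over each band (the mean flux of the continuum region), the integral factorizes into two quantities that are cheap to compute numerically; this is exactly the decomposition implemented by int_spec and feature_extr below:
\[
\int_{\lambda_{1}}^{\lambda_{2}} \left( 1-\frac{F_{line}}{F_{cont}} \right) d\lambda = (\lambda_{2}-\lambda_{1}) - \frac{1}{F_{cont}} \int_{\lambda_{1}}^{\lambda_{2}} F_{line}\, d\lambda, \qquad F_{cont} = \frac{\int_{\lambda_{3}}^{\lambda_{4}} F\, d\lambda}{\lambda_{4}-\lambda_{3}},
\]
where \( [\lambda_{3},\lambda_{4}] \) is the continuum band of the feature.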
Once the signal points and continuum regions were identified, models were set up and assessed by a cross-validation procedure. Ideas for the implementation were taken from http://moderntoolmaking.blogspot.com.es/2013/03/caretensemble-classification-example.html
# Load bp_clean (BT-Settl) & bq_clean (IPAC)
load("~/git/M_prep/M_prep_cleanip_BT-Settl.RData")
rm(xtmp)
# Find the features to extract from the training set for T
# Split each feature code "signalRange_continuumRange" into its two halves
signal <- unlist(lapply(features$M, function(x) {
    a <- strsplit(x, "_")
    return(a[[1]][1])
}))
noise <- unlist(lapply(features$M, function(x) {
    a <- strsplit(x, "_")
    return(a[[1]][2])
}))
sn <- cbind(signal, noise)
# Integrate a spectrum over the row range given by idx (e.g. "19:26").
# With norm > 0 it returns the wavelength span instead of the flux integral.
int_spec <- function(x, idx, norm = 0) {
    y <- x$data[[1]][eval(parse(text = idx)), ]
    xz <- diff(as.numeric(y[, 1]), 1)  # wavelength steps
    yz <- as.numeric(y[, 2])  # flux values
    if (norm > 0) {
        z <- sum(xz)  # span lambda_2 - lambda_1
    } else {
        z <- sum(xz * rollmean(yz, 2))  # trapezoidal rule
    }
    return(z)
}
#
# Feature value per spectrum: span(signal) - integral(signal)/Fcont,
# with Fcont the mean flux over the continuum ("noise") band
feature_extr <- function(sn, bp) {
    sig <- sn[1]
    noi <- sn[2]
    Fcont <- unlist(lapply(bp, int_spec, noi, 0))/unlist(lapply(bp, int_spec,
        noi, 1))
    fea <- unlist(lapply(bp, int_spec, sig, 1)) - unlist(lapply(bp, int_spec,
        sig, 0))/Fcont
    return(fea)
}
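As a quick sanity check of this decomposition, a toy absorption line (illustrative data only, not part of the pipeline) shows that the direct trapezoidal evaluation of the integral and the decomposition used in feature_extr agree:
# Toy example: direct integral vs the span - integral/Fcont decomposition
lambda <- seq(8500, 8600, by = 0.2)  # toy wavelength grid
flux <- 1 - 0.5 * exp(-((lambda - 8542)/3)^2)  # toy absorption line
sig <- lambda >= 8525 & lambda <= 8551  # signal band
con <- lambda >= 8558 & lambda <= 8590  # continuum band
trapz <- function(x, y) sum(diff(x) * rollmean(y, 2))  # trapezoidal rule
Fcont <- trapz(lambda[con], flux[con])/diff(range(lambda[con]))
direct <- trapz(lambda[sig], 1 - flux[sig]/Fcont)
decomp <- diff(range(lambda[sig])) - trapz(lambda[sig], flux[sig])/Fcont
all.equal(direct, decomp)  # TRUE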
xx <- apply(sn, 1, feature_extr, bp_clean)
colnames(xx) <- as.character(sn[, 1])
colnames(xx) <- str_replace(paste("X", colnames(xx), sep = ""), ":", ".")
xx <- cbind(xx, unlist(lapply(bp_clean, function(x) {
return(x$stellarp[3])
})))
colnames(xx)[ncol(xx)] <- "G"
# Report the wavelength ranges covered by each feature
newfea <- cbind(t(apply(sn, 1, function(x) {
return(range(bp_clean[[1]]$data[[1]][eval(parse(text = x[1])), 1]))
})), t(apply(sn, 1, function(x) {
return(range(bp_clean[[1]]$data[[1]][eval(parse(text = x[2])), 1]))
})))
colnames(newfea) <- c("Signal_from", "Signal_To", "Cont_From", "Cont_To")
print(xtable(newfea), type = "html")
|   | Signal_from | Signal_To | Cont_From | Cont_To |
|---|---|---|---|---|
| 1 | 8525.80 | 8551.00 | 8558.20 | 8590.60 |
| 2 | 8630.20 | 8641.00 | 8644.60 | 8684.20 |
| 3 | 8792.20 | 8803.00 | 8806.60 | 8831.80 |
| 4 | 8979.40 | 8990.20 | 8925.40 | 8965.00 |
| 5 | 9019.00 | 9029.80 | 8990.20 | 9015.40 |
#
Let's build several models and compare their performance. We randomly split the data into a learning set and a held-out test set, train each model with ten-fold cross-validation, then evaluate the models on the unseen data and also try ensemble techniques.
# Setup
gc(reset = TRUE)
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2149623 114.9 4953636 264.6 2149623 114.9
## Vcells 566078000 4318.9 732821450 5591.0 566078000 4318.9
set.seed(42) #From random.org
# Libraries
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:multicore':
##
## parallel
##
## Loading required package: ggplot2
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:galgo':
##
## best, confusionMatrix
library(devtools)
##
## Attaching package: 'devtools'
##
## The following objects are masked from 'package:R.oo':
##
## check, unload
# Only once: install_github('caretEnsemble', 'zachmayer')  # install Zach
# Mayer's caretEnsemble package. Code adapted from the author's post.
library(caretEnsemble)
# Data
library(mlbench)
xx <- as.data.frame(xx)
X <- xx[, -ncol(xx)]
rownames(X) <- 1:nrow(X)
X <- data.frame(X)
Y <- xx[, ncol(xx)]
# Split train/test
train <- runif(nrow(X)) <= 0.66
# Setup CV Folds returnData=FALSE saves some space
folds = 10
repeats = 1
myControl <- trainControl(method = "cv", number = folds, repeats = repeats,
returnResamp = "none", returnData = FALSE, savePredictions = TRUE, verboseIter = FALSE,
allowParallel = TRUE, index = createMultiFolds(Y[train], k = folds, times = repeats))
# Train some models
model1 <- train(X[train, ], Y[train], method = "gbm", trControl = myControl,
tuneGrid = expand.grid(.n.trees = 500, .interaction.depth = 15, .shrinkage = 0.01),
verbose = FALSE)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: splines
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loaded gbm 2.1
model2 <- train(X[train, ], Y[train], method = "blackboost", trControl = myControl)
## Loading required package: party
## Loading required package: grid
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
##
## The following objects are masked from 'package:R.oo':
##
## clone, dimension
##
## The following object is masked from 'package:plyr':
##
## empty
##
## Loading required package: mboost
## This is mboost 2.2-3. See 'package?mboost' and the NEWS file
## for a complete list of changes.
## Note: The default for the computation of the degrees of freedom has changed.
## For details see section 'Global Options' of '?bols'.
##
## Attaching package: 'mboost'
##
## The following object is masked from 'package:ggplot2':
##
## %+%
model3 <- train(X[train, ], Y[train], method = "rf", trControl = myControl)
## Loading required package: randomForest
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
model4 <- train(X[train, ], Y[train], method = "mlpWeightDecay", trControl = myControl,
trace = FALSE)
## Loading required package: RSNNS
## Loading required package: Rcpp
##
## Attaching package: 'RSNNS'
##
## The following objects are masked from 'package:caret':
##
## confusionMatrix, train
##
## The following object is masked from 'package:galgo':
##
## confusionMatrix
model5 <- train(X[train, ], Y[train], method = "ppr", trControl = myControl)
model6 <- train(X[train, ], Y[train], method = "earth", trControl = myControl)
## Loading required package: earth
## Loading required package: plotmo
## Loading required package: plotrix
model7 <- train(X[train, ], Y[train], method = "glm", trControl = myControl)
model8 <- train(X[train, ], Y[train], method = "svmRadial", trControl = myControl)
## Loading required package: kernlab
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:modeltools':
##
## prior
##
## The following object is masked from 'package:galgo':
##
## scaling
model9 <- train(X[train, ], Y[train], method = "gam", trControl = myControl)
## Loading required package: mgcv
## Loading required package: nlme
## This is mgcv 1.7-28. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'mgcv'
##
## The following object is masked from 'package:magic':
##
## magic
model10 <- train(X[train, ], Y[train], method = "glmnet", trControl = myControl)
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 1.9-5
model11 <- train(X[train, ], Y[train], method = "nnet", trControl = myControl,
trace = FALSE, maxit = 10000, reltol = 1e-11, abstol = 1e-06)
## Loading required package: nnet
# Make a list of all the models
all.models <- list(model1, model2, model3, model4, model5, model6, model7, model8,
model9, model10)
names(all.models) <- sapply(all.models, function(x) x$method)
sort(sapply(all.models, function(x) min(x$results$RMSE)))
## svmRadial gbm rf gam mlpWeightDecay
## 0.2156 0.2517 0.2621 0.3175 0.3220
## ppr blackboost earth glm glmnet
## 0.3257 0.3303 0.3535 0.4921 0.5071
# Make a greedy ensemble - currently can only use RMSE
greedy <- caretEnsemble(all.models, iter = 1000L)
## Loading required package: pbapply
sort(greedy$weights, decreasing = TRUE)
## svmRadial rf gam
## 0.842 0.093 0.065
greedy$error
## RMSE
## 0.2392
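The greedy weights can be read as selection frequencies from a Caruana-style forward model selection: starting from an empty ensemble, the model that most lowers the RMSE of the running average of cross-validated predictions is added repeatedly, with replacement. The following is a minimal sketch of that idea (an illustration, not caretEnsemble's actual implementation):
# Illustrative sketch of greedy ensemble selection: repeatedly add, with
# replacement, the model that most lowers the RMSE of the running average
# prediction; the weights are the resulting selection frequencies
greedy_weights <- function(preds, y, iter = 1000L) {
    preds <- as.matrix(preds)
    counts <- setNames(numeric(ncol(preds)), colnames(preds))
    avg <- rep(0, length(y))
    for (i in seq_len(iter)) {
        rmse <- apply(preds, 2, function(p) sqrt(mean(((avg * (i - 1) + p)/i - y)^2)))
        best <- which.min(rmse)
        counts[best] <- counts[best] + 1
        avg <- (avg * (i - 1) + preds[, best])/i
    }
    counts/iter
}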
# Make a linear regression ensemble
linear <- caretStack(all.models, method = "glm", trControl = trainControl(method = "cv"))
summary(linear$ens_model$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6170 -0.1152 -0.0094 0.0865 2.2094
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.00284 0.02776 0.10 0.9185
## gbm -0.12431 0.16983 -0.73 0.4650
## blackboost -0.37121 0.12659 -2.93 0.0037 **
## rf 0.52004 0.17073 3.05 0.0026 **
## mlpWeightDecay -0.02802 0.09800 -0.29 0.7752
## ppr 0.06173 0.08864 0.70 0.4869
## earth -0.10917 0.09789 -1.12 0.2660
## glm 0.03883 0.16056 0.24 0.8091
## svmRadial 0.83710 0.11905 7.03 2.5e-11 ***
## gam 0.19951 0.12518 1.59 0.1124
## glmnet -0.05006 0.17832 -0.28 0.7792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.05578)
##
## Null deviance: 164.234 on 230 degrees of freedom
## Residual deviance: 12.272 on 220 degrees of freedom
## AIC: 1.542
##
## Number of Fisher Scoring iterations: 2
linear$error
## parameter RMSE Rsquared RMSESD RsquaredSD
## 1 none 0.2357 0.8997 0.1281 0.1317
# Predict for test set:
preds <- data.frame(sapply(all.models, predict, newdata = X[!train, ]))
preds$ENS_greedy <- predict(greedy, newdata = X[!train, ])
preds$ENS_linear <- predict(linear, newdata = X[!train, ])
sort(sqrt(colMeans((preds - Y[!train])^2)))
## svmRadial ENS_greedy ENS_linear gbm rf
## 0.1642 0.1647 0.1696 0.2218 0.2255
## gam earth mlpWeightDecay ppr blackboost
## 0.2604 0.2706 0.2863 0.3047 0.3142
## glm glmnet
## 0.5427 0.5556
plot(Y[!train], preds$ENS_greedy)
lines(c(-5, 10), c(-5, 10), col = 2)
With the individual models and their ensembles built, it is time to predict on the IPAC dataset. We start by preparing the data:
# Extract the features from bq_clean (IPAC)
yy <- apply(sn, 1, feature_extr, bq_clean)
yy <- as.data.frame(yy)
colnames(yy) <- as.character(sn[, 1])
colnames(yy) <- str_replace(paste("X", colnames(yy), sep = ""), ":", ".")
# Predict for new dataset:
predf <- data.frame(sapply(all.models, predict, newdata = yy))
predf$ENS_greedy <- predict(greedy, newdata = yy)
predf$ENS_linear <- predict(linear, newdata = yy)
To see how close or isolated the two sets (BT-Settl and IPAC) are from each other, a PCA is performed:
# PCA of the combined BT-Settl and IPAC feature sets
zz <- rbind(X, yy)
pcaz <- prcomp(zz)
plot(pcaz$x[, 1], pcaz$x[, 2], pch = ".")
points(pcaz$x[(nrow(X) + 1):nrow(zz), 1], pcaz$x[(nrow(X) + 1):nrow(zz), 2],
pch = "x", col = 3)
points(pcaz$x[1:nrow(X), 1], pcaz$x[1:nrow(X), 2], pch = "+", col = 2)
# Name of the IPAC spectrum at row 896 of the combined set zz
rownames(yy)[896 - nrow(X)]
## [1] "LP_799-3.7512.txt"
The obtained predictions will be compared against those obtained by projecting the full spectra by means of ICA/JADE. That work was carried out by Ms. Prendes Gero at http://innova.uned.es, so we download her prediction datasets and compare them.
# Load the ICA/JADE prediction datasets
load("~/git/M_sel/IPAC_Belen_M.RData")
load("~/git/M_sel/belen_resul_T.RData")
rownames(dM) = rownames(dd)
lnm <- unlist(lapply(bq_clean, function(x) {
return(x$name)
}))
idx = apply(data.frame(rownames(dM)), 1, function(x, y) {
return(which(x == y))
}, lnm)
XY = cbind(dM[, ncol(dM)], predf$ENS_greedy[idx])
plot(XY)
plot(XY, xlim = c(-5, 5), ylim = c(-5, 5))
lines(c(-5, 5), c(-5, 5), col = 2)
hist((XY[, 1] - XY[, 2])/sd(XY[, 1] - XY[, 2]), breaks = 20)
mer1 = mean(XY[, 1] - XY[, 2])
sder1 = sd(XY[, 1] - XY[, 2])
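Finally, the mean offset and scatter of the differences summarize the agreement between the ensemble predictions and the ICA/JADE results:
# Summary of the agreement between ENS_greedy and the ICA/JADE predictions
round(c(bias = mer1, sigma = sder1), 4)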