Preprocessing data for used-oil viscosities (40 °C and 100 °C) and Total Acid Number (TAN)

Introduction

The starting point is a set of .csv files provided by Beatriz Leal's laboratory, each containing the full FTIR spectrum from 4000 to 550 \(cm^{-1}\) for one sample. Their file names follow the pattern ACUS*.csv.
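
As an aside, a minimal sketch of how such a collection of files could be read in bulk (the csv2 directory name is an assumption taken from the loading code further below):

## Hypothetical bulk load of the per-sample spectrum files
files <- list.files("csv2", pattern = "^ACUS.*\\.csv$", full.names = TRUE)
spectra <- lapply(files, read.csv, header = TRUE, stringsAsFactors = FALSE)
names(spectra) <- basename(files)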

For completeness, the laboratory also provided the spectrum of the unused (virgin) oil. The name of the file is AcvirgAeroshell_sep12.csv.

There is also an additional file containing a table with the name of each spectrum, its TAN according to ASTM D-664, its viscosity at 100 °C according to ASTM D-445, and its viscosity at 40 °C. Its file name is Espectros-AN-Vis_31-oct2012.csv.

Data Preprocessing

The first step is to load the data properly into R. We start from the spectra dataset provided by JLV, as well as from the measured chemical properties, which were also provided by JLV.

The raw spectra are then stored in the object nacus and the chemical parameters in dmodel.

In the sections below we iterate the analysis, changing the percentage of randomly selected samples used for training from 66% up to 100%.

## Loading the information about the spectra
acus <- read.csv(file = "csv2/todo_JLV_svirgen.csv", header = TRUE, stringsAsFactors = FALSE)
## Loading the table of selected samples with their measured chemical properties
datos <- read.csv(file = "csv2/MuestrasSelecionadas.csv", header = TRUE, stringsAsFactors = FALSE, 
    sep = ";")
# 
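## Keep only the samples present in both tables, then merge spectra with chemical properties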
dmodel = datos[datos[, 2] %in% acus[, 1], ]
nacus = acus[acus[, 1] %in% dmodel[, 2], ]
data = merge(nacus, dmodel[, c(2:4)], by.x = "Name", by.y = "Espectro")
rm(acus)
rm(datos)
lper = c(0.7, 0.8, 0.9, 1)
# 
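## Random train (ilx) / validation (ival) split at the current training percentage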
iper = as.numeric("0.666")
ilx = sample(1:nrow(nacus), round(iper * nrow(nacus)), replace = FALSE)
ival = (1:nrow(nacus))[-ilx]

Analysis with learning percentage of 66.6%.

In the current iteration the percentage of samples used for training is 66.6%: 51 samples are used for learning and the remaining 25 for testing purposes.

Following the analysis requirements set by JLV, 66.6% of the samples were randomly selected for training and the remaining 33.4% were used for model assessment.

Strategy 1: Project Raw Data before regressing models

In this so-called strategy 1, an ICA projection of each spectrum is performed in order to obtain a reduced-dimension representation, from which the spectrum can be recovered with almost no loss.

As a matter of notation, the fitted ICA object stores:

* the S matrix, which holds the projected components of the dataset;
* the A matrix, which allows us to unproject the S data;
* the W matrix, which allows us to project any raw spectrum;
* the Xmu vector, which allows us to recover the real values from unprojected data.

Then (X - mu) * t(W) = S and S * t(A) + mu = X.
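
As a quick check of these identities, here is a minimal sketch on synthetic data (it assumes the JADE package's fitted object carries the S, A, W and Xmu components listed above):

## Synthetic data living in a 3-dimensional source space
set.seed(1)
S0 <- matrix(runif(200 * 3), 200, 3)  # latent sources
A0 <- matrix(rnorm(10 * 3), 10, 3)  # mixing matrix
Xs <- S0 %*% t(A0)  # 200 'spectra' with 10 variables each
ica <- JADE::JADE(Xs, 3)
## Projection: (X - mu) * t(W) reproduces ica$S (up to numerical error)
max(abs(sweep(Xs, 2, ica$Xmu) %*% t(ica$W) - ica$S))
## Reconstruction: S * t(A) + mu recovers X almost without loss
max(abs(sweep(ica$S %*% t(ica$A), 2, ica$Xmu, "+") - Xs))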

## ICA projection (JADE) of the training spectra: drop the Name column and the
## two chemical-property columns, and keep 5 independent components
require(JADE)
ICAnacus = JADE(data[ilx, -c(1, 3553, 3554)], 5)
X = ICAnacus$S
Y = data[ilx, 3554]
require(caret)
folds = 5
repeats = 4
myControl = trainControl(method = "repeatedcv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, allowParallel = TRUE, 
    verboseIter = FALSE)
# 
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    myControl$workers <- multicore:::detectCores()/2
    myControl$computeFunction <- mclapply
    myControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
# 
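## Candidate regression methods to be fitted and tuned via caret::train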
lmls = c("lm", "rf", "ppr", "earth", "glm", "svmRadial", "glmnet", "pls", "svmPoly", 
    "bagEarth", "enet")
objs = list()
for (i in 1:length(lmls)) {
    cat(paste("Step:", i, ". Method:", lmls[i], "\n", sep = ""))
    objs[[i]] = train(X, Y, method = lmls[i], trControl = myControl)
}
## Step:1. Method:lm
## Step:2. Method:rf
## Step:3. Method:ppr
## Step:4. Method:earth
## Warning: There were missing values in resampled performance measures.
## Step:5. Method:glm
## Step:6. Method:svmRadial
## Step:7. Method:glmnet
## Warning: There were missing values in resampled performance measures.
## Step:8. Method:pls
## Step:9. Method:svmPoly
## Step:10. Method:bagEarth
## Step:11. Method:enet
i = i + 1
objs[[i]] = train(X, Y, method = "nnet", trControl = myControl, trace = FALSE)
## Warning: There were missing values in resampled performance measures.
imodels = i
lmls[i] = "nnet"
names(objs) = sapply(objs, function(x) x$method)
res = sapply(objs, function(x) x$results$Rsquared)
# 
res
## $lm
## [1] 0.1386
## 
## $rf
## [1] 0.10819 0.11465 0.09962
## 
## $ppr
## [1] 0.12322 0.07957 0.07115
## 
## $earth
## [1] 0.2175 0.1928 0.1928
## 
## $glm
## [1] 0.1812
## 
## $svmRadial
## [1] 0.09528 0.08053 0.06761
## 
## $glmnet
##  [1] 0.12030 0.11744 0.10602 0.11102 0.12573 0.17079 0.17199 0.17793
##  [9] 0.08491 0.11361 0.17820     NaN     NaN     NaN     NaN     NaN
## [17]     NaN     NaN 0.10686     NaN     NaN     NaN     NaN     NaN
## [25]     NaN     NaN     NaN
## 
## $pls
## [1] 0.13351 0.10457 0.08885
## 
## $svmPoly
##  [1] 0.1801 0.1811 0.1831 0.1599 0.1644 0.1393 0.1363 0.1566 0.1631 0.1812
## [11] 0.1832 0.1612 0.1660 0.1439 0.1399 0.1383 0.1424 0.1497 0.1823 0.1717
## [21] 0.1603 0.1579 0.1431 0.1439 0.1397 0.1448 0.1372
## 
## $bagEarth
## [1] 0.08550 0.06536 0.08454
## 
## $enet
## [1] 0.2001 0.2001 0.2003 0.1369 0.1369 0.1375 0.1564 0.1564 0.1573
## 
## $nnet
## [1]    NaN 0.2239 0.2125    NaN 0.2138 0.2115    NaN 0.1889 0.2098

After producing the models it’s time to assess their performance:
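
The Err figure reported for each model is the mean relative absolute error over the validation samples, \( \mathrm{Err} = \frac{1}{n}\sum_{i=1}^{n} \left| \hat{y}_i - y_i \right| / y_i \), which is exactly what the loop below computes.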

## 
Xnew = (as.matrix(sweep(data[ival, -c(1, 3553, 3554)], 2, ICAnacus$Xmu)) %*% 
    t(ICAnacus$W))
colnames(Xnew) = paste("IC.", 1:ncol(Xnew), sep = "")
Ytg = data[ival, 3554]
Yt = Ytg
for (i in 1:imodels) {
    Yt = cbind(Yt, predict(objs[[i]], Xnew))
    err = sum(abs(Yt[, ncol(Yt)] - Ytg)/Ytg/length(Ytg))
    cat(paste("Model:", lmls[i], ". Err:", err, "\n", sep = ""))
}

Model:lm. Err:0.596280347923419
Model:rf. Err:0.650583134124796
Model:ppr. Err:0.536784739403707
Model:earth. Err:0.826101383520536
Model:glm. Err:0.596280347923419
Model:svmRadial. Err:0.702312658484694
Model:glmnet. Err:0.777226047539238
Model:pls. Err:0.596280347923419
Model:svmPoly. Err:0.600313935432836
Model:bagEarth. Err:0.684523775174282
Model:enet. Err:0.807195298214509
Model:nnet. Err:0.505037262671792

# 
for (i in 2:(length(lmls) + 1)) {
    plot(Yt[, 1], Yt[, i], xlim = c(0, 5), ylim = c(0, 5), main = paste("Model:", 
        lmls[i - 1], sep = ""), xlab = "Original Y", ylab = "Predicted Y")
}

(Figures: predicted vs. original Y for each of the 12 models.)

# 

Strategy 2: Feature Selection over raw data

In this so-called strategy 2, features are selected directly over the raw data. Feature selection is an important step in practical data mining, which is often characterised by data sets with far too many variables for model building.

## 
control <- rfeControl(functions = rfFuncs, method = "boot", verbose = FALSE, 
    returnResamp = "final", number = 50)
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    control$workers <- multicore:::detectCores()/2
    control$computeFunction <- mclapply
    control$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
sizes = 5:25
profile.1 <- rfe(data[ilx, -c(1, 3553, 3554)], Y, sizes = sizes, rfeControl = control)
# cat( 'rf : Profile 1 predictors:', predictors(profile.1), fill = TRUE )
vars = predictors(profile.1)
vars = vars[1:min(25, length(vars))]
vars

[1] "X1709" "X722"  "X469"  "X2923" "X723"  "X455"  "X1715" "X1710"
[9] "X1714" "X1708" "X1471" "X2924" "X720"  "X715"  "X1707" "X716"
[17] "X1712" "X721"  "X1711" "X2960" "X2873" "X2874" "X724"  "X1706"
[25] "X1366"

# 
X = data[ilx, vars]
obj2 = train(X, Y, method = "rf", trControl = myControl)
plot(Y, predict(obj2, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("RF with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj2, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: RF fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj2, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("RF with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: RF predictions on the test set.)

# 
obj3 = train(X, Y, method = "pls", trControl = myControl)
plot(Y, predict(obj3, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("PLS with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj3, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: PLS fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj3, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("PLS with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: PLS predictions on the test set.)

# 
obj4 = train(X, Y, method = "lm", trControl = myControl)
plot(Y, predict(obj4, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("LM with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj4, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: LM fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj4, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("LM with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: LM predictions on the test set.)

iper = as.numeric("0.7")
ilx = sample(1:nrow(nacus), round(iper * nrow(nacus)), replace = FALSE)
ival = (1:nrow(nacus))[-ilx]

Analysis with learning percentage of 70%.

In the current iteration the percentage of samples used for training is 70%: 53 samples are used for learning and the remaining 23 for testing purposes.

Following the analysis requirements set by JLV, 70% of the samples were randomly selected for training and the remaining 30% were used for model assessment.

Strategy 1: Project Raw Data before regressing models

In this so-called strategy 1, an ICA projection of each spectrum is performed in order to obtain a reduced-dimension representation, from which the spectrum can be recovered with almost no loss.

As a matter of notation, the fitted ICA object stores:

* the S matrix, which holds the projected components of the dataset;
* the A matrix, which allows us to unproject the S data;
* the W matrix, which allows us to project any raw spectrum;
* the Xmu vector, which allows us to recover the real values from unprojected data.

Then (X - mu) * t(W) = S and S * t(A) + mu = X.

## 
ICAnacus = JADE(data[ilx, -c(1, 3553, 3554)], 5)
X = ICAnacus$S
Y = data[ilx, 3554]
require(caret)
folds = 5
repeats = 4
myControl = trainControl(method = "repeatedcv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, allowParallel = TRUE, 
    verboseIter = FALSE)
# 
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    myControl$workers <- multicore:::detectCores()/2
    myControl$computeFunction <- mclapply
    myControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
# 
lmls = c("lm", "rf", "ppr", "earth", "glm", "svmRadial", "glmnet", "pls", "svmPoly", 
    "bagEarth", "enet")
objs = list()
for (i in 1:length(lmls)) {
    cat(paste("Step:", i, ". Method:", lmls[i], "\n", sep = ""))
    objs[[i]] = train(X, Y, method = lmls[i], trControl = myControl)
}
## Step:1. Method:lm
## Step:2. Method:rf
## Step:3. Method:ppr
## Step:4. Method:earth
## Warning: There were missing values in resampled performance measures.
## Step:5. Method:glm
## Step:6. Method:svmRadial
## Step:7. Method:glmnet
## Warning: There were missing values in resampled performance measures.
## Step:8. Method:pls
## Step:9. Method:svmPoly
## Step:10. Method:bagEarth
## Step:11. Method:enet
i = i + 1
objs[[i]] = train(X, Y, method = "nnet", trControl = myControl, trace = FALSE)
## Warning: There were missing values in resampled performance measures.
imodels = i
lmls[i] = "nnet"
names(objs) = sapply(objs, function(x) x$method)
res = sapply(objs, function(x) x$results$Rsquared)
# 
res
## $lm
## [1] 0.09589
## 
## $rf
## [1] 0.1322 0.1240 0.1121
## 
## $ppr
## [1] 0.08516 0.06541 0.06391
## 
## $earth
## [1] 0.1436 0.1138 0.1138
## 
## $glm
## [1] 0.1503
## 
## $svmRadial
## [1] 0.1301 0.1139 0.1078
## 
## $glmnet
##  [1] 0.11273 0.11186 0.10436 0.05338 0.05376 0.03053 0.05167 0.06714
##  [9]     NaN 0.11100 0.11573     NaN     NaN     NaN     NaN     NaN
## [17]     NaN     NaN 0.08918     NaN     NaN     NaN     NaN     NaN
## [25]     NaN     NaN     NaN
## 
## $pls
## [1] 0.1126 0.1109 0.1071
## 
## $svmPoly
##  [1] 0.09244 0.08975 0.09832 0.10525 0.11919 0.11892 0.11724 0.11794
##  [9] 0.11371 0.08975 0.09868 0.10562 0.11665 0.12156 0.12193 0.09989
## [17] 0.09360 0.10462 0.09045 0.10719 0.10825 0.12069 0.12463 0.12673
## [25] 0.08935 0.09493 0.10282
## 
## $bagEarth
## [1] 0.05079 0.05206 0.04865
## 
## $enet
## [1] 0.06542 0.06542 0.06540 0.13851 0.13851 0.13811 0.14788 0.14788 0.14731
## 
## $nnet
## [1]     NaN 0.12437 0.10463     NaN 0.12458 0.10256     NaN 0.14581 0.09972

After producing the models it’s time to assess their performance:

## 
Xnew = (as.matrix(sweep(data[ival, -c(1, 3553, 3554)], 2, ICAnacus$Xmu)) %*% 
    t(ICAnacus$W))
colnames(Xnew) = paste("IC.", 1:ncol(Xnew), sep = "")
Ytg = data[ival, 3554]
Yt = Ytg
for (i in 1:imodels) {
    Yt = cbind(Yt, predict(objs[[i]], Xnew))
    err = sum(abs(Yt[, ncol(Yt)] - Ytg)/Ytg/length(Ytg))
    cat(paste("Model:", lmls[i], ". Err:", err, "\n", sep = ""))
}

Model:lm. Err:0.39007318118104
Model:rf. Err:0.407990851220801
Model:ppr. Err:0.415257238779418
Model:earth. Err:0.51519211858418
Model:glm. Err:0.39007318118104
Model:svmRadial. Err:0.395582405582832
Model:glmnet. Err:0.477394211674009
Model:pls. Err:0.39007318118104
Model:svmPoly. Err:0.485454375764386
Model:bagEarth. Err:0.457883164408954
Model:enet. Err:0.506912171190098
Model:nnet. Err:0.401837799625377

# 
for (i in 2:(length(lmls) + 1)) {
    plot(Yt[, 1], Yt[, i], xlim = c(0, 5), ylim = c(0, 5), main = paste("Model:", 
        lmls[i - 1], sep = ""), xlab = "Original Y", ylab = "Predicted Y")
}

(Figures: predicted vs. original Y for each of the 12 models.)

# 

Strategy 2: Feature Selection over raw data

In this so-called strategy 2, features are selected directly over the raw data. Feature selection is an important step in practical data mining, which is often characterised by data sets with far too many variables for model building.

## 
control <- rfeControl(functions = rfFuncs, method = "boot", verbose = FALSE, 
    returnResamp = "final", number = 50)
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    control$workers <- multicore:::detectCores()/2
    control$computeFunction <- mclapply
    control$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
sizes = 5:25
profile.1 <- rfe(data[ilx, -c(1, 3553, 3554)], Y, sizes = sizes, rfeControl = control)
# cat( 'rf : Profile 1 predictors:', predictors(profile.1), fill = TRUE )
vars = predictors(profile.1)
vars = vars[1:min(25, length(vars))]
vars

[1] "X1456" "X1708" "X1710" "X1709" "X1707" "X1706" "X454"  "X1715"
[9] "X455"  "X817"  "X1714" "X1716" "X1417" "X2860" "X1712" "X458"
[17] "X1711" "X1705" "X2933" "X1457" "X1713" "X451"  "X456"  "X2868"
[25] "X2870"

# 
X = data[ilx, vars]
obj2 = train(X, Y, method = "rf", trControl = myControl)
plot(Y, predict(obj2, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("RF with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj2, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: RF fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj2, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("RF with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: RF predictions on the test set.)

# 
obj3 = train(X, Y, method = "pls", trControl = myControl)
plot(Y, predict(obj3, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("PLS with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj3, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: PLS fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj3, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("PLS with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: PLS predictions on the test set.)

# 
obj4 = train(X, Y, method = "lm", trControl = myControl)
plot(Y, predict(obj4, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("LM with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj4, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: LM fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj4, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("LM with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: LM predictions on the test set.)

iper = as.numeric("0.75")
ilx = sample(1:nrow(nacus), round(iper * nrow(nacus)), replace = FALSE)
ival = (1:nrow(nacus))[-ilx]

Analysis with learning percentage of 75%.

In the current iteration the percentage of samples used for training is 75%: 57 samples are used for learning and the remaining 19 for testing purposes.

Following the analysis requirements set by JLV, 75% of the samples were randomly selected for training and the remaining 25% were used for model assessment.

Strategy 1: Project Raw Data before regressing models

In this so-called strategy 1, an ICA projection of each spectrum is performed in order to obtain a reduced-dimension representation, from which the spectrum can be recovered with almost no loss.

As a matter of notation, the fitted ICA object stores:

* the S matrix, which holds the projected components of the dataset;
* the A matrix, which allows us to unproject the S data;
* the W matrix, which allows us to project any raw spectrum;
* the Xmu vector, which allows us to recover the real values from unprojected data.

Then (X - mu) * t(W) = S and S * t(A) + mu = X.

## 
ICAnacus = JADE(data[ilx, -c(1, 3553, 3554)], 5)
X = ICAnacus$S
Y = data[ilx, 3554]
require(caret)
folds = 5
repeats = 4
myControl = trainControl(method = "repeatedcv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, allowParallel = TRUE, 
    verboseIter = FALSE)
# 
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    myControl$workers <- multicore:::detectCores()/2
    myControl$computeFunction <- mclapply
    myControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
# 
lmls = c("lm", "rf", "ppr", "earth", "glm", "svmRadial", "glmnet", "pls", "svmPoly", 
    "bagEarth", "enet")
objs = list()
for (i in 1:length(lmls)) {
    cat(paste("Step:", i, ". Method:", lmls[i], "\n", sep = ""))
    objs[[i]] = train(X, Y, method = lmls[i], trControl = myControl)
}
## Step:1. Method:lm
## Step:2. Method:rf
## Step:3. Method:ppr
## Step:4. Method:earth
## Warning: There were missing values in resampled performance measures.
## Step:5. Method:glm
## Step:6. Method:svmRadial
## Step:7. Method:glmnet
## Warning: There were missing values in resampled performance measures.
## Step:8. Method:pls
## Step:9. Method:svmPoly
## Step:10. Method:bagEarth
## Step:11. Method:enet
i = i + 1
objs[[i]] = train(X, Y, method = "nnet", trControl = myControl, trace = FALSE)
## Warning: There were missing values in resampled performance measures.
imodels = i
lmls[i] = "nnet"
names(objs) = sapply(objs, function(x) x$method)
res = sapply(objs, function(x) x$results$Rsquared)
# 
res
## $lm
## [1] 0.15
## 
## $rf
## [1] 0.1406 0.1405 0.1293
## 
## $ppr
## [1] 0.0978 0.1528 0.1417
## 
## $earth
## [1] 0.06932 0.08282 0.08282
## 
## $glm
## [1] 0.1739
## 
## $svmRadial
## [1] 0.1979 0.2093 0.2007
## 
## $glmnet
##  [1] 0.16081 0.15138 0.14138 0.13144 0.12700 0.13532 0.12715 0.10057
##  [9] 0.06870 0.14898 0.07912     NaN     NaN     NaN     NaN     NaN
## [17]     NaN     NaN 0.13291     NaN     NaN     NaN     NaN     NaN
## [25]     NaN     NaN     NaN
## 
## $pls
## [1] 0.1927 0.1895 0.1851
## 
## $svmPoly
##  [1] 0.1411 0.1407 0.1407 0.1680 0.1808 0.1724 0.1678 0.1517 0.1555 0.1406
## [11] 0.1407 0.1546 0.1833 0.1758 0.1707 0.1330 0.1410 0.1463 0.1405 0.1461
## [21] 0.1770 0.1808 0.1796 0.1655 0.1453 0.1397 0.1288
## 
## $bagEarth
## [1] 0.09239 0.13706 0.15646
## 
## $enet
## [1] 0.1730 0.1730 0.1730 0.1344 0.1344 0.1347 0.1383 0.1383 0.1392
## 
## $nnet
## [1]    NaN 0.1443 0.1133    NaN 0.1393 0.1120    NaN 0.1304 0.1120

After producing the models it’s time to assess their performance:

## 
Xnew = (as.matrix(sweep(data[ival, -c(1, 3553, 3554)], 2, ICAnacus$Xmu)) %*% 
    t(ICAnacus$W))
colnames(Xnew) = paste("IC.", 1:ncol(Xnew), sep = "")
Ytg = data[ival, 3554]
Yt = Ytg
for (i in 1:imodels) {
    Yt = cbind(Yt, predict(objs[[i]], Xnew))
    err = sum(abs(Yt[, ncol(Yt)] - Ytg)/Ytg/length(Ytg))
    cat(paste("Model:", lmls[i], ". Err:", err, "\n", sep = ""))
}

Model:lm. Err:0.508554360865687
Model:rf. Err:0.523044773242005
Model:ppr. Err:0.667603854356286
Model:earth. Err:0.560454528942568
Model:glm. Err:0.508554360865687
Model:svmRadial. Err:0.479115375332186
Model:glmnet. Err:0.510399657149445
Model:pls. Err:0.508554360865687
Model:svmPoly. Err:0.465725851216821
Model:bagEarth. Err:0.520218440703968
Model:enet. Err:0.511770142459125
Model:nnet. Err:0.387874376630858

# 
for (i in 2:(length(lmls) + 1)) {
    plot(Yt[, 1], Yt[, i], xlim = c(0, 5), ylim = c(0, 5), main = paste("Model:", 
        lmls[i - 1], sep = ""), xlab = "Original Y", ylab = "Predicted Y")
}

(Figures: predicted vs. original Y for each of the 12 models.)

# 

Strategy 2: Feature Selection over raw data

In this so-called strategy 2, features are selected directly over the raw data. Feature selection is an important step in practical data mining, which is often characterised by data sets with far too many variables for model building.

## 
control <- rfeControl(functions = rfFuncs, method = "boot", verbose = FALSE, 
    returnResamp = "final", number = 50)
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    control$workers <- multicore:::detectCores()/2
    control$computeFunction <- mclapply
    control$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
sizes = 5:25
profile.1 <- rfe(data[ilx, -c(1, 3553, 3554)], Y, sizes = sizes, rfeControl = control)
# cat( 'rf : Profile 1 predictors:', predictors(profile.1), fill = TRUE )
vars = predictors(profile.1)
vars = vars[1:min(25, length(vars))]
vars

[1] "X1456" "X1710" "X1708" "X1709" "X1706" "X1464" "X1707" "X450"
[9] "X456"  "X1460" "X455"  "X1457" "X451"  "X1630" "X454"  "X453"
[17] "X1629" "X452"  "X2863" "X1712" "X1632" "X2916" "X1492" "X2864"
[25] "X1631"

# 
X = data[ilx, vars]
obj2 = train(X, Y, method = "rf", trControl = myControl)
plot(Y, predict(obj2, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("RF with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj2, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: RF fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj2, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("RF with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: RF predictions on the test set.)

# 
obj3 = train(X, Y, method = "pls", trControl = myControl)
plot(Y, predict(obj3, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("PLS with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj3, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: PLS fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj3, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("PLS with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: PLS predictions on the test set.)

# 
obj4 = train(X, Y, method = "lm", trControl = myControl)
plot(Y, predict(obj4, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("LM with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj4, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: LM fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj4, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("LM with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: LM predictions on the test set.)

iper = as.numeric("0.8")
ilx = sample(1:nrow(nacus), round(iper * nrow(nacus)), replace = FALSE)
ival = (1:nrow(nacus))[-ilx]

Analysis with learning percentage of 80%.

In the current iteration the percentage of samples used for training is 80%: 61 samples are used for learning and the remaining 15 for testing purposes.

Following the analysis requirements set by JLV, 80% of the samples were randomly selected for training and the remaining 20% were used for model assessment.

Strategy 1: Project Raw Data before regressing models

In this so-called strategy 1, an ICA projection of each spectrum is performed in order to obtain a reduced-dimension representation, from which the spectrum can be recovered with almost no loss.

As a matter of notation, the fitted ICA object stores:

* the S matrix, which holds the projected components of the dataset;
* the A matrix, which allows us to unproject the S data;
* the W matrix, which allows us to project any raw spectrum;
* the Xmu vector, which allows us to recover the real values from unprojected data.

Then (X - mu) * t(W) = S and S * t(A) + mu = X.

## 
ICAnacus = JADE(data[ilx, -c(1, 3553, 3554)], 5)
X = ICAnacus$S
Y = data[ilx, 3554]
require(caret)
folds = 5
repeats = 4
myControl = trainControl(method = "repeatedcv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, allowParallel = TRUE, 
    verboseIter = FALSE)
# 
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    myControl$workers <- multicore:::detectCores()/2
    myControl$computeFunction <- mclapply
    myControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
# 
lmls = c("lm", "rf", "ppr", "earth", "glm", "svmRadial", "glmnet", "pls", "svmPoly", 
    "bagEarth", "enet")
objs = list()
for (i in 1:length(lmls)) {
    cat(paste("Step:", i, ". Method:", lmls[i], "\n", sep = ""))
    objs[[i]] = train(X, Y, method = lmls[i], trControl = myControl)
}
## Step:1. Method:lm
## Step:2. Method:rf
## Step:3. Method:ppr
## Step:4. Method:earth
## Warning: There were missing values in resampled performance measures.
## Step:5. Method:glm
## Step:6. Method:svmRadial
## Step:7. Method:glmnet
## Warning: There were missing values in resampled performance measures.
## Step:8. Method:pls
## Step:9. Method:svmPoly
## Step:10. Method:bagEarth
## Step:11. Method:enet
i = i + 1
objs[[i]] = train(X, Y, method = "nnet", trControl = myControl, trace = FALSE)
## Warning: There were missing values in resampled performance measures.
imodels = i
lmls[i] = "nnet"
names(objs) = sapply(objs, function(x) x$method)
res = sapply(objs, function(x) x$results$Rsquared)
# 
res
## $lm
## [1] 0.2381
## 
## $rf
## [1] 0.2723 0.2653 0.2686
## 
## $ppr
## [1] 0.1468 0.2023 0.1812
## 
## $earth
## [1] 0.1312 0.2575 0.2575
## 
## $glm
## [1] 0.2096
## 
## $svmRadial
## [1] 0.2300 0.2232 0.2095
## 
## $glmnet
##  [1] 0.26004 0.25334 0.24779 0.23531 0.21448 0.19112 0.14326 0.09338
##  [9] 0.03046 0.25029 0.04943     NaN     NaN     NaN     NaN     NaN
## [17]     NaN     NaN 0.24178     NaN     NaN     NaN     NaN     NaN
## [25]     NaN     NaN     NaN
## 
## $pls
## [1] 0.2499 0.2469 0.2488
## 
## $svmPoly
##  [1] 0.1811 0.1811 0.1826 0.1882 0.2010 0.1922 0.1985 0.2143 0.2069 0.1807
## [11] 0.1825 0.1851 0.2002 0.1908 0.1995 0.2123 0.2252 0.2141 0.1803 0.1854
## [21] 0.1926 0.1936 0.1933 0.2012 0.1826 0.1788 0.1826
## 
## $bagEarth
## [1] 0.1628 0.1578 0.1422
## 
## $enet
## [1] 0.1644 0.1644 0.1644 0.2180 0.2180 0.2180 0.2355 0.2355 0.2374
## 
## $nnet
## [1]    NaN 0.2026 0.2577    NaN 0.2210 0.2572    NaN 0.2124 0.2541

After producing the models it’s time to assess their performance:

## 
Xnew = (as.matrix(sweep(data[ival, -c(1, 3553, 3554)], 2, ICAnacus$Xmu)) %*% 
    t(ICAnacus$W))
colnames(Xnew) = paste("IC.", 1:ncol(Xnew), sep = "")
Ytg = data[ival, 3554]
Yt = Ytg
for (i in 1:imodels) {
    Yt = cbind(Yt, predict(objs[[i]], Xnew))
    err = sum(abs(Yt[, ncol(Yt)] - Ytg)/Ytg/length(Ytg))
    cat(paste("Model:", lmls[i], ". Err:", err, "\n", sep = ""))
}

Model:lm. Err:0.483985185148561
Model:rf. Err:0.48238865317739
Model:ppr. Err:0.552523799894043
Model:earth. Err:0.46732593784035
Model:glm. Err:0.483985185148561
Model:svmRadial. Err:0.457084561999641
Model:glmnet. Err:0.405531981335386
Model:pls. Err:0.483985185148561
Model:svmPoly. Err:0.446569835888122
Model:bagEarth. Err:0.411578696154764
Model:enet. Err:0.423131379371532
Model:nnet. Err:0.447462735223527

# 
for (i in 2:(length(lmls) + 1)) {
    plot(Yt[, 1], Yt[, i], xlim = c(0, 5), ylim = c(0, 5), main = paste("Model:", 
        lmls[i - 1], sep = ""), xlab = "Original Y", ylab = "Predicted Y")
}

(Figures: predicted vs. original Y for each of the 12 models.)

# 

Strategy 2: Feature Selection over raw data

In this so-called strategy 2, features are selected directly over the raw data. Feature selection is an important step in practical data mining, which is often characterised by data sets with far too many variables for model building.

## 
control <- rfeControl(functions = rfFuncs, method = "boot", verbose = FALSE, 
    returnResamp = "final", number = 50)
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    control$workers <- multicore:::detectCores()/2
    control$computeFunction <- mclapply
    control$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
sizes = 5:25
profile.1 <- rfe(data[ilx, -c(1, 3553, 3554)], Y, sizes = sizes, rfeControl = control)
# cat( 'rf : Profile 1 predictors:', predictors(profile.1), fill = TRUE )
vars = predictors(profile.1)
vars = vars[1:min(25, length(vars))]
vars

[1] "X1711" "X1712" "X1709" "X1710" "X1375" "X1714"

# 
X = data[ilx, vars]
obj2 = train(X, Y, method = "rf", trControl = myControl)
plot(Y, predict(obj2, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("RF with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj2, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: RF fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj2, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("RF with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: RF predictions on the test set.)

# 
obj3 = train(X, Y, method = "pls", trControl = myControl)
plot(Y, predict(obj3, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("PLS with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj3, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: PLS fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj3, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("PLS with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: PLS predictions on the test set.)

# 
obj4 = train(X, Y, method = "lm", trControl = myControl)
plot(Y, predict(obj4, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("LM with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj4, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: LM fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj4, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("LM with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: LM predictions on the test set.)

iper = as.numeric("0.85")
ilx = sample(1:nrow(nacus), round(iper * nrow(nacus)), replace = FALSE)
ival = (1:nrow(nacus))[-ilx]

Analysis with learning percentage of 85%.

In the current iteration the percentage of samples used for training is 85%: 65 samples are used for learning and the remaining 11 for testing purposes.

Following the analysis requirements set by JLV, 85% of the samples were randomly selected for training and the remaining 15% were used for model assessment.

Strategy 1: Project Raw Data before regressing models

In this so-called strategy 1, an ICA projection of each spectrum is performed in order to obtain a reduced-dimension representation, from which the spectrum can be recovered with almost no loss.

As a matter of notation, the fitted ICA object stores:

* the S matrix, which holds the projected components of the dataset;
* the A matrix, which allows us to unproject the S data;
* the W matrix, which allows us to project any raw spectrum;
* the Xmu vector, which allows us to recover the real values from unprojected data.

Then (X - mu) * t(W) = S and S * t(A) + mu = X.

## 
ICAnacus = JADE(data[ilx, -c(1, 3553, 3554)], 5)
X = ICAnacus$S
Y = data[ilx, 3554]
require(caret)
folds = 5
repeats = 4
myControl = trainControl(method = "repeatedcv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, allowParallel = TRUE, 
    verboseIter = FALSE)
# 
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    myControl$workers <- multicore:::detectCores()/2
    myControl$computeFunction <- mclapply
    myControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
# 
lmls = c("lm", "rf", "ppr", "earth", "glm", "svmRadial", "glmnet", "pls", "svmPoly", 
    "bagEarth", "enet")
objs = list()
for (i in 1:length(lmls)) {
    cat(paste("Step:", i, ". Method:", lmls[i], "\n", sep = ""))
    objs[[i]] = train(X, Y, method = lmls[i], trControl = myControl)
}
## Step:1. Method:lm
## Step:2. Method:rf
## Step:3. Method:ppr
## Step:4. Method:earth
## Warning: There were missing values in resampled performance measures.
## Step:5. Method:glm
## Step:6. Method:svmRadial
## Step:7. Method:glmnet
## Warning: There were missing values in resampled performance measures.
## Step:8. Method:pls
## Step:9. Method:svmPoly
## Step:10. Method:bagEarth
## Step:11. Method:enet
i = i + 1
objs[[i]] = train(X, Y, method = "nnet", trControl = myControl, trace = FALSE)
## Warning: There were missing values in resampled performance measures.
imodels = i
lmls[i] = "nnet"
names(objs) = sapply(objs, function(x) x$method)
res = sapply(objs, function(x) x$results$Rsquared)
# 
res
## $lm
## [1] 0.2487
## 
## $rf
## [1] 0.2078 0.1957 0.1897
## 
## $ppr
## [1] 0.1919 0.1767 0.1433
## 
## $earth
## [1] 0.07507 0.23300 0.23300
## 
## $glm
## [1] 0.2351
## 
## $svmRadial
## [1] 0.1528 0.1432 0.1367
## 
## $glmnet
##  [1] 0.21433 0.21342 0.20485 0.19533 0.17954 0.15417 0.14628 0.14079
##  [9] 0.06598 0.21184 0.08079     NaN     NaN     NaN     NaN     NaN
## [17]     NaN     NaN 0.19844     NaN     NaN     NaN     NaN     NaN
## [25]     NaN     NaN     NaN
## 
## $pls
## [1] 0.2731 0.2658 0.2643
## 
## $svmPoly
##  [1] 0.1973 0.1955 0.1948 0.2018 0.2074 0.1997 0.2170 0.1784 0.1698 0.1956
## [11] 0.1950 0.1989 0.2084 0.2022 0.2074 0.1450 0.1254 0.1276 0.1949 0.1977
## [21] 0.2067 0.2042 0.1920 0.1961 0.1327 0.1310 0.1210
## 
## $bagEarth
## [1] 0.1319 0.2124 0.2118
## 
## $enet
## [1] 0.1289 0.1289 0.1290 0.2142 0.2142 0.2140 0.2181 0.2181 0.2177
## 
## $nnet
## [1]    NaN 0.1353 0.2333    NaN 0.1115 0.2340    NaN 0.1711 0.2345

After producing the models it’s time to assess their performance:

## 
Xnew = (as.matrix(sweep(data[ival, -c(1, 3553, 3554)], 2, ICAnacus$Xmu)) %*% 
    t(ICAnacus$W))
colnames(Xnew) = paste("IC.", 1:ncol(Xnew), sep = "")
Ytg = data[ival, 3554]
Yt = Ytg
for (i in 1:imodels) {
    Yt = cbind(Yt, predict(objs[[i]], Xnew))
    err = sum(abs(Yt[, ncol(Yt)] - Ytg)/Ytg/length(Ytg))
    cat(paste("Model:", lmls[i], ". Err:", err, "\n", sep = ""))
}

Model:lm. Err:0.582408463720319
Model:rf. Err:0.61407251962362
Model:ppr. Err:0.721363423698323
Model:earth. Err:0.641871143537574
Model:glm. Err:0.582408463720319
Model:svmRadial. Err:0.65129315143776
Model:glmnet. Err:0.633691677861698
Model:pls. Err:0.582408463720319
Model:svmPoly. Err:0.604709147345013
Model:bagEarth. Err:0.679390392940914
Model:enet. Err:0.622939358790662
Model:nnet. Err:0.426100845418746

# 
for (i in 2:(length(lmls) + 1)) {
    plot(Yt[, 1], Yt[, i], xlim = c(0, 5), ylim = c(0, 5), main = paste("Model:", 
        lmls[i - 1], sep = ""), xlab = "Original Y", ylab = "Predicted Y")
}

(Figures: predicted vs. original Y for each of the 12 models.)

# 

Strategy 2: Feature Selection over raw data

In this so-called strategy 2, features are selected directly over the raw data. Feature selection is an important step in practical data mining, which is often characterised by data sets with far too many variables for model building.

## 
control <- rfeControl(functions = rfFuncs, method = "boot", verbose = FALSE, 
    returnResamp = "final", number = 50)
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    control$workers <- multicore:::detectCores()/2
    control$computeFunction <- mclapply
    control$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
sizes = 5:25
profile.1 <- rfe(data[ilx, -c(1, 3553, 3554)], Y, sizes = sizes, rfeControl = control)
# cat( 'rf : Profile 1 predictors:', predictors(profile.1), fill = TRUE )
vars = predictors(profile.1)
vars = vars[1:min(25, length(vars))]
vars

[1] "X1384" "X2862" "X454"  "X722"  "X451"  "X455"  "X1464" "X2863"
[9] "X1366" "X2924" "X456"  "X1456" "X1383" "X1369" "X1715" "X2922"
[17] "X2987" "X2923" "X1379" "X2954" "X1722" "X469"  "X2933" "X1367"
[25] "X457"

# 
X = data[ilx, vars]
obj2 = train(X, Y, method = "rf", trControl = myControl)
plot(Y, predict(obj2, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("RF with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj2, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: RF fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj2, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("RF with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: RF predictions on the test set.)

# 
obj3 = train(X, Y, method = "pls", trControl = myControl)
plot(Y, predict(obj3, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("PLS with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj3, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: PLS fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj3, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("PLS with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: PLS predictions on the test set.)

# 
obj4 = train(X, Y, method = "lm", trControl = myControl)
plot(Y, predict(obj4, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("LM with ", length(vars), " raw variables.Training"))
abline(0, 1, col = 2)
Yp = predict(obj4, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
(Figure: LM fit on the training set.)

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj4, Xnew)
    plot(Yout, Ynew, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("LM with ", length(vars), " raw variables.Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
(Figure: LM predictions on the test set.)

iper = as.numeric("0.9")
ilx = sample(1:nrow(nacus), round(iper * nrow(nacus)), replace = FALSE)
ival = (1:nrow(nacus))[-ilx]

Analysis with learning percentage of 90%.

In the current iteration the percentage of samples used for training is 90%: 68 samples are used for learning and the remaining 8 for testing purposes.

Following the analysis requirements set by JLV, 90% of the samples were randomly selected for training and the remaining 10% were used for model assessment.

Strategy 1: Project Raw Data before regressing models

In this so-called strategy 1, an ICA projection of each spectrum is performed in order to obtain a reduced-dimension representation, from which the spectrum can be recovered with almost no loss.

As a matter of notation, the fitted ICA object stores:

* the S matrix, which holds the projected components of the dataset;
* the A matrix, which allows us to unproject the S data;
* the W matrix, which allows us to project any raw spectrum;
* the Xmu vector, which allows us to recover the real values from unprojected data.

Then (X - mu) * t(W) = S and S * t(A) + mu = X.

## 
ICAnacus = JADE(data[ilx, -c(1, 3553, 3554)], 5)
X = ICAnacus$S
Y = data[ilx, 3554]
require(caret)
folds = 5
repeats = 4
myControl = trainControl(method = "repeatedcv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, allowParallel = TRUE, 
    verboseIter = FALSE)
# 
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    myControl$workers <- multicore:::detectCores()/2
    myControl$computeFunction <- mclapply
    myControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
# 
lmls = c("lm", "rf", "ppr", "earth", "glm", "svmRadial", "glmnet", "pls", "svmPoly", 
    "bagEarth", "enet")
objs = list()
for (i in 1:length(lmls)) {
    cat(paste("Step:", i, ". Method:", lmls[i], "\n", sep = ""))
    objs[[i]] = train(X, Y, method = lmls[i], trControl = myControl)
}
## Step:1. Method:lm
## Step:2. Method:rf
## Step:3. Method:ppr
## Step:4. Method:earth
## Warning: There were missing values in resampled performance measures.
## Step:5. Method:glm
## Step:6. Method:svmRadial
## Step:7. Method:glmnet
## Warning: There were missing values in resampled performance measures.
## Step:8. Method:pls
## Step:9. Method:svmPoly
## Step:10. Method:bagEarth
## Step:11. Method:enet
i = i + 1
objs[[i]] = train(X, Y, method = "nnet", trControl = myControl, trace = FALSE)
## Warning: There were missing values in resampled performance measures.
imodels = i
lmls[i] = "nnet"
names(objs) = sapply(objs, function(x) x$method)
res = sapply(objs, function(x) x$results$Rsquared)
# 
res
## $lm
## [1] 0.2269
## 
## $rf
## [1] 0.1727 0.1666 0.1568
## 
## $ppr
## [1] 0.1197 0.1356 0.1092
## 
## $earth
## [1] 0.06802 0.11219 0.11219
## 
## $glm
## [1] 0.2039
## 
## $svmRadial
## [1] 0.2024 0.1880 0.1871
## 
## $glmnet
##  [1] 0.212424 0.209386 0.204041 0.180379 0.150278 0.113634 0.072641
##  [8] 0.018370 0.004132 0.206728 0.006583      NaN      NaN      NaN
## [15]      NaN      NaN      NaN      NaN 0.191869      NaN      NaN
## [22]      NaN      NaN      NaN      NaN      NaN      NaN
## 
## $pls
## [1] 0.2171 0.2143 0.2050
## 
## $svmPoly
##  [1] 0.2082 0.2067 0.1976 0.1940 0.2054 0.2092 0.2366 0.2391 0.2203 0.2056
## [11] 0.1966 0.1959 0.1993 0.2058 0.2269 0.1795 0.1686 0.1362 0.2001 0.1967
## [21] 0.1937 0.1994 0.2179 0.2286 0.1445 0.1228 0.1266
## 
## $bagEarth
## [1] 0.09631 0.10161 0.10476
## 
## $enet
## [1] 0.08561 0.08561 0.08547 0.19382 0.19382 0.19428 0.22054 0.22055 0.22099
## 
## $nnet
## [1]    NaN 0.1958 0.2054    NaN 0.1847 0.2050    NaN 0.1520 0.2048

After producing the models it’s time to assess their performance:

## 
Xnew = (as.matrix(sweep(data[ival, -c(1, 3553, 3554)], 2, ICAnacus$Xmu)) %*% 
    t(ICAnacus$W))
colnames(Xnew) = paste("IC.", 1:ncol(Xnew), sep = "")
Ytg = data[ival, 3554]
Yt = Ytg
for (i in 1:imodels) {
    Yt = cbind(Yt, predict(objs[[i]], Xnew))
    err = sum(abs(Yt[, ncol(Yt)] - Ytg)/Ytg/length(Ytg))
    cat(paste("Model:", lmls[i], ". Err:", err, "\n", sep = ""))
}

Model:lm. Err:1.79859009720703
Model:rf. Err:1.55873470859917
Model:ppr. Err:1.87677094877035
Model:earth. Err:1.93075189006891
Model:glm. Err:1.79859009720703
Model:svmRadial. Err:1.44924308921809
Model:glmnet. Err:1.77117005892429
Model:pls. Err:1.79859009720703
Model:svmPoly. Err:1.54213903071568
Model:bagEarth. Err:1.83651340559752
Model:enet. Err:1.79859009720703
Model:nnet. Err:0.799770541687389

# 
for (i in 2:(length(lmls) + 1)) {
    plot(Yt[, 1], Yt[, i], xlim = c(0, 5), ylim = c(0, 5), main = paste("Model:", 
        lmls[i - 1], sep = ""), xlab = "Original Y", ylab = "Predicted Y")
}

(Figures: predicted vs. original Y for each of the 12 models.)

# 

Strategy 2: Feature Selection over raw data

In this so-called strategy 2, features are selected directly over the raw data. Feature selection is an important step in practical data mining, which is often characterised by data sets with far too many variables for model building.

## 
control <- rfeControl(functions = rfFuncs, method = "boot", verbose = FALSE, 
    returnResamp = "final", number = 50)
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    control$workers <- multicore:::detectCores()/2
    control$computeFunction <- mclapply
    control$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
sizes = 5:25
profile.1 <- rfe(data[ilx, -c(1, 3553, 3554)], Y, sizes = sizes, rfeControl = control)
# cat( 'rf : Profile 1 predictors:', predictors(profile.1), fill = TRUE )
vars = predictors(profile.1)
vars = vars[1:min(25, length(vars))]
vars

 [1] "X1709" "X1715" "X1714" "X722"  "X1710" "X1711" "X1706" "X1712"
 [9] "X1708" "X1707" "X450"  "X1436" "X1716" "X717"  "X1713" "X716"
[17] "X1717" "X1721" "X452"  "X451"  "X1718" "X1379" "X1705" "X2922"
[25] "X2954"
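
If the column names encode wavenumbers, it is worth noting that most of the selected variables cluster between 1705 and 1721 \(cm^{-1}\), plausibly the carbonyl absorption region associated with oxidation products, with a few further bands near 2920 \(cm^{-1}\) (C-H stretching).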

# 
X = data[ilx, vars]
obj2 = train(X, Y, method = "rf", trControl = myControl)
plot(Y, predict(obj2, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("RF with ", length(vars), " raw variables. Training"))
abline(0, 1, col = 2)
Yp = predict(obj2, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
[Figure: RF with 25 raw variables, training: predicted vs. original Y]

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj2, Xnew)
    # x = measured (Ynew), y = predicted (Yout), so labels and overlays line up
    plot(Ynew, Yout, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("RF with ", length(vars), " raw variables. Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
[Figure: RF with 25 raw variables, testing: predicted vs. original Y]

# 
obj3 = train(X, Y, method = "pls", trControl = myControl)
plot(Y, predict(obj3, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("PLS with ", length(vars), " raw variables. Training"))
abline(0, 1, col = 2)
Yp = predict(obj3, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
[Figure: PLS with 25 raw variables, training: predicted vs. original Y]

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj3, Xnew)
    plot(Ynew, Yout, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("PLS with ", length(vars), " raw variables. Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
[Figure: PLS with 25 raw variables, testing: predicted vs. original Y]

# 
obj4 = train(X, Y, method = "lm", trControl = myControl)
plot(Y, predict(obj4, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("LM with ", length(vars), " raw variables. Training"))
abline(0, 1, col = 2)
Yp = predict(obj4, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
[Figure: LM with 25 raw variables, training: predicted vs. original Y]

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj4, Xnew)
    plot(Ynew, Yout, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("LM with ", length(vars), " raw variables. Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
[Figure: LM with 25 raw variables, testing: predicted vs. original Y]

iper = as.numeric("0.95")
ilx = sample(1:nrow(nacus), round(iper * nrow(nacus)), replace = FALSE)
ival = (1:nrow(nacus))[-ilx]

Analysis with learning percentage of 95%.

In the current iteration the percentage of samples used for training is 95%: 72 samples are used for learning and 4 for testing purposes.

According to the analysis requirements set by JLV, 95% of the samples were randomly selected for training and the remaining 5% were used for model assessment.

Strategy 1: Project Raw Data before regressing models

According to the so-called strategy 1, an ICA projection of the spectra is performed in order to obtain a reduced dimension from which the original data can be recovered with almost no loss.

As a matter of notation, the fitted ICA object stores:

* the S matrix, which holds the projected components of the dataset;
* the A matrix, which allows us to unproject the S data;
* the W matrix, which allows us to project any raw spectrum;
* the Xmu vector, which restores the original centring of unprojected data.

Then (X - mu) %*% t(W) = S and S %*% t(A) + mu = X; a quick numerical check of this round trip is sketched below.
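
To make the relations concrete, here is a minimal round-trip sketch, assuming the ICAnacus object fitted by JADE in the chunk below and the same column layout of data (it is not run as part of the analysis):

Xs = as.matrix(data[ilx, -c(1, 3553, 3554)])  # raw spectra
S = sweep(Xs, 2, ICAnacus$Xmu) %*% t(ICAnacus$W)  # project: (X - mu) %*% t(W)
Xrec = sweep(S %*% t(ICAnacus$A), 2, ICAnacus$Xmu, FUN = "+")  # unproject: S %*% t(A) + mu
max(abs(Xrec - Xs))  # small but non-zero: only 5 components are retained

Since only five independent components are kept, the reconstruction is approximate, and the residual above quantifies the "almost no loss" claim.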

## 
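# JADE ICA with 5 components; column 1 holds the sample name and columns 3553-3554 the measured chemical properties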
ICAnacus = JADE(data[ilx, -c(1, 3553, 3554)], 5)
X = ICAnacus$S
Y = data[ilx, 3554]
require(caret)
folds = 5
repeats = 4
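# 5-fold cross-validation repeated 4 times: 20 resamples per candidate model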
myControl = trainControl(method = "repeatedcv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, allowParallel = TRUE, 
    verboseIter = FALSE)
# 
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    myControl$workers <- multicore:::detectCores()/2
    myControl$computeFunction <- mclapply
    myControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
# 
lmls = c("lm", "rf", "ppr", "earth", "glm", "svmRadial", "glmnet", "pls", "svmPoly", 
    "bagEarth", "enet")
objs = list()
for (i in 1:length(lmls)) {
    cat(paste("Step:", i, ". Method:", lmls[i], "\n", sep = ""))
    objs[[i]] = train(X, Y, method = lmls[i], trControl = myControl)
}
## Step:1. Method:lm
## Step:2. Method:rf
## Step:3. Method:ppr
## Step:4. Method:earth
## Warning: There were missing values in resampled performance measures.
## Step:5. Method:glm
## Step:6. Method:svmRadial
## Step:7. Method:glmnet
## Warning: There were missing values in resampled performance measures.
## Step:8. Method:pls
## Step:9. Method:svmPoly
## Step:10. Method:bagEarth
## Step:11. Method:enet
i = i + 1
objs[[i]] = train(X, Y, method = "nnet", trControl = myControl, trace = FALSE)
## Warning: There were missing values in resampled performance measures.
imodels = i
lmls[i] = "nnet"
names(objs) = sapply(objs, function(x) x$method)
res = sapply(objs, function(x) x$results$Rsquared)
# 
res
## $lm
## [1] 0.1673
## 
## $rf
## [1] 0.1455 0.1367 0.1258
## 
## $ppr
## [1] 0.09005 0.10902 0.09352
## 
## $earth
## [1] 0.02042 0.07277 0.07277
## 
## $glm
## [1] 0.1742
## 
## $svmRadial
## [1] 0.1686 0.1580 0.1386
## 
## $glmnet
##  [1] 0.15942 0.15309 0.13419 0.09523 0.06913 0.04814 0.04230 0.02936
##  [9] 0.04337 0.14905 0.03093     NaN     NaN     NaN     NaN     NaN
## [17]     NaN     NaN 0.11505     NaN     NaN     NaN     NaN     NaN
## [25]     NaN     NaN     NaN
## 
## $pls
## [1] 0.1924 0.1818 0.1818
## 
## $svmPoly
##  [1] 0.1560 0.1539 0.1604 0.1547 0.1757 0.1712 0.1719 0.1688 0.1635 0.1533
## [11] 0.1599 0.1539 0.1735 0.1728 0.1744 0.1399 0.1342 0.1428 0.1576 0.1560
## [21] 0.1574 0.1699 0.1747 0.1650 0.1303 0.1433 0.1418
## 
## $bagEarth
## [1] 0.10250 0.11607 0.08273
## 
## $enet
## [1] 0.08121 0.08121 0.08114 0.16049 0.16049 0.16003 0.15458 0.15458 0.15410
## 
## $nnet
## [1]     NaN 0.11801 0.16056     NaN 0.08952 0.16202     NaN 0.12992 0.16330

After producing the models it’s time to assess their performance:

## 
Xnew = (as.matrix(sweep(data[ival, -c(1, 3553, 3554)], 2, ICAnacus$Xmu)) %*% 
    t(ICAnacus$W))
colnames(Xnew) = paste("IC.", 1:ncol(Xnew), sep = "")
Ytg = data[ival, 3554]
Yt = Ytg
for (i in 1:imodels) {
    Yt = cbind(Yt, predict(objs[[i]], Xnew))
    err = sum(abs(Yt[, ncol(Yt)] - Ytg)/Ytg/length(Ytg))
    cat(paste("Model:", lmls[i], ". Err:", err, "\n", sep = ""))
}

Model:lm. Err:0.291108077405638
Model:rf. Err:0.236280721818786
Model:ppr. Err:0.489272738337661
Model:earth. Err:0.292508626182346
Model:glm. Err:0.291108077405638
Model:svmRadial. Err:0.209816755349296
Model:glmnet. Err:0.247308255387814
Model:pls. Err:0.291108077405638
Model:svmPoly. Err:0.228690169321732
Model:bagEarth. Err:0.256014687811818
Model:enet. Err:0.244923502622106
Model:nnet. Err:0.287563815002839

# 
for (i in 2:(length(lmls) + 1)) {
    plot(Yt[, 1], Yt[, i], xlim = c(0, 5), ylim = c(0, 5), main = paste("Model:", 
        lmls[i - 1], sep = ""), xlab = "Original Y", ylab = "Predicted Y")
}

[Figure: predicted vs. original Y on the validation set, one panel per model]

# 

Strategy 2: Feature Selection over raw data

Feature selection is an important step for practical commercial data mining, which is often characterised by data sets with far too many variables for model building.

## 
control <- rfeControl(functions = rfFuncs, method = "boot", verbose = FALSE, 
    returnResamp = "final", number = 50)
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    control$workers <- multicore:::detectCores()/2
    control$computeFunction <- mclapply
    control$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
sizes = 5:25
profile.1 <- rfe(data[ilx, -c(1, 3553, 3554)], Y, sizes = sizes, rfeControl = control)
# cat( 'rf : Profile 1 predictors:', predictors(profile.1), fill = TRUE )
vars = predictors(profile.1)
vars = vars[1:min(25, length(vars))]
vars

 [1] "X1715" "X1712" "X1714" "X1711" "X452"  "X722"  "X1710" "X1709"
 [9] "X450"  "X2925" "X1708" "X1379" "X2933" "X469"  "X454"  "X1706"
[17] "X451"  "X1707" "X2918" "X1717" "X2919" "X2863" "X1716" "X1713"
[25] "X1718"

# 
X = data[ilx, vars]
obj2 = train(X, Y, method = "rf", trControl = myControl)
plot(Y, predict(obj2, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("RF with ", length(vars), " raw variables. Training"))
abline(0, 1, col = 2)
Yp = predict(obj2, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
[Figure: RF with 25 raw variables, training: predicted vs. original Y]

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj2, Xnew)
    plot(Ynew, Yout, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("RF with ", length(vars), " raw variables. Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
[Figure: RF with 25 raw variables, testing: predicted vs. original Y]

# 
obj3 = train(X, Y, method = "pls", trControl = myControl)
plot(Y, predict(obj3, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("PLS with ", length(vars), " raw variables. Training"))
abline(0, 1, col = 2)
Yp = predict(obj3, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
[Figure: PLS with 25 raw variables, training: predicted vs. original Y]

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj3, Xnew)
    plot(Ynew, Yout, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("PLS with ", length(vars), " raw variables. Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
[Figure: PLS with 25 raw variables, testing: predicted vs. original Y]

# 
obj4 = train(X, Y, method = "lm", trControl = myControl)
plot(Y, predict(obj4, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("LM with ", length(vars), " raw variables. Training"))
abline(0, 1, col = 2)
Yp = predict(obj4, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
[Figure: LM with 25 raw variables, training: predicted vs. original Y]

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj4, Xnew)
    plot(Ynew, Yout, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("LM with ", length(vars), " raw variables. Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
[Figure: LM with 25 raw variables, testing: predicted vs. original Y]

iper = as.numeric("1")
ilx = sample(1:nrow(nacus), round(iper * nrow(nacus)), replace = FALSE)
ival = (1:nrow(nacus))[-ilx]

Analysis with learning percentage of 100%.

In the current iteration the percentage of samples used for training is 100%: all 76 samples are used for learning and none are left for testing purposes.

According to the analysis requirements set by JLV, all of the samples were used for training in this iteration, so no samples remain for model assessment.

Strategy 1: Project Raw Data before regressing models

According to the so-called strategy 1, an ICA projection of the spectra is performed in order to obtain a reduced dimension from which the original data can be recovered with almost no loss.

As a matter of notation, the fitted ICA object stores:

* the S matrix, which holds the projected components of the dataset;
* the A matrix, which allows us to unproject the S data;
* the W matrix, which allows us to project any raw spectrum;
* the Xmu vector, which restores the original centring of unprojected data.

Then (X - mu) %*% t(W) = S and S %*% t(A) + mu = X.

## 
ICAnacus = JADE(data[ilx, -c(1, 3553, 3554)], 5)
X = ICAnacus$S
Y = data[ilx, 3554]
require(caret)
folds = 5
repeats = 4
myControl = trainControl(method = "repeatedcv", number = folds, repeats = repeats, 
    returnResamp = "none", returnData = FALSE, savePredictions = TRUE, allowParallel = TRUE, 
    verboseIter = FALSE)
# 
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    myControl$workers <- multicore:::detectCores()/2
    myControl$computeFunction <- mclapply
    myControl$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
# 
lmls = c("lm", "rf", "ppr", "earth", "glm", "svmRadial", "glmnet", "pls", "svmPoly", 
    "bagEarth", "enet")
objs = list()
for (i in 1:length(lmls)) {
    cat(paste("Step:", i, ". Method:", lmls[i], "\n", sep = ""))
    objs[[i]] = train(X, Y, method = lmls[i], trControl = myControl)
}
## Step:1. Method:lm
## Step:2. Method:rf
## Step:3. Method:ppr
## Step:4. Method:earth
## Warning: There were missing values in resampled performance measures.
## Step:5. Method:glm
## Step:6. Method:svmRadial
## Step:7. Method:glmnet
## Warning: There were missing values in resampled performance measures.
## Step:8. Method:pls
## Step:9. Method:svmPoly
## Step:10. Method:bagEarth
## Step:11. Method:enet
i = i + 1
objs[[i]] = train(X, Y, method = "nnet", trControl = myControl, trace = FALSE)
## Warning: There were missing values in resampled performance measures.
imodels = i
lmls[i] = "nnet"
names(objs) = sapply(objs, function(x) x$method)
res = sapply(objs, function(x) x$results$Rsquared)
# 
res
## $lm
## [1] 0.1699
## 
## $rf
## [1] 0.1907 0.1764 0.1678
## 
## $ppr
## [1] 0.12983 0.11267 0.07196
## 
## $earth
## [1] 0.02056 0.19550 0.19550
## 
## $glm
## [1] 0.1643
## 
## $svmRadial
## [1] 0.1723 0.1695 0.1633
## 
## $glmnet
##  [1] 0.17435 0.17717 0.16867 0.14207 0.11820 0.11408 0.08302 0.05466
##  [9] 0.01044 0.17565 0.03547     NaN     NaN     NaN     NaN     NaN
## [17]     NaN     NaN 0.15536     NaN     NaN     NaN     NaN     NaN
## [25]     NaN     NaN     NaN
## 
## $pls
## [1] 0.1582 0.1635 0.1642
## 
## $svmPoly
##  [1] 0.1712 0.1774 0.1916 0.1858 0.1854 0.1702 0.1687 0.1652 0.1790 0.1765
## [11] 0.1911 0.1877 0.1828 0.1691 0.1749 0.1309 0.1173 0.1252 0.1858 0.1888
## [21] 0.1853 0.1707 0.1738 0.1550 0.1140 0.1303 0.1178
## 
## $bagEarth
## [1] 0.11734 0.11687 0.09877
## 
## $enet
## [1] 0.1282 0.1282 0.1282 0.1374 0.1374 0.1372 0.1527 0.1527 0.1526
## 
## $nnet
## [1]     NaN 0.13026 0.14429     NaN 0.11940 0.14409     NaN 0.09565 0.14376

After producing the models it’s time to assess their performance:

## 
Xnew = (as.matrix(sweep(data[ival, -c(1, 3553, 3554)], 2, ICAnacus$Xmu)) %*% 
    t(ICAnacus$W))
## Error: non-conformable arguments
colnames(Xnew) = paste("IC.", 1:ncol(Xnew), sep = "")
Ytg = data[ival, 3554]
Yt = Ytg
for (i in 1:imodels) {
    Yt = cbind(Yt, predict(objs[[i]], Xnew))
    err = sum(abs(Yt[, ncol(Yt)] - Ytg)/Ytg/length(Ytg))
    cat(paste("Model:", lmls[i], ". Err:", err, "\n", sep = ""))
}

Model:lm. Err:0
Model:rf. Err:0

## Error: wrong number of columns in 'x'
# 
for (i in 2:(length(lmls) + 1)) {
    plot(Yt[, 1], Yt[, i], xlim = c(0, 5), ylim = c(0, 5), main = paste("Model:", 
        lmls[i - 1], sep = ""), xlab = "Original Y", ylab = "Predicted Y")
}
## Error: error in evaluating the argument 'y' in selecting a method for function 'plot': Error in Yt[, i] : subscript out of bounds
[Figure: predicted vs. original Y for the models plotted before the loop aborted]
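
These errors are expected rather than a modelling problem: with a learning percentage of 100% the validation index ival is empty, so data[ival, ] has zero rows, the projection step fails, and the prediction and plotting loops abort part-way through (the two Err:0 lines are vacuous sums over an empty validation set). A minimal guard, reusing the length(ival) > 0 convention already applied in strategy 2, would skip the assessment cleanly; a sketch:

# only assess when validation samples exist
if (length(ival) > 0) {
    Xnew = as.matrix(sweep(data[ival, -c(1, 3553, 3554)], 2, ICAnacus$Xmu)) %*% 
        t(ICAnacus$W)
    colnames(Xnew) = paste("IC.", 1:ncol(Xnew), sep = "")
    Ytg = data[ival, 3554]
    Yt = Ytg
    for (i in 1:imodels) {
        Yt = cbind(Yt, predict(objs[[i]], Xnew))
        err = sum(abs(Yt[, ncol(Yt)] - Ytg)/Ytg/length(Ytg))
        cat(paste("Model:", lmls[i], ". Err:", err, "\n", sep = ""))
    }
} else {
    cat("No validation samples: skipping assessment.\n")
}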

# 

Strategy 2: Feature Selection over raw data

Feature selection is an important step for practical commercial data mining, which is often characterised by data sets with far too many variables for model building.

## 
control <- rfeControl(functions = rfFuncs, method = "boot", verbose = FALSE, 
    returnResamp = "final", number = 50)
if (require("multicore", quietly = TRUE, warn.conflicts = FALSE)) {
    control$workers <- multicore:::detectCores()/2
    control$computeFunction <- mclapply
    control$computeArgs <- list(mc.preschedule = FALSE, mc.set.seed = FALSE)
}
sizes = 5:25
profile.1 <- rfe(data[ilx, -c(1, 3553, 3554)], Y, sizes = sizes, rfeControl = control)
# cat( 'rf : Profile 1 predictors:', predictors(profile.1), fill = TRUE )
vars = predictors(profile.1)
vars = vars[1:min(25, length(vars))]
vars

 [1] "X1715" "X1709" "X1712" "X1710" "X1714" "X1711" "X1713" "X1706"
 [9] "X456"  "X2933" "X1708" "X453"  "X450"  "X1707" "X2925" "X1716"
[17] "X1717" "X1718" "X469"  "X454"  "X2919" "X2924" "X2922" "X1705"
[25] "X2918"

# 
X = data[ilx, vars]
obj2 = train(X, Y, method = "rf", trControl = myControl)
plot(Y, predict(obj2, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("RF with ", length(vars), " raw variables. Training"))
abline(0, 1, col = 2)
Yp = predict(obj2, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
[Figure: RF with 25 raw variables, training: predicted vs. original Y]

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj2, Xnew)
    plot(Ynew, Yout, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("RF with ", length(vars), " raw variables. Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
# 
obj3 = train(X, Y, method = "pls", trControl = myControl)
plot(Y, predict(obj3, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("PLS with ", length(vars), " raw variables. Training"))
abline(0, 1, col = 2)
Yp = predict(obj3, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
[Figure: PLS with 25 raw variables, training: predicted vs. original Y]

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj3, Xnew)
    plot(Ynew, Yout, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("PLS with ", length(vars), " raw variables. Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}
# 
obj4 = train(X, Y, method = "lm", trControl = myControl)
plot(Y, predict(obj4, X), xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", 
    ylab = "Predicted Y", main = paste("LM with ", length(vars), " raw variables. Training"))
abline(0, 1, col = 2)
Yp = predict(obj4, X)
lmyp = lm(Yp ~ 1 + Y)
r2 = summary(lmyp)$adj.r.squared
abline(lmyp$coefficients, col = 3)
legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
    R^2 == .(round(r2, 3)))))), fill = c(2, 3))
[Figure: LM with 25 raw variables, training: predicted vs. original Y]

# 
if (length(ival) > 0) {
    Xnew = data[ival, vars]
    Ynew = data[ival, ncol(data)]
    Yout = predict(obj4, Xnew)
    plot(Ynew, Yout, xlim = c(0, 5), ylim = c(0, 5), xlab = "Original Y", ylab = "Predicted Y", 
        main = paste("LM with ", length(vars), " raw variables. Testing"))
    abline(0, 1, col = 2)
    # 
    lmyp = lm(Yout ~ 1 + Ynew)
    r2 = summary(lmyp)$adj.r.squared
    abline(lmyp$coefficients, col = 3)
    legend("topleft", legend = as.expression(c(bquote("Equality"), bquote(paste("LM regressed. ", 
        R^2 == .(round(r2, 3)))))), fill = c(2, 3))
    # 
}