Nonlinear Regression Models, Regression Trees and Rules-Based Models

KJ 7.2

Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations uses the following nonlinear equation to create data:

\[ y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2) \] where the x values are random variables uniformly distributed on [0, 1] (there are also 5 other non-informative variables created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the ' x ' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
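## (featurePlot() requires the caret package, which is loaded further below)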

## or other methods.

## This creates a list with a vector 'y' and a matrix
## of predictors 'x' . Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Tune several models on these data. For example:

library(caret)
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461
ctrl <- trainControl(method = 'cv', number = 10)

# neural network
nn_grid <- expand.grid(.decay = c(0, 0.01, 0.1),
                       .size = c(1:10),
                       .bag = F)

set.seed(101)

nn_tune <- train(trainingData$x, trainingData$y,
                 method = 'avNNet',
                 tuneGrid = nn_grid,
                 trControl = ctrl,
                 preProc = c('center', 'scale'),
                 linout = T,
                 trace = F,
                 maxit = 500
                 )

plot(nn_tune)

nnPred <- predict(nn_tune$finalModel, newdata = testData$x)

postResample(pred = nnPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 5.6597855 0.6082073 4.6787589
# MARS
library(earth)

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)

marsTuned <- train(trainingData$x, trainingData$y,
                   method = 'earth',
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = 'cv')
                   )

plot(marsTuned)

marsPred <- predict(marsTuned$finalModel, newdata = testData$x)

postResample(pred = marsPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.1589948 0.9460418 0.9250230
# Support Vector Machines
library(kernlab)

svmTuned <- train(trainingData$x, trainingData$y,
                  method = 'svmRadial',
                  preProc = c('center', 'scale'),
                  tuneLength = 14,
                  trControl = trainControl(method = 'cv')
                  )

plot(svmTuned)

svmPred <- predict(svmTuned$finalModel, newdata = testData$x)

postResample(pred = svmPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 7.0946250 0.6531504 6.1004886
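
Note that the neural network and SVM predictions above were generated from the finalModel objects, which bypasses the centering and scaling caret applied during training; this likely explains their unusually poor test-set RMSE. (The MARS model is unaffected because no pre-processing was specified for it.) Predicting from the train objects applies the pre-processing automatically, for example:

nnPred2 <- predict(nn_tune, newdata = testData$x)
postResample(pred = nnPred2, obs = testData$y)

svmPred2 <- predict(svmTuned, newdata = testData$x)
postResample(pred = svmPred2, obs = testData$y)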

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?

  • MARS had the lowest test-set errors of the four models. The variable importance output below also shows that MARS used only the informative predictors X1–X5 (X6–X10 do not appear in the list), although X3 contributes essentially nothing.
varImp(marsTuned)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.24
## X2   48.73
## X5   15.52
## X3    0.00

KJ 7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

# Exercise 6.3

# load data
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
df <- ChemicalManufacturingProcess

# impute
trans <- preProcess(ChemicalManufacturingProcess, method = 'knnImpute')
df <- predict(trans, ChemicalManufacturingProcess)

# train test split
train_index <- createDataPartition(df$Yield, p = 0.8, list = F, times = 1)
train <- df[train_index,]
test <- df[-train_index,]

# train using cross validation
ctrl <- trainControl(method = 'cv', number = 10)

# tuning settings
ridgeGrid <- data.frame(.lambda = seq(0, .1, length = 15))

set.seed(101)

# fit model
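# (note: this ridge model is fit on the full data set df rather than the train partition,
#  so the test-set metrics below are somewhat optimistic; using train[, 2:ncol(train)] and
#  train$Yield would respect the split used for the other models)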
ridgeRegFit <- train(df[,2:ncol(df)],
                     df$Yield,
                     method = 'ridge',
                     tuneGrid = ridgeGrid,
                     trControl = ctrl,
                     preProc = c('center','scale'))

# predict using model
test$pred <- predict(ridgeRegFit, test)

# observed vs. predicted values for the test set
library(tidyverse)  # dplyr/tidyr/ggplot2 are used below
results <- test %>%
  select(obs = Yield, pred = pred)

# defaultSummary(results)

# variable importance, top 10
variable_importance <- varImp(ridgeRegFit, scale = F)

# plot(variable_importance, top = 10)

postResample(pred = test$pred, obs = test$Yield)
##      RMSE  Rsquared       MAE 
## 0.4246691 0.7762743 0.3378656
# KNN
knn75 <- train(x = train[,2:ncol(train)],
               y = train$Yield,
               method = "knn",
               preProc = c("center", "scale"),
               tuneLength = 10)

test$knn_pred <- predict(knn75, test)
postResample(pred = test$knn_pred, obs = test$Yield)
##      RMSE  Rsquared       MAE 
## 0.7094513 0.3580255 0.5336584
# neural network
set.seed(101)

nn75 <- train(x = train[,2:ncol(train)],
              y = train$Yield,
              method = 'avNNet',
              tuneGrid = nn_grid,
              trControl = ctrl,
              preProc = c('center', 'scale'),
              linout = T,
              trace = F,
              maxit = 500
              )

test$nn_pred <- predict(nn75, test)
postResample(pred = test$nn_pred, obs = test$Yield)
##      RMSE  Rsquared       MAE 
## 0.5683714 0.5923704 0.4253315
# MARS
mars75 <- train(x = train[,2:ncol(train)],
                y = train$Yield,
                method = 'earth',
                tuneGrid = marsGrid,
                trControl = trainControl(method = 'cv')
                )

test$mars_pred <- predict(mars75, test)
postResample(pred = test$mars_pred, obs = test$Yield)
##     RMSE Rsquared      MAE 
## 0.708873 0.401047 0.584355
# Support Vector Machines
svm75 <- train(Yield ~ ., 
               data = train,
               method = 'svmRadial',
               preProc = c('center', 'scale'),
               tuneLength = 14,
               trControl = trainControl(method = 'cv')
               )

test$svm_pred <- predict(svm75, test)
postResample(pred = test$svm_pred, obs = test$Yield)
##      RMSE  Rsquared       MAE 
## 0.5989630 0.5474256 0.4567931
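
The question also asks about resampling performance; caret's resamples() can summarize the cross-validation results side by side. A minimal sketch, limited to the three models fit with 10-fold CV (knn75 used the default bootstrap resampling); note the three train calls did not share a common seed, so the fold assignments differ and a shared trainControl would make the comparison cleaner:

cv_results <- resamples(list(nnet = nn75, mars = mars75, svm = svm75))
summary(cv_results)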

a

Which nonlinear regression model gives the optimal resampling and test set performance?

  • Based on the test-set metrics above, the neural network had the lowest RMSE (0.568) and MAE (0.425) and the highest \(R^2\) (0.592), with the SVM close behind (RMSE 0.599, \(R^2\) 0.547). The two models perform comparably; the SVM model is carried forward for the variable importance analysis below.

b

Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?

varImp(svm75)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess13  100.00
## ManufacturingProcess32   95.82
## ManufacturingProcess17   87.98
## BiologicalMaterial06     80.97
## BiologicalMaterial03     77.27
## ManufacturingProcess09   74.18
## BiologicalMaterial02     71.97
## BiologicalMaterial12     70.23
## ManufacturingProcess06   69.04
## ManufacturingProcess31   66.89
## ManufacturingProcess36   66.26
## BiologicalMaterial11     62.31
## ManufacturingProcess11   50.62
## ManufacturingProcess29   49.47
## BiologicalMaterial04     47.16
## ManufacturingProcess33   46.51
## BiologicalMaterial08     46.28
## ManufacturingProcess02   45.78
## BiologicalMaterial01     44.45
## ManufacturingProcess30   40.70
varImp(ridgeRegFit)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   90.02
## BiologicalMaterial06     84.56
## ManufacturingProcess17   74.88
## ManufacturingProcess36   74.68
## BiologicalMaterial03     73.53
## ManufacturingProcess09   70.37
## BiologicalMaterial12     67.97
## BiologicalMaterial02     65.32
## ManufacturingProcess31   63.93
## ManufacturingProcess06   58.00
## ManufacturingProcess33   48.99
## BiologicalMaterial11     48.11
## ManufacturingProcess11   47.77
## BiologicalMaterial04     47.12
## BiologicalMaterial08     41.87
## BiologicalMaterial01     39.13
## ManufacturingProcess30   34.83
## ManufacturingProcess12   33.32
## BiologicalMaterial09     32.41
  • ManufacturingProcess32 was the most important variable for the ridge model and the second most important for the SVM model (behind ManufacturingProcess13). Both models show a mix of manufacturing process and biological material variables near the top, with neither type clearly dominating, and 9 of the 10 variables in their respective top-10 lists match.

c

Explore the relationships between the top predictors and their response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?

# data frame of top 10 variables
vi <- varImp(svm75)$importance %>%
  data.frame() %>%
  top_n(10) %>%
  arrange(desc(Overall))

vi$variable = row.names(vi)
row.names(vi) <- seq(1,10,1)
vi$rank <- row.names(vi)
vi$title = paste0(vi$rank, ' - ', vi$variable)

df %>%
  gather(variable, value, -Yield) %>%
  filter(variable %in% vi$variable) %>%
  ggplot(aes(x = value, y = Yield)) +
  geom_point(alpha = .2) + 
  facet_wrap(~variable, scales = 'free')

  • It does seem like there are observable relationships between each of the top 10 variables and Yield.

KJ 8.1

Recreate the simulated data from Exercise 7.2

library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

a

Fit a random forest model to all of the predictors, then estimate the variable importance scores:

library(randomForest)
library(caret)
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)

rfImp1
##          Overall
## V1   8.732235404
## V2   6.415369387
## V3   0.763591825
## V4   7.615118809
## V5   2.023524577
## V6   0.165111172
## V7  -0.005961659
## V8  -0.166362581
## V9  -0.095292651
## V10 -0.074944788

Did the random forest model significantly use the uninformative predictors (V6–V10)?

  • It looks like V6 - V10 had low importance scores compared to V1 - V5.

b

Now add an additional predictor that is highly correlated with one of the informative predictors. For example:

simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206

Fit another random forest model to these data. Did the importance score from V1 change? What happens when you add another predictor that is also highly correlated with V1?

model2 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)

rfImp2
##                Overall
## V1          5.69119973
## V2          6.06896061
## V3          0.62970218
## V4          7.04752238
## V5          1.87238438
## V6          0.13569065
## V7         -0.01345645
## V8         -0.04370565
## V9          0.00840438
## V10         0.02894814
## duplicate1  4.28331581
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1

model3 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp3 <- varImp(model3, scale = FALSE)

rfImp3
##                Overall
## V1          4.91687329
## V2          6.52816504
## V3          0.58711552
## V4          7.04870917
## V5          2.03115561
## V6          0.14213148
## V7          0.10991985
## V8         -0.08405687
## V9         -0.01075028
## V10         0.09230576
## duplicate1  3.80068234
## duplicate2  1.87721959
  • The importance measure for V1 dropped from about 8.7 to 5.7 with the introduction of one highly correlated derived variable, and to about 4.9 with two. This suggests that correlated predictors dilute each other's perceived importance in a random forest.

c

Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

library(party)

cf <- cforest(y ~ ., data = simulated)

varimp(cf) %>%
  data.frame()
##                      .
## V1          4.11910478
## V2          5.81149273
## V3          0.01133118
## V4          7.16991462
## V5          1.62054023
## V6         -0.01818532
## V7         -0.02795336
## V8         -0.03663807
## V9         -0.04360946
## V10        -0.03454443
## duplicate1  4.62957645
## duplicate2  1.08104915
  • The importances from party::varimp (with the default conditional = FALSE) are closely aligned with the randomForest importances obtained through caret: V1–V5 and duplicate1 score well above the uninformative predictors. A sketch of the conditional measure follows below.
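
The exercise specifically mentions the conditional argument of varimp, which adjusts importance for correlations among predictors (Strobl et al. 2007). A minimal sketch (the conditional measure is considerably slower to compute):

varimp(cf, conditional = TRUE) %>%
  data.frame()

With conditional = TRUE we would expect the importance attributed to V1, duplicate1, and duplicate2 to be adjusted downward relative to the unconditional measure, since they are highly correlated with one another.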

d

Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

# single regression tree (CART)
library(rpart)
rtree <- rpart(y ~ ., data = simulated)
varImp(rtree)
##              Overall
## duplicate1 1.6165532
## duplicate2 0.3521296
## V1         1.5149230
## V10        0.5254491
## V2         2.0690787
## V3         0.5805123
## V4         2.9099218
## V5         2.3438469
## V6         0.3027516
## V7         0.5113927
## V9         0.2054104
## V8         0.0000000
  • The variable importance scores show a similar pattern to the random forest, with V6–V10 scoring much lower than V1–V5, although the overall scale differs: in the randomForest model V4 scored above 7, whereas the top score here is about 2.9.
# boosted trees
library(gbm)

gbm <- train(y ~ ., data = simulated, method = 'gbm', verbose = F)

varImp(gbm, scale = F)
## gbm variable importance
## 
##            Overall
## V4         4662.70
## V2         3649.58
## V1         3237.12
## V5         2060.69
## V3         1307.70
## duplicate1 1246.79
## duplicate2  375.23
## V6          202.48
## V7          145.16
## V9          102.80
## V10          88.39
## V8           84.47
  • Here V4 has the highest score, V1–V5 (and the correlated duplicates) sit at the top, and V6–V10 are at the bottom.
# Cubist
library(Cubist)

cubist <- train(y ~ ., data = simulated, method = 'cubist')

varImp(cubist, scale = F)
## cubist variable importance
## 
##            Overall
## V2            69.5
## V1            54.0
## V4            50.0
## V5            38.0
## V3            32.0
## duplicate1    25.0
## duplicate2    25.0
## V6            10.0
## V8             3.0
## V7             0.0
## V9             0.0
## V10            0.0
  • The ordering differs here: V2 receives the highest score, followed by V1 and V4, and V1 retains substantial importance even with the two correlated duplicates (each scoring 25). As before, V1–V5 are at the top and V6–V10 at the bottom.

KJ 8.2

Use a simulation to show tree bias with different granularities.

set.seed(101)
v1 <- sample(1:2, 300, replace = T)
v2 <- sample(1:5, 300, replace = T)
v3 <- sample(1:100, 300, replace = T)

y <- rnorm(300)

df_rt <- data.frame(y=y, v1=v1, v2=v2, v3=v3)

rt_sim <- rpart(y ~ ., data = df_rt)

varImp(rt_sim)
##        Overall
## v1 0.002939551
## v2 0.108035126
## v3 0.388991570
  • The regression tree favors the more granular predictor: although none of the predictors are related to the response, v3 (100 distinct values) receives a much higher importance score than v2 (5 values) or v1 (2 values). A sketch that repeats the simulation is shown below.
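
A minimal sketch that repeats the noise-only simulation and counts how often each predictor is chosen for the root split (the seed and replication count are arbitrary):

# repeat the simulation and record the root split variable each time
set.seed(102)
first_splits <- replicate(200, {
  sim <- data.frame(y  = rnorm(300),
                    v1 = sample(1:2,   300, replace = TRUE),
                    v2 = sample(1:5,   300, replace = TRUE),
                    v3 = sample(1:100, 300, replace = TRUE))
  fit <- rpart(y ~ ., data = sim)
  as.character(fit$frame$var[1])   # "<leaf>" if no split is made
})
table(first_splits)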

KJ 8.3

In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

a

Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?

  • The bagging fraction is the fraction of the training data randomly selected to fit each new tree.
  • The learning rate (shrinkage) scales the contribution each new tree adds to the ensemble.
  • The model on the left uses a low bagging fraction and a low learning rate, so each tree sees a small random subsample and contributes only a small update; across many such trees, different predictors get a chance to be selected, spreading importance more evenly. The model on the right uses a high bagging fraction and a high learning rate, so early trees see most of the data and take large steps, quickly concentrating importance on the few strongest predictors (see the sketch below).
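
A minimal sketch that reproduces the two importance profiles on the solubility data, assuming the solubility objects (solTrainXtrans, solTrainY) from the AppliedPredictiveModeling package; the tree count and depth here are illustrative, not the book's exact settings:

library(gbm)
library(AppliedPredictiveModeling)
data(solubility)
solTrain <- cbind(solTrainXtrans, Solubility = solTrainY)

# both tuning parameters set to 0.1 (left-hand plot) vs. 0.9 (right-hand plot)
fit_low  <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
                n.trees = 100, interaction.depth = 7,
                shrinkage = 0.1, bag.fraction = 0.1)
fit_high <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
                n.trees = 100, interaction.depth = 7,
                shrinkage = 0.9, bag.fraction = 0.9)

head(summary(fit_low,  plotit = FALSE), 10)   # importance spread over many predictors
head(summary(fit_high, plotit = FALSE), 10)   # importance concentrated in a few predictors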

b

Which model do you think would be more predictive of other samples?

  • The model on the left (bagging fraction and learning rate of 0.1) would likely predict new samples better; the 0.9/0.9 model relies heavily on a few predictors and takes large greedy steps, making it more prone to overfitting.

c

How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

  • Increasing the interaction depth produces deeper, more complex trees with more opportunities for additional variables to be used as splits, so the slope of the predictor importance curve would flatten, spreading importance across a larger number of predictors in either model.

KJ 8.7

Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

# single regression tree (CART)
rt87 <- rpart(Yield ~ ., data = train)

test$rt_pred <- predict(rt87, test)
postResample(pred = test$rt_pred, obs = test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6876238 0.4401792 0.5245336
# bagged tree
library(ipred)
bt87 <- bagging(Yield ~ ., data = train)

test$bt_pred <- predict(bt87, test)
postResample(pred = test$bt_pred, obs = test$Yield)
##      RMSE  Rsquared       MAE 
## 0.5752769 0.5917306 0.4108245
# random forest
# rf87 <- randomForest(Yield ~ ., data = train, importance = T)
rf87 <- train(Yield ~ ., data = train, method = 'rf', importance = T)

test$rf_pred <- predict(rf87, test)
postResample(pred = test$rf_pred, obs = test$Yield)
##      RMSE  Rsquared       MAE 
## 0.5440224 0.6375295 0.3722084
# boosted tree
boost87 <- gbm(Yield ~ ., data = train, distribution = 'gaussian')

test$boost_pred <- predict(boost87, test)
postResample(pred = test$boost_pred, obs = test$Yield)
##      RMSE  Rsquared       MAE 
## 0.6452175 0.4689544 0.4991946
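
Another tree-based model worth including is Cubist. A minimal sketch using caret's committees/neighbors tuning grid (the grid values are illustrative):

# Cubist rule-based model
cubist87 <- train(x = train[, 2:ncol(train)],
                  y = train$Yield,
                  method = 'cubist',
                  tuneGrid = expand.grid(committees = c(1, 10, 50),
                                         neighbors = c(0, 5)),
                  trControl = ctrl)

test$cubist_pred <- predict(cubist87, test)
postResample(pred = test$cubist_pred, obs = test$Yield)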

a

Which tree-based regression model gives the optimal resampling and test set performance?

  • The random forest performed best on the test set, with RMSE = 0.544, \(R^2\) = 0.638, and MAE = 0.372.

b

Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

rf_vimp <- varImp(rf87)

plot(rf_vimp, top = 20)

  • ManufacturingProcess32 is at or near the top of the importance list for the linear (ridge), nonlinear (SVM), and tree-based (random forest) models, and BiologicalMaterial03/06/12 also appear in the top 10 for all three.

c

Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

getTree(rf87$finalModel, labelVar = T)
##    left daughter right daughter              split var  split point status
## 1              2              3 ManufacturingProcess09 -0.704917838     -3
## 2              4              5   BiologicalMaterial03 -0.332411567     -3
## 3              6              7 ManufacturingProcess32  0.006316353     -3
## 4              8              9   BiologicalMaterial10 -0.835567987     -3
## 5             10             11 ManufacturingProcess20  0.094667400     -3
## 6             12             13   BiologicalMaterial12 -0.588054954     -3
## 7             14             15 ManufacturingProcess17 -0.395567765     -3
## 8              0              0                   <NA>  0.000000000     -1
## 9             16             17 ManufacturingProcess44 -0.171157071     -3
## 10             0              0                   <NA>  0.000000000     -1
## 11             0              0                   <NA>  0.000000000     -1
## 12            18             19   BiologicalMaterial12 -1.551154700     -3
## 13            20             21 ManufacturingProcess11 -0.049658378     -3
## 14            22             23   BiologicalMaterial08  0.133284271     -3
## 15            24             25 ManufacturingProcess32  0.562155431     -3
## 16             0              0                   <NA>  0.000000000     -1
## 17            26             27   BiologicalMaterial05  0.648737688     -3
## 18            28             29 ManufacturingProcess22  0.778918315     -3
## 19            30             31 ManufacturingProcess05  0.116514308     -3
## 20            32             33 ManufacturingProcess01 -0.305870345     -3
## 21            34             35 ManufacturingProcess30  0.859145186     -3
## 22            36             37 ManufacturingProcess24  0.459651684     -3
## 23            38             39 ManufacturingProcess45  0.397960456     -3
## 24            40             41 ManufacturingProcess43  0.043858060     -3
## 25            42             43   BiologicalMaterial09 -0.903368928     -3
## 26            44             45 ManufacturingProcess37 -0.255315108     -3
## 27            46             47 ManufacturingProcess42  0.202795698     -3
## 28            48             49 ManufacturingProcess22  0.328552549     -3
## 29             0              0                   <NA>  0.000000000     -1
## 30            50             51 ManufacturingProcess30  0.295668166     -3
## 31            52             53 ManufacturingProcess29 -0.367279021     -3
## 32            54             55 ManufacturingProcess37 -0.929346991     -3
## 33            56             57 ManufacturingProcess20  0.050255936     -3
## 34            58             59 ManufacturingProcess18 -0.057641092     -3
## 35            60             61   BiologicalMaterial12  0.426754844     -3
## 36            62             63 ManufacturingProcess04 -1.012270101     -3
## 37             0              0                   <NA>  0.000000000     -1
## 38            64             65 ManufacturingProcess39  0.264938662     -3
## 39             0              0                   <NA>  0.000000000     -1
## 40            66             67 ManufacturingProcess37 -0.817008344     -3
## 41             0              0                   <NA>  0.000000000     -1
## 42             0              0                   <NA>  0.000000000     -1
## 43            68             69 ManufacturingProcess33  1.593045883     -3
## 44             0              0                   <NA>  0.000000000     -1
## 45             0              0                   <NA>  0.000000000     -1
## 46             0              0                   <NA>  0.000000000     -1
## 47             0              0                   <NA>  0.000000000     -1
## 48            70             71 ManufacturingProcess11  0.928359173     -3
## 49             0              0                   <NA>  0.000000000     -1
## 50             0              0                   <NA>  0.000000000     -1
## 51             0              0                   <NA>  0.000000000     -1
## 52             0              0                   <NA>  0.000000000     -1
## 53             0              0                   <NA>  0.000000000     -1
## 54             0              0                   <NA>  0.000000000     -1
## 55             0              0                   <NA>  0.000000000     -1
## 56             0              0                   <NA>  0.000000000     -1
## 57             0              0                   <NA>  0.000000000     -1
## 58             0              0                   <NA>  0.000000000     -1
## 59            72             73 ManufacturingProcess32 -0.178963339     -3
## 60            74             75   BiologicalMaterial09  0.337069788     -3
## 61             0              0                   <NA>  0.000000000     -1
## 62             0              0                   <NA>  0.000000000     -1
## 63             0              0                   <NA>  0.000000000     -1
## 64             0              0                   <NA>  0.000000000     -1
## 65             0              0                   <NA>  0.000000000     -1
## 66             0              0                   <NA>  0.000000000     -1
## 67            76             77 ManufacturingProcess17 -0.035050308     -3
## 68            78             79 ManufacturingProcess15  0.001364123     -3
## 69             0              0                   <NA>  0.000000000     -1
## 70             0              0                   <NA>  0.000000000     -1
## 71             0              0                   <NA>  0.000000000     -1
## 72            80             81 ManufacturingProcess18 -0.029067933     -3
## 73             0              0                   <NA>  0.000000000     -1
## 74             0              0                   <NA>  0.000000000     -1
## 75             0              0                   <NA>  0.000000000     -1
## 76             0              0                   <NA>  0.000000000     -1
## 77             0              0                   <NA>  0.000000000     -1
## 78            82             83   BiologicalMaterial08  0.539515928     -3
## 79            84             85 ManufacturingProcess43 -0.013746556     -3
## 80             0              0                   <NA>  0.000000000     -1
## 81            86             87   BiologicalMaterial10  1.676351754     -3
## 82             0              0                   <NA>  0.000000000     -1
## 83             0              0                   <NA>  0.000000000     -1
## 84             0              0                   <NA>  0.000000000     -1
## 85             0              0                   <NA>  0.000000000     -1
## 86            88             89   BiologicalMaterial06  0.117350199     -3
## 87             0              0                   <NA>  0.000000000     -1
## 88             0              0                   <NA>  0.000000000     -1
## 89             0              0                   <NA>  0.000000000     -1
##      prediction
## 1  -0.140836322
## 2  -1.107433844
## 3   0.124336450
## 4  -1.409782735
## 5  -0.368358779
## 6  -0.428770563
## 7   0.750495331
## 8  -0.252772705
## 9  -1.525483738
## 10 -0.140347188
## 11 -0.550768051
## 12 -0.943709101
## 13 -0.151495965
## 14  1.258102400
## 15  0.417378192
## 16 -2.061333551
## 17 -1.346867133
## 18 -0.568226364
## 19 -1.225321153
## 20 -0.571085916
## 21  0.140392696
## 22  0.750560386
## 23  1.816398616
## 24 -0.041883691
## 25  0.731610007
## 26 -1.176558278
## 27 -1.541505825
## 28 -0.611721645
## 29 -0.220264122
## 30 -1.451075203
## 31 -0.999567103
## 32 -0.291602402
## 33 -0.738776024
## 34 -0.105323060
## 35  0.522617205
## 36  0.925362431
## 37 -0.036048817
## 38  1.640465260
## 39  2.226909780
## 40  0.234254032
## 41 -0.663193568
## 42 -0.548059003
## 43  0.882159303
## 44 -1.233448299
## 45 -1.119668257
## 46 -1.650641783
## 47 -1.268665930
## 48 -0.579406565
## 49 -0.837927203
## 50 -1.489453391
## 51 -1.374318826
## 52 -1.163013035
## 53 -0.917844137
## 54 -0.800000523
## 55 -0.189922778
## 56 -0.601698165
## 57 -0.875853883
## 58 -0.794582425
## 59  0.009553501
## 60  0.326963695
## 61  0.913924225
## 62  0.653404052
## 63  1.142929134
## 64  1.751923259
## 65  1.556871760
## 66 -0.420733718
## 67  0.421393389
## 68  1.072855077
## 69  0.424489445
## 70 -0.594112829
## 71 -0.559798213
## 72  0.102413667
## 73 -0.269026997
## 74  0.395231720
## 75 -0.014376428
## 76  0.746866228
## 77  0.291204254
## 78  0.905797080
## 79  1.239913074
## 80  0.251110334
## 81  0.083826583
## 82  0.933790582
## 83  0.765829569
## 84  1.287050519
## 85  1.004225845
## 86  0.103918694
## 87  0.023550252
## 88  0.092630991
## 89  0.126494099
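
getTree() only returns one tree from the random forest as a table, which is hard to read. A more direct way to answer the question is to plot the single regression tree fit earlier (rt87); a minimal sketch, assuming the rpart.plot and partykit packages are installed:

library(rpart.plot)
library(partykit)

rpart.plot(rt87)       # plot the single CART tree fit earlier
plot(as.party(rt87))   # terminal nodes shown with boxplots of (scaled) Yield

In general, showing the distribution of yield in the terminal nodes makes it easier to see which combinations of manufacturing process and biological material values correspond to higher or lower yield.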