library(AppliedPredictiveModeling)
library(caret)
library(kableExtra)
library(dplyr)
library(elasticnet)
library(naniar)
library(mlbench)
library(earth)
library(kernlab)
library(randomForest)
library(party)
library(tibble)
library(gbm)
library(rpart)
library(arules)
library(arulesViz)
library(tidyverse)
library(readxl)
library(knitr)
library(ggplot2)
library(lubridate)
library(plyr)
library(RANN)
library(pls)
library(Cubist)
library(partykit)
A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
(a) Start R and use these commands to load the data: The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
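The loading commands did not survive extraction; with the AppliedPredictiveModeling package attached above, they are presumably:
data(ChemicalManufacturingProcess)
#the package supplies a single data frame, ChemicalManufacturingProcess, which
#combines Yield (the first column) with the 57 predictors; the code below uses it that way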
(b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values.
missing <- ChemicalManufacturingProcess %>%
miss_var_summary()
kable(missing %>%
filter(pct_miss > 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

| variable | n_miss | pct_miss |
|---|---|---|
| ManufacturingProcess03 | 15 | 8.5227273 |
| ManufacturingProcess11 | 10 | 5.6818182 |
| ManufacturingProcess10 | 9 | 5.1136364 |
| ManufacturingProcess25 | 5 | 2.8409091 |
| ManufacturingProcess26 | 5 | 2.8409091 |
| ManufacturingProcess27 | 5 | 2.8409091 |
| ManufacturingProcess28 | 5 | 2.8409091 |
| ManufacturingProcess29 | 5 | 2.8409091 |
| ManufacturingProcess30 | 5 | 2.8409091 |
| ManufacturingProcess31 | 5 | 2.8409091 |
| ManufacturingProcess33 | 5 | 2.8409091 |
| ManufacturingProcess34 | 5 | 2.8409091 |
| ManufacturingProcess35 | 5 | 2.8409091 |
| ManufacturingProcess36 | 5 | 2.8409091 |
| ManufacturingProcess02 | 3 | 1.7045455 |
| ManufacturingProcess06 | 2 | 1.1363636 |
| ManufacturingProcess01 | 1 | 0.5681818 |
| ManufacturingProcess04 | 1 | 0.5681818 |
| ManufacturingProcess05 | 1 | 0.5681818 |
| ManufacturingProcess07 | 1 | 0.5681818 |
| ManufacturingProcess08 | 1 | 0.5681818 |
| ManufacturingProcess12 | 1 | 0.5681818 |
| ManufacturingProcess14 | 1 | 0.5681818 |
| ManufacturingProcess22 | 1 | 0.5681818 |
| ManufacturingProcess23 | 1 | 0.5681818 |
| ManufacturingProcess24 | 1 | 0.5681818 |
| ManufacturingProcess40 | 1 | 0.5681818 |
| ManufacturingProcess41 | 1 | 0.5681818 |
imputation <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
predict_imputer <- predict(imputation, ChemicalManufacturingProcess)
#compare to above chart
missing2 <- predict_imputer %>%
miss_var_summary()
kable(missing2 %>%
filter(pct_miss > 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

| variable | n_miss | pct_miss |
|---|---|---|

The table above is now empty: after k-nearest-neighbors imputation, no variables contain missing values.
(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
near_zero <- nearZeroVar(predict_imputer)
part_c <- predict_imputer[,-near_zero]
training_rows <- createDataPartition(part_c$Yield, p = .80, list= FALSE)
train <- part_c[training_rows, ]
test <- part_c[-training_rows, ]
ridge_model <- train(train[,-1], train$Yield,
method = "ridge",
preProcess = c("center","scale"))
ridge_model
## Ridge Regression
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 7.640782 0.2153287 1.8302603
## 1e-04 6.617755 0.2435536 1.6419417
## 1e-01 1.481222 0.3879112 0.7139594
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
(d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
predict_value <- predict(ridge_model, test[,-1], s = 1,
mode = "raw",
type = "raw")
postResample(pred = predict_value, obs = test$Yield)
## RMSE Rsquared MAE
## 1.4340003 0.2587960 0.7018681
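The object ridge_model2 used below is never defined above. It was presumably fit directly with enet() from the elasticnet package; a minimal sketch, assuming the same training data and the tuned lambda of 0.1:
#hypothetical re-creation of ridge_model2 with elasticnet::enet()
ridge_model2 <- enet(x = as.matrix(train[,-1]), y = train$Yield,
lambda = 0.1, normalize = TRUE)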
predict_value2 <- predict(ridge_model2, as.matrix(test[,-1]), s = 1,
mode = "fraction",
type = "fit")
postResample(pred = predict_value2$fit, obs = test$Yield)
## RMSE Rsquared MAE
## 1.4340003 0.2587960 0.7018681
(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
get_coefficients <- predict(ridge_model2, as.matrix(test[,-1]), s = 1,
mode = "fraction",
type = "coefficients")
top5 <- head(sort(get_coefficients$coefficients, decreasing = TRUE), 5)
bottom5 <- tail(sort(get_coefficients$coefficients, decreasing = TRUE), 5)
top5
bottom5
## ManufacturingProcess32 ManufacturingProcess09 ManufacturingProcess04
## 0.3959117 0.2013714 0.1626802
## ManufacturingProcess19 ManufacturingProcess34
## 0.1510261 0.1326094
## ManufacturingProcess28 ManufacturingProcess17 ManufacturingProcess13
## -0.1261190 -0.1403821 -0.1784783
## ManufacturingProcess37 ManufacturingProcess20
## -0.1934206 -0.5915375
The manufacturing process predictors dominate the list.
(f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
top_predictors <- c('ManufacturingProcess32', 'ManufacturingProcess09', 'ManufacturingProcess04',
'ManufacturingProcess34', 'ManufacturingProcess39', 'ManufacturingProcess36',
'ManufacturingProcess28', 'ManufacturingProcess37', 'ManufacturingProcess17', 'ManufacturingProcess13')
multi_linear_model <- lm(Yield~ManufacturingProcess32+ManufacturingProcess09+ManufacturingProcess04+
ManufacturingProcess34+ManufacturingProcess39+ManufacturingProcess36+ManufacturingProcess28+
ManufacturingProcess37+ManufacturingProcess17+ManufacturingProcess13,
data = predict_imputer)
summary(multi_linear_model)
##
## Call:
## lm(formula = Yield ~ ManufacturingProcess32 + ManufacturingProcess09 +
## ManufacturingProcess04 + ManufacturingProcess34 + ManufacturingProcess39 +
## ManufacturingProcess36 + ManufacturingProcess28 + ManufacturingProcess37 +
## ManufacturingProcess17 + ManufacturingProcess13, data = predict_imputer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4187 -0.3904 0.0430 0.3862 1.6256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.001536 0.044719 -0.034 0.97265
## ManufacturingProcess32 0.540806 0.080931 6.682 3.45e-10 ***
## ManufacturingProcess09 0.361180 0.079144 4.564 9.78e-06 ***
## ManufacturingProcess04 0.108567 0.058421 1.858 0.06490 .
## ManufacturingProcess34 0.096110 0.046815 2.053 0.04166 *
## ManufacturingProcess39 0.145824 0.049084 2.971 0.00341 **
## ManufacturingProcess36 -0.126394 0.075187 -1.681 0.09464 .
## ManufacturingProcess28 -0.035867 0.063876 -0.562 0.57521
## ManufacturingProcess37 -0.079526 0.048249 -1.648 0.10120
## ManufacturingProcess17 -0.111111 0.081515 -1.363 0.17472
## ManufacturingProcess13 -0.096458 0.088673 -1.088 0.27827
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5927 on 165 degrees of freedom
## Multiple R-squared: 0.6688, Adjusted R-squared: 0.6487
## F-statistic: 33.32 on 10 and 165 DF, p-value: < 2.2e-16
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:
\[ y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2) \]
where the \(x\) values are random variables uniformly distributed on [0, 1] (there are also 5 other non-informative variables created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data. Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
neural_model <- expand.grid(.decay = c(0, 0.01, .1),
.size = c(1:10),
.bag = FALSE)
ctrl <- trainControl(method = "cv")
avNNet_model <- train(trainingData$x, trainingData$y,
method = "avNNet",
tuneGrid = neural_model,
trControl = ctrl,
preProcess = c("center", "scale"),
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1)
avNNet_model
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.439565 0.7644006 1.906904
## 0.00 2 2.538290 0.7448102 1.996572
## 0.00 3 2.097923 0.8270457 1.651204
## 0.00 4 2.003586 0.8428708 1.608316
## 0.00 5 2.495658 0.7821679 1.925544
## 0.00 6 2.498643 0.7589755 1.910032
## 0.00 7 3.695878 0.6121163 2.352241
## 0.00 8 6.282577 0.4593528 3.759895
## 0.00 9 4.973098 0.5538140 3.236094
## 0.00 10 2.703068 0.7437045 2.148617
## 0.01 1 2.433477 0.7660774 1.899358
## 0.01 2 2.488251 0.7557016 1.962511
## 0.01 3 2.087867 0.8302922 1.661617
## 0.01 4 2.057420 0.8371424 1.617592
## 0.01 5 2.036480 0.8378762 1.644359
## 0.01 6 2.202479 0.8113223 1.718370
## 0.01 7 2.362180 0.7969243 1.854537
## 0.01 8 2.468188 0.7717175 1.905309
## 0.01 9 2.442612 0.7632283 1.936275
## 0.01 10 2.465949 0.7629934 1.926444
## 0.10 1 2.440506 0.7640541 1.909513
## 0.10 2 2.563721 0.7398046 2.021249
## 0.10 3 2.144000 0.8186454 1.672726
## 0.10 4 2.112690 0.8281984 1.697981
## 0.10 5 2.075008 0.8319334 1.645113
## 0.10 6 2.189105 0.8143496 1.733985
## 0.10 7 2.323408 0.7912417 1.824610
## 0.10 8 2.359512 0.7870478 1.824359
## 0.10 9 2.371569 0.7894291 1.886362
## 0.10 10 2.396639 0.7779495 1.895462
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4, decay = 0 and bag = FALSE.
avNNet_predict <- predict(avNNet_model, newdata = testData$x)
avNNet_results <- postResample(pred = avNNet_predict, obs = testData$y)
avNNet_results
## RMSE Rsquared MAE
## 2.5093746 0.7778381 1.8429557
multi_variate <- expand.grid(.degree=1:2, .nprune=2:38)
mars_model <- train(x=trainingData$x, y=trainingData$y,
method="earth",
preProcess = c("center", "scale"),
tuneGrid=multi_variate,
trControl = ctrl)
mars_model
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.189975 0.2906552 3.462839
## 1 3 3.484994 0.5151055 2.764123
## 1 4 2.587964 0.7410329 2.101983
## 1 5 2.420658 0.7683094 1.985518
## 1 6 2.339561 0.7839667 1.871780
## 1 7 1.731624 0.8788241 1.423116
## 1 8 1.678080 0.8854956 1.311885
## 1 9 1.655259 0.8869872 1.291336
## 1 10 1.652845 0.8850859 1.299452
## 1 11 1.604286 0.8921908 1.256919
## 1 12 1.631147 0.8899063 1.266755
## 1 13 1.633440 0.8881519 1.270749
## 1 14 1.646108 0.8865326 1.279917
## 1 15 1.646108 0.8865326 1.279917
## 1 16 1.646108 0.8865326 1.279917
## 1 17 1.646108 0.8865326 1.279917
## 1 18 1.646108 0.8865326 1.279917
## 1 19 1.646108 0.8865326 1.279917
## 1 20 1.646108 0.8865326 1.279917
## 1 21 1.646108 0.8865326 1.279917
## 1 22 1.646108 0.8865326 1.279917
## 1 23 1.646108 0.8865326 1.279917
## 1 24 1.646108 0.8865326 1.279917
## 1 25 1.646108 0.8865326 1.279917
## 1 26 1.646108 0.8865326 1.279917
## 1 27 1.646108 0.8865326 1.279917
## 1 28 1.646108 0.8865326 1.279917
## 1 29 1.646108 0.8865326 1.279917
## 1 30 1.646108 0.8865326 1.279917
## 1 31 1.646108 0.8865326 1.279917
## 1 32 1.646108 0.8865326 1.279917
## 1 33 1.646108 0.8865326 1.279917
## 1 34 1.646108 0.8865326 1.279917
## 1 35 1.646108 0.8865326 1.279917
## 1 36 1.646108 0.8865326 1.279917
## 1 37 1.646108 0.8865326 1.279917
## 1 38 1.646108 0.8865326 1.279917
## 2 2 4.493713 0.1908055 3.729300
## 2 3 3.709145 0.4504905 2.987843
## 2 4 2.609735 0.7341463 2.103370
## 2 5 2.379342 0.7742456 1.931422
## 2 6 2.334586 0.7830818 1.849111
## 2 7 1.749014 0.8766222 1.427570
## 2 8 1.698294 0.8837462 1.400995
## 2 9 1.497818 0.9080943 1.207548
## 2 10 1.411723 0.9170883 1.129177
## 2 11 1.375422 0.9202096 1.096012
## 2 12 1.324813 0.9271017 1.054678
## 2 13 1.319186 0.9276385 1.043179
## 2 14 1.262835 0.9357576 1.000477
## 2 15 1.262417 0.9342643 1.001733
## 2 16 1.267577 0.9334989 1.014147
## 2 17 1.275619 0.9324509 1.016926
## 2 18 1.275619 0.9324509 1.016926
## 2 19 1.275619 0.9324509 1.016926
## 2 20 1.275619 0.9324509 1.016926
## 2 21 1.275619 0.9324509 1.016926
## 2 22 1.275619 0.9324509 1.016926
## 2 23 1.275619 0.9324509 1.016926
## 2 24 1.275619 0.9324509 1.016926
## 2 25 1.275619 0.9324509 1.016926
## 2 26 1.275619 0.9324509 1.016926
## 2 27 1.275619 0.9324509 1.016926
## 2 28 1.275619 0.9324509 1.016926
## 2 29 1.275619 0.9324509 1.016926
## 2 30 1.275619 0.9324509 1.016926
## 2 31 1.275619 0.9324509 1.016926
## 2 32 1.275619 0.9324509 1.016926
## 2 33 1.275619 0.9324509 1.016926
## 2 34 1.275619 0.9324509 1.016926
## 2 35 1.275619 0.9324509 1.016926
## 2 36 1.275619 0.9324509 1.016926
## 2 37 1.275619 0.9324509 1.016926
## 2 38 1.275619 0.9324509 1.016926
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 15 and degree = 2.
mars_predict <- predict(mars_model, newdata=testData$x)
mars_results <- postResample(pred=mars_predict, obs=testData$y)
mars_results
## RMSE Rsquared MAE
## 1.1908806 0.9428866 0.9496858
svm <- train(x=trainingData$x, y=trainingData$y,
method="svmRadial",
preProcess=c("center", "scale"),
tuneLength=20)
svm
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.535413 0.7832510 2.010246
## 0.50 2.312221 0.7976533 1.821205
## 1.00 2.179041 0.8121920 1.705301
## 2.00 2.089542 0.8246834 1.630071
## 4.00 2.033124 0.8330785 1.576070
## 8.00 2.011953 0.8360991 1.561024
## 16.00 2.010544 0.8363445 1.562455
## 32.00 2.010544 0.8363445 1.562455
## 64.00 2.010544 0.8363445 1.562455
## 128.00 2.010544 0.8363445 1.562455
## 256.00 2.010544 0.8363445 1.562455
## 512.00 2.010544 0.8363445 1.562455
## 1024.00 2.010544 0.8363445 1.562455
## 2048.00 2.010544 0.8363445 1.562455
## 4096.00 2.010544 0.8363445 1.562455
## 8192.00 2.010544 0.8363445 1.562455
## 16384.00 2.010544 0.8363445 1.562455
## 32768.00 2.010544 0.8363445 1.562455
## 65536.00 2.010544 0.8363445 1.562455
## 131072.00 2.010544 0.8363445 1.562455
##
## Tuning parameter 'sigma' was held constant at a value of 0.05869886
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.05869886 and C = 16.
svm_predict <- predict(svm, newdata=testData$x)
svm_results<- postResample(pred=svm_predict, obs=testData$y)
svm_results
## RMSE Rsquared MAE
## 2.061539 0.827650 1.566485
model_comparison <- data.frame(rbind(avNNet_results, mars_results, svm_results))
#sort by RMSE
model_comparison[order(model_comparison$RMSE),] %>% kable %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = F)

| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| mars_results | 1.190881 | 0.9428866 | 0.9496858 |
| svm_results | 2.061539 | 0.8276500 | 1.5664854 |
| avNNet_results | 2.509375 | 0.7778381 | 1.8429557 |
The multivariate adaptive regression splines (MARS) model has the best performance based on RMSE, \(R^2\), and MAE. In addition, the MARS model selects the five informative predictors (X1–X5), as seen below.
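The importance ranking below was presumably produced with caret's varImp() on the MARS fit:
varImp(mars_model)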
## earth variable importance
##
## Overall
## X1 100.00
## X4 85.07
## X2 69.06
## X5 48.95
## X3 39.50
## X8 0.00
## X7 0.00
## X6 0.00
## X10 0.00
## X9 0.00
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
data(ChemicalManufacturingProcess)
missing <- ChemicalManufacturingProcess %>%
miss_var_summary()
kable(missing %>%
filter(pct_miss > 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

| variable | n_miss | pct_miss |
|---|---|---|
| ManufacturingProcess03 | 15 | 8.5227273 |
| ManufacturingProcess11 | 10 | 5.6818182 |
| ManufacturingProcess10 | 9 | 5.1136364 |
| ManufacturingProcess25 | 5 | 2.8409091 |
| ManufacturingProcess26 | 5 | 2.8409091 |
| ManufacturingProcess27 | 5 | 2.8409091 |
| ManufacturingProcess28 | 5 | 2.8409091 |
| ManufacturingProcess29 | 5 | 2.8409091 |
| ManufacturingProcess30 | 5 | 2.8409091 |
| ManufacturingProcess31 | 5 | 2.8409091 |
| ManufacturingProcess33 | 5 | 2.8409091 |
| ManufacturingProcess34 | 5 | 2.8409091 |
| ManufacturingProcess35 | 5 | 2.8409091 |
| ManufacturingProcess36 | 5 | 2.8409091 |
| ManufacturingProcess02 | 3 | 1.7045455 |
| ManufacturingProcess06 | 2 | 1.1363636 |
| ManufacturingProcess01 | 1 | 0.5681818 |
| ManufacturingProcess04 | 1 | 0.5681818 |
| ManufacturingProcess05 | 1 | 0.5681818 |
| ManufacturingProcess07 | 1 | 0.5681818 |
| ManufacturingProcess08 | 1 | 0.5681818 |
| ManufacturingProcess12 | 1 | 0.5681818 |
| ManufacturingProcess14 | 1 | 0.5681818 |
| ManufacturingProcess22 | 1 | 0.5681818 |
| ManufacturingProcess23 | 1 | 0.5681818 |
| ManufacturingProcess24 | 1 | 0.5681818 |
| ManufacturingProcess40 | 1 | 0.5681818 |
| ManufacturingProcess41 | 1 | 0.5681818 |
imputation <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
predict_imputer <- predict(imputation, ChemicalManufacturingProcess)
#compare to above chart
missing2 <- predict_imputer %>%
miss_var_summary()
kable(missing2 %>%
filter(pct_miss > 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

| variable | n_miss | pct_miss |
|---|---|---|

Again, the empty table confirms that no missing values remain after imputation.
#split data into training and test data
near_zero <- nearZeroVar(predict_imputer)
part_c <- predict_imputer[,-near_zero]
training_rows <- createDataPartition(part_c$Yield, p = .80, list= FALSE)
train <- part_c[training_rows, ]
test <- part_c[-training_rows, ]
t_ctrl <- trainControl(method = "repeatedcv", repeats = 5)
We tune five models: partial least squares (PLS), k-nearest neighbors (KNN), averaged neural networks, MARS, and an SVM with a radial basis function kernel. (Note that t_ctrl is defined here but not passed to the train calls below, so caret's default bootstrap resampling is used.)
pls_model <- train(train[,-1], train$Yield,
method = "pls",
preProcess = c("center","scale"),
tuneLength=10)
pls_model
## Partial Least Squares
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.8463865 0.3696498 0.6485554
## 2 1.0152953 0.3495444 0.6607548
## 3 0.9874943 0.3948160 0.6444871
## 4 1.1996455 0.3589614 0.6898662
## 5 1.5043198 0.3178992 0.7546030
## 6 1.6994675 0.2953171 0.7986005
## 7 1.9832263 0.2698646 0.8535170
## 8 2.1206293 0.2527328 0.8885957
## 9 2.3144748 0.2407298 0.9297193
## 10 2.4927220 0.2320355 0.9722958
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
pls_predictions <- predict(pls_model, test)
results <- data.frame(t(postResample(pred = pls_predictions, obs = test$Yield))) %>%
mutate("Model"= "PLS")set.seed(1)
knn_model <- train(train[,-1], train$Yield,
method = "knn",
preProcess = c("center","scale"),
tuneLength=10)
knn_model
## k-Nearest Neighbors
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.8284109 0.3532782 0.6456100
## 7 0.8073042 0.3815288 0.6334889
## 9 0.8104003 0.3774551 0.6417109
## 11 0.8073493 0.3847691 0.6414043
## 13 0.8081197 0.3871352 0.6424470
## 15 0.8096741 0.3882767 0.6425047
## 17 0.8131737 0.3838400 0.6462842
## 19 0.8177885 0.3819478 0.6494542
## 21 0.8197708 0.3841125 0.6516429
## 23 0.8246242 0.3825287 0.6554087
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 7.
knn_predictions <- predict(knn_model, test)
results <- data.frame(t(postResample(pred = knn_predictions, obs = test$Yield))) %>%
mutate("Model"= "KNN")nnetGrid <- expand.grid(.decay=c(0, 0.01, 0.1),
.size=c(1, 5, 10),
.bag=FALSE)
set.seed(1)
nnet_model <- train(train[,-1], train$Yield,
method = "avNNet",
tuneGrid = nnetGrid,
preProc = c("center", "scale"),
trace=FALSE,
linout=TRUE,
maxit=500)
nnet_model
## Model Averaged Neural Network
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 0.8779968 0.3252338 0.6975736
## 0.00 5 0.8496724 0.4003271 0.6683851
## 0.00 10 0.7605671 0.4793945 0.6022623
## 0.01 1 0.9519080 0.3152073 0.7479341
## 0.01 5 0.7545812 0.5046892 0.5943394
## 0.01 10 0.7726129 0.4905262 0.6032850
## 0.10 1 0.9449596 0.3500809 0.7406291
## 0.10 5 0.7591736 0.4948695 0.5974037
## 0.10 10 0.7732982 0.4868300 0.6059522
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5, decay = 0.01 and bag = FALSE.
nnet_predictions <- predict(nnet_model, test)
results <- data.frame(t(postResample(pred = nnet_predictions, obs = test$Yield))) %>%
mutate("Model"= "nnet")marsGrid <- expand.grid(.degree=1:2,
.nprune=2:10)
set.seed(1)
mars_model <- train(train[,-1], train$Yield,
method = "earth",
tuneGrid = marsGrid,
preProc = c("center", "scale"))
mars_model
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 0.7783152 0.4198334 0.6055136
## 1 3 0.7043953 0.5214058 0.5555687
## 1 4 0.6871336 0.5471404 0.5431290
## 1 5 0.7811994 0.5246012 0.5627611
## 1 6 0.8531350 0.4889543 0.5901189
## 1 7 0.8800830 0.4754905 0.6037979
## 1 8 0.9958844 0.4750123 0.6305042
## 1 9 1.2525056 0.4484326 0.6881623
## 1 10 1.4043548 0.4275383 0.7133107
## 2 2 0.7973342 0.3908401 0.6207388
## 2 3 0.7382615 0.4791600 0.5773297
## 2 4 0.7327771 0.4922024 0.5749608
## 2 5 0.8996915 0.4646330 0.6168141
## 2 6 0.9397321 0.4463805 0.6262385
## 2 7 0.9847164 0.4277122 0.6400375
## 2 8 1.0337790 0.4087467 0.6573526
## 2 9 1.0601138 0.4011863 0.6767900
## 2 10 1.0834788 0.3923856 0.6837816
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
mars_predictions <- predict(mars_model, test)
results <- data.frame(t(postResample(pred = mars_predictions, obs = test$Yield))) %>%
mutate("Model"= "MARS")set.seed(1)
svm_model <- train(train[,-1], train$Yield,
method = "svmRadial",
tuneLength=10,
preProc = c("center", "scale"))
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.8129474 0.4277107 0.6484259
## 0.50 0.7699433 0.4621842 0.6131232
## 1.00 0.7388357 0.4960953 0.5867002
## 2.00 0.7204512 0.5163894 0.5701909
## 4.00 0.7136990 0.5200364 0.5623580
## 8.00 0.7106642 0.5222605 0.5600247
## 16.00 0.7097141 0.5231418 0.5590835
## 32.00 0.7097141 0.5231418 0.5590835
## 64.00 0.7097141 0.5231418 0.5590835
## 128.00 0.7097141 0.5231418 0.5590835
##
## Tuning parameter 'sigma' was held constant at a value of 0.01376305
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01376305 and C = 16.
svm_predictions <- predict(svm_model, test)
results <- data.frame(t(postResample(pred = svm_predictions, obs = test$Yield))) %>%
mutate("Model"= "SVM")Below is a summary of the models:
| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| SVM | NA | NA | NA |
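The table above retains only the SVM row because each results <- data.frame(...) assignment overwrites the previous model's metrics instead of appending to them. A minimal sketch of accumulating all five test-set results, assuming the prediction objects created above:
#hypothetical accumulation of the five test-set results with dplyr::bind_rows()
results <- bind_rows(
data.frame(t(postResample(pred = pls_predictions, obs = test$Yield)), Model = "PLS"),
data.frame(t(postResample(pred = knn_predictions, obs = test$Yield)), Model = "KNN"),
data.frame(t(postResample(pred = nnet_predictions, obs = test$Yield)), Model = "nnet"),
data.frame(t(postResample(pred = mars_predictions, obs = test$Yield)), Model = "MARS"),
data.frame(t(postResample(pred = svm_predictions, obs = test$Yield)), Model = "SVM"))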
(a) Which nonlinear regression model gives the optimal resampling and test set performance?
The SVM model is the best non-linear model.
(b) Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
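The two importance listings below were presumably generated with caret's varImp(), first on the SVM fit (svmRadial has no native importance measure, so caret falls back to a loess R-squared filter) and then on the PLS fit, the optimal linear model:
varImp(svm_model)
varImp(pls_model)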
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 84.62
## BiologicalMaterial06 84.05
## ManufacturingProcess13 80.40
## BiologicalMaterial03 71.42
## BiologicalMaterial02 63.78
## BiologicalMaterial12 63.08
## ManufacturingProcess17 59.51
## ManufacturingProcess09 57.76
## ManufacturingProcess31 56.59
## ManufacturingProcess29 52.82
## ManufacturingProcess33 52.68
## ManufacturingProcess06 47.89
## BiologicalMaterial01 43.54
## BiologicalMaterial04 43.49
## BiologicalMaterial11 37.98
## ManufacturingProcess11 32.83
## BiologicalMaterial08 31.51
## ManufacturingProcess12 31.24
## ManufacturingProcess04 30.31
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 91.92
## ManufacturingProcess13 79.50
## BiologicalMaterial02 76.86
## BiologicalMaterial06 75.62
## ManufacturingProcess09 73.32
## ManufacturingProcess33 72.36
## BiologicalMaterial03 69.08
## BiologicalMaterial04 61.99
## ManufacturingProcess17 59.20
## ManufacturingProcess06 58.37
## BiologicalMaterial01 57.30
## ManufacturingProcess12 55.53
## BiologicalMaterial08 53.58
## BiologicalMaterial12 51.24
## ManufacturingProcess28 48.55
## ManufacturingProcess11 47.34
## BiologicalMaterial11 47.07
## ManufacturingProcess04 46.85
## ManufacturingProcess10 40.85
The SVM top ten is very similar to the PLS (linear) top ten; the main differences are that the SVM ranks BiologicalMaterial12 and ManufacturingProcess31 in its top ten, where PLS instead includes BiologicalMaterial04 and ManufacturingProcess33. Overall, the manufacturing process variables dominate both lists.
(c) Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
There seems to be a positive relationship between the ManufacturingProcess32 variable and the yield.
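No plot survived extraction here; a minimal sketch of the kind of plot described, assuming the imputed data frame predict_imputer from above:
#scatter plot of yield against the top predictor, with a linear trend line
ggplot(predict_imputer, aes(x = ManufacturingProcess32, y = Yield)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)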
Recreate the simulated data from Exercise 7.2:
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
(a) Fit a random forest model to all of the predictors, then estimate the variable importance scores:
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.83890885
## V2 6.49023056
## V3 0.67583163
## V4 7.58822553
## V5 2.27426009
## V6 0.17436781
## V7 0.15136583
## V8 -0.03078937
## V9 -0.02989832
## V10 -0.08529218
Did the random forest model significantly use the uninformative predictors (V6 – V10)?
The random forest model assigned some importance to the uninformative predictors (V6–V10), but their scores are near zero, so they did not contribute significantly.
(b) Now add an additional predictor that is highly correlated with one of the informative predictors. For example:
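The code that created the correlated predictor is not shown; following the exercise text in Kuhn and Johnson, it was presumably:
#add a predictor highly correlated with V1
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)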
## [1] 0.9396216
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
After adding an additional predictor that is highly correlated with V1, the importance score for V1 decreased significantly.
model2 <- randomForest(y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
## Overall
## V1 6.29780744
## V2 6.08038134
## V3 0.58410718
## V4 6.93924427
## V5 2.03104094
## V6 0.07947642
## V7 -0.02566414
## V8 -0.11007435
## V9 -0.08839463
## V10 -0.00715093
## duplicate1 3.56411581
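Similarly, a second predictor correlated with V1 was presumably added with:
#add a second predictor highly correlated with V1
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate2, simulated$V1)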
## [1] 0.9312569
model3 <- randomForest(y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000)
rfImp3 <- varImp(model3, scale = FALSE)
rfImp3
## Overall
## V1 5.656397024
## V2 6.957366954
## V3 0.539700105
## V4 7.280227792
## V5 2.094226861
## V6 0.141163232
## V7 0.092792498
## V8 -0.096325566
## V9 -0.007463533
## V10 0.016839393
## duplicate1 2.566313355
## duplicate2 2.654958084
The importance score for V1 decreased again.
(c) Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
Random Forest Model 1
#original data
partc <- simulated[,1:11]
tree <- cforest(y ~., data = partc)
imp_partc <- varimp(tree, conditional = TRUE)
imp_partc[order(-abs(imp_partc))]
## V1 V4 V2 V5 V8 V9
## 6.29693233 5.91248630 5.31985191 1.45798478 -0.50833336 -0.28273187
## V10 V6 V7 V3
## -0.16748484 -0.16535778 -0.12086303 0.08627462
Random Forest Model 2 with Highly Correlated Variable
#data that includes duplicate1
partc2 <- simulated[,1:12]
tree <- cforest(y ~., data = partc2)
imp_partc <- varimp(tree, conditional = TRUE)
imp_partc[order(-abs(imp_partc))]
## V4 V2 V1 duplicate1 V5 V8
## 5.95962750 4.96154104 3.74925620 1.91622487 1.40958798 -0.39347533
## V7 V6 V10 V3 V9
## -0.19621092 -0.17986349 -0.10474866 -0.10007628 -0.05495108
Random Forest Model 3 with Two Highly Correlated Variables
#data that includes duplicate1 and duplicate2
partc3 <- simulated[,1:13]
tree <- cforest(y ~., data = partc3)
imp_partc <- varimp(tree, conditional = TRUE)
imp_partc[order(-abs(imp_partc))]
## V4 V2 V1 duplicate2 V5 duplicate1
## 5.64999342 4.93107392 2.85755671 1.40425819 1.37039662 1.18838712
## V8 V10 V9 V6 V7 V3
## -0.32731205 -0.32423142 -0.23095456 -0.19130723 -0.14729672 -0.02688669
The conditional importances show the same general pattern as the traditional random forest: V1's importance is diluted as correlated duplicates are added, although the conditional measure assigns less importance to the duplicates themselves.
(d) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
Boosted Trees
Model 1
#original data
boosted_trees <- simulated[,1:11]
gbm_model <- gbm(y ~ ., data = boosted_trees, distribution = "gaussian")
summary(gbm_model)
## var rel.inf
## V1 V1 29.0673614
## V4 V4 28.1648450
## V2 V2 22.2804710
## V5 V5 12.3698244
## V3 V3 7.6010502
## V6 V6 0.3393232
## V7 V7 0.1771248
## V8 V8 0.0000000
## V9 V9 0.0000000
## V10 V10 0.0000000
Model 2 with Highly Correlated Variable
#data that includes duplicate1
boosted_trees2 <- simulated[,1:12]
gbm_model2 <- gbm(y ~ ., data = boosted_trees2, distribution = "gaussian")
summary(gbm_model2)
## var rel.inf
## V4 V4 29.0129018
## V2 V2 24.6253635
## V1 V1 23.9838931
## V5 V5 11.1039359
## V3 V3 8.4126013
## duplicate1 duplicate1 2.7165144
## V6 V6 0.1447899
## V7 V7 0.0000000
## V8 V8 0.0000000
## V9 V9 0.0000000
## V10 V10 0.0000000
Model 3 with Two Highly Correlated Variables
#data that includes duplicate1 and duplicate2
boosted_trees3 <- simulated[,1:13]
gbm_model3 <- gbm(y ~ ., data = boosted_trees3, distribution = "gaussian")
summary(gbm_model3)
## var rel.inf
## V4 V4 29.4671008
## V2 V2 22.6773152
## V1 V1 17.3114858
## V5 V5 11.4561331
## duplicate1 duplicate1 9.9548300
## V3 V3 7.7375395
## duplicate2 duplicate2 0.8260121
## V6 V6 0.2525465
## V7 V7 0.1979189
## V9 V9 0.1191182
## V8 V8 0.0000000
## V10 V10 0.0000000
The decline in importance is more gradual in comparison with the traditional random forest model. However, more variables are assigned levels of importance with values greater than zero. In addition, the highly correlated predictors that were added (duplicate1 and duplicate2) still appear among the list of influential variables.
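The exercise also asks about Cubist; a minimal sketch of the analogous check (output not shown), using the Cubist package loaded above. The object name cubist_sim is hypothetical:
#fit a Cubist model to the simulated data (including both duplicates) and inspect importance
cubist_sim <- cubist(x = simulated[, names(simulated) != "y"], y = simulated$y, committees = 10)
varImp(cubist_sim)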
Use a simulation to show tree bias with different granularities.
set.seed(824)
x1_low <- sample(-1:1, 250, replace = T)
y <- x1_low + rnorm(250, mean=0, sd=1)
x2_mid <- x1_low + round(rnorm(250, mean=0, sd=1), 1)
x3_high <- x1_low + round(rnorm(250, mean=0, sd=1), 2)
df <- data.frame(cbind(x1_low, x2_mid, x3_high, y))
str(df)
## 'data.frame': 250 obs. of 4 variables:
## $ x1_low : num 1 -1 0 1 -1 1 -1 -1 0 0 ...
## $ x2_mid : num -0.2 -1.4 0.4 3.9 -0.3 0.4 -1.2 -0.5 0.2 -1.2 ...
## $ x3_high: num -0.84 -1.34 -0.31 0.55 -1.07 0.07 -1.19 0.53 -0.07 0.69 ...
## $ y : num 1.984 -0.582 2.043 1.96 -2.943 ...
tree_mod <- rpart(y ~ ., data = df)
correlation_to_y <- c(cor(x1_low, y), cor(x2_mid, y), cor(x3_high, y))
unique_values <- c(length(unique(x1_low)),
length(unique(x2_mid)),
length(unique(x3_high)))
var_imp <- varImp(tree_mod)
table_sim <- cbind(correlation_to_y, unique_values, var_imp)
names(table_sim)[3] <- "importance"
table_sim
## correlation_to_y unique_values importance
## x1_low 0.6516570 3 0.5592728
## x2_mid 0.5068903 71 0.7014998
## x3_high 0.4670269 213 0.9166740
Even though x1_low has the strongest correlation with y, the tree assigns it the lowest importance; importance increases with the number of distinct values, illustrating the selection bias of trees toward predictors with higher granularity.
In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
(a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?
The model on the right uses a higher bagging fraction, so each tree sees a larger random sample of the training data, and a higher learning rate, so each tree contributes more to the final model. Together these cause the model to commit quickly to the same few strong predictors, concentrating the importance on them.
(b) Which model do you think would be more predictive of other samples?
The left model, which is regularized by the lower values of the two tuning parameters, will exhibit smaller variance on new samples and a lower RMSE. This makes it more predictive of other samples.
(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
Increasing the interaction depth lets more predictors enter each tree, so importance would be spread across more variables: the top predictors' importance would decrease and lower-ranked predictors' importance would increase, flattening the slope of predictor importance for either model.
Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
(a) Which tree-based regression model gives the optimal resampling and test set performance?
imputation <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
predict_imputer <- predict(imputation, ChemicalManufacturingProcess)
near_zero <- nearZeroVar(predict_imputer)
part_a <- predict_imputer[,-near_zero]
training_rows <- createDataPartition(part_a$Yield, p = .80, list= FALSE)
train <- part_a[training_rows, ]
trainx <- train[,-1]
trainy <- train$Yield
test <- part_a[-training_rows, ]
testx <- test[,-1]
testy <- test$Yield
Random Forest
set.seed(999)
randomforest_grid <- expand.grid(mtry=seq(7,49,by=3))
randomforest_model <- train(trainx, trainy, method = "rf",
tuneGrid = randomforest_grid)
rf_imp <- varImp(randomforest_model, scale = FALSE)
rf_imp <- rf_imp$importance %>% rownames_to_column("var")
rf_imp <- rf_imp[order(-rf_imp$Overall), , drop=FALSE] %>% remove_rownames()
plot(randomforest_model)
head(rf_imp, 15)
## var Overall
## 1 ManufacturingProcess32 21.633862
## 2 BiologicalMaterial12 10.057920
## 3 ManufacturingProcess13 8.806751
## 4 BiologicalMaterial06 8.481441
## 5 ManufacturingProcess09 6.032000
## 6 ManufacturingProcess31 5.466688
## 7 BiologicalMaterial03 5.442345
## 8 ManufacturingProcess36 5.125186
## 9 ManufacturingProcess17 4.846523
## 10 BiologicalMaterial11 3.755388
## 11 ManufacturingProcess06 3.367949
## 12 BiologicalMaterial02 2.998984
## 13 ManufacturingProcess11 2.818685
## 14 ManufacturingProcess28 2.786365
## 15 ManufacturingProcess33 2.317485
Gradient Boost
gbm_grid <- expand.grid(.interaction.depth = seq(1, 7, by = 2),
.n.trees = seq(100, 1000, by = 100),
.shrinkage = c(0.005, 0.01, 0.1, 0.2),
.n.minobsinnode = seq(5, 16, by = 5))
gbm_train_model <- train(trainx, trainy, method = "gbm",
tuneGrid = gbm_grid, verbose = FALSE)
gbm_imp <- data.frame(varImp(gbm_train_model)$importance) %>%
rownames_to_column("var")
gbm_imp <- gbm_imp[order(-gbm_imp$Overall), , drop=FALSE] %>%
remove_rownames()
plot(gbm_train_model)
head(gbm_imp, 15)
## var Overall
## 1 ManufacturingProcess32 100.000000
## 2 BiologicalMaterial12 40.575656
## 3 ManufacturingProcess17 20.558080
## 4 ManufacturingProcess13 20.029846
## 5 ManufacturingProcess31 18.018887
## 6 ManufacturingProcess09 17.951764
## 7 BiologicalMaterial09 13.246676
## 8 BiologicalMaterial03 12.095358
## 9 ManufacturingProcess06 11.370039
## 10 BiologicalMaterial11 9.905592
## 11 ManufacturingProcess43 8.874230
## 12 ManufacturingProcess24 7.912428
## 13 BiologicalMaterial06 7.658822
## 14 ManufacturingProcess11 7.334153
## 15 ManufacturingProcess39 7.197968
Cubist
cubist_grid <- expand.grid(committees = c(1,5,10,20,40,80), neighbors = c(0,1,3,5,7,9))
cubist_model = train(trainx, trainy, method = "cubist", tuneGrid = cubist_grid)
cubist_imp <- data.frame(varImp(cubist_model)$importance) %>%
rownames_to_column("var")
cubist_imp <- cubist_imp[order(-cubist_imp$Overall), , drop=FALSE] %>%
remove_rownames()
plot(cubist_model)
head(cubist_imp, 15)
## var Overall
## 1 ManufacturingProcess32 100.00000
## 2 ManufacturingProcess17 56.12245
## 3 ManufacturingProcess09 52.04082
## 4 BiologicalMaterial06 32.65306
## 5 ManufacturingProcess39 25.51020
## 6 BiologicalMaterial02 24.48980
## 7 BiologicalMaterial03 24.48980
## 8 BiologicalMaterial12 23.46939
## 9 ManufacturingProcess04 22.44898
## 10 ManufacturingProcess13 21.42857
## 11 ManufacturingProcess33 21.42857
## 12 ManufacturingProcess24 17.34694
## 13 ManufacturingProcess29 17.34694
## 14 ManufacturingProcess25 15.30612
## 15 ManufacturingProcess27 13.26531
Model Comparison
#use predict function for all three different models
rand_forest_train <- predict(randomforest_model)
gradient_boost_train <- predict(gbm_train_model)
cubist_train <- predict(cubist_model)
#combine results of three different models
model_compare <- data.frame(rbind(postResample(pred = cubist_train, obs = trainy),
postResample(pred = gradient_boost_train, obs = trainy),
postResample(pred = rand_forest_train, obs = trainy)),
row.names = c("Cubist","Gradient Boost", "Random Forest"))
model_compare
## RMSE Rsquared MAE
## Cubist 2.645043e-08 1.0000000 1.794383e-08
## Gradient Boost 9.661302e-02 0.9921559 7.447140e-02
## Random Forest 2.500425e-01 0.9651050 1.942209e-01
#test set performance
rand_forest_test <- predict(randomforest_model, newdata = testx)
gradient_boost_test <- predict(gbm_train_model, newdata = testx)
cubist_test <- predict(cubist_model, newdata = testx)
model_compare2 <- data.frame(rbind(postResample(pred = cubist_test, obs = testy),
postResample(pred = gradient_boost_test, obs = testy),
postResample(pred = rand_forest_test, obs = testy)),
row.names = c("Cubist","Gradient Boost", "Random Forest"))
model_compare2
## RMSE Rsquared MAE
## Cubist 0.4786084 0.7644056 0.3198840
## Gradient Boost 0.5555375 0.6826077 0.3966018
## Random Forest 0.5844761 0.6638290 0.4193630
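The optimal Cubist tuning parameters shown below were presumably printed from the bestTune element of the caret fit:
cubist_model$bestTune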
## committees neighbors
## 32 80 1
The Cubist model resulted in the lowest RMSE and MAE values and the highest \(R^2\) for both the training set and the test set predictions. (The first table reflects resubstitution on the training data rather than resampling, which is why the Cubist error is essentially zero.)
(b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
## var Overall
## 1 ManufacturingProcess32 100.00000
## 2 ManufacturingProcess17 56.12245
## 3 ManufacturingProcess09 52.04082
## 4 BiologicalMaterial06 32.65306
## 5 ManufacturingProcess39 25.51020
## 6 BiologicalMaterial02 24.48980
## 7 BiologicalMaterial03 24.48980
## 8 BiologicalMaterial12 23.46939
## 9 ManufacturingProcess04 22.44898
## 10 ManufacturingProcess13 21.42857
(c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
set.seed(456)
ctrl <- trainControl(method = "cv", number = 10)
tree_grid <- expand.grid(maxdepth= seq(1,15, by=1))
tree_model <- train(trainx, trainy, method = "rpart2",
tuneGrid = tree_grid, trControl = ctrl)
plot(as.party(tree_model$finalModel), gp=gpar(fontsize=11))
Imagine 10,000 receipts sitting on your table. Each receipt represents a transaction with items that were purchased. The receipt is a representation of stuff that went into a customer's basket – and therefore 'Market Basket Analysis'.
That is exactly what the Groceries Data Set contains: a collection of receipts with each line representing 1 receipt and the items purchased. Each line is called a transaction and each column in a row represents an item.
Your assignment is to use R to mine the data for association rules. You should report support, confidence, and lift, and your top 10 rules by lift. Turn in as you would the other problems from HA and KJ. Due 07/18/20 with the packaged set, HW #2.
First, we will load the data.
#read csv from github path
groceryds <- read.transactions("https://raw.githubusercontent.com/aaronzalkisps/data624/master/GroceryDataSet.csv", sep=",")
itemFrequencyPlot(groceryds, topN=10, type="absolute", main="Top 10 Items")
We can see that whole milk is the most common item. Next, we will use the apriori algorithm and display ten rules with their support, confidence, and lift. (Note that top_n(10) with no weight argument ranks by the last column, count, so these are the ten most frequent rules, arranged by lift.)
apriori(groceryds, parameter=list(supp=0.001, conf=0.5) , control=list(verbose=FALSE)) %>%
DATAFRAME() %>%
arrange(desc(lift)) %>%
top_n(10) %>%
kable() %>%
kable_styling()

| LHS | RHS | support | confidence | coverage | lift | count |
|---|---|---|---|---|---|---|
| {root vegetables,tropical fruit} | {other vegetables} | 0.0123030 | 0.5845411 | 0.0210473 | 3.020999 | 121 |
| {rolls/buns,root vegetables} | {other vegetables} | 0.0122013 | 0.5020921 | 0.0243010 | 2.594890 | 120 |
| {root vegetables,yogurt} | {other vegetables} | 0.0129131 | 0.5000000 | 0.0258261 | 2.584078 | 127 |
| {root vegetables,yogurt} | {whole milk} | 0.0145399 | 0.5629921 | 0.0258261 | 2.203354 | 143 |
| {domestic eggs,other vegetables} | {whole milk} | 0.0123030 | 0.5525114 | 0.0222674 | 2.162336 | 121 |
| {rolls/buns,root vegetables} | {whole milk} | 0.0127097 | 0.5230126 | 0.0243010 | 2.046888 | 125 |
| {other vegetables,pip fruit} | {whole milk} | 0.0135231 | 0.5175097 | 0.0261312 | 2.025351 | 133 |
| {tropical fruit,yogurt} | {whole milk} | 0.0151500 | 0.5173611 | 0.0292832 | 2.024770 | 149 |
| {other vegetables,yogurt} | {whole milk} | 0.0222674 | 0.5128806 | 0.0434164 | 2.007235 | 219 |
| {other vegetables,whipped/sour cream} | {whole milk} | 0.0146416 | 0.5070423 | 0.0288765 | 1.984385 | 144 |
Below, we can see the summary of the rules. There are 232 rules of association, most of which are based on 2 or 3 items.
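The code that produced the verbose mining log and summary below is not shown; judging from the mining info at the end of the output (support 0.01, confidence 0.2), it was presumably:
#mine a rule set at support 0.01 and confidence 0.2, then summarize it
rules <- apriori(groceryds, parameter = list(supp = 0.01, conf = 0.2))
summary(rules)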
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.2 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 98
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.01s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [232 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 232 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3
## 1 151 80
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 2.341 3.000 3.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01007 Min. :0.2006 Min. :0.01729 Min. :0.8991
## 1st Qu.:0.01200 1st Qu.:0.2470 1st Qu.:0.03437 1st Qu.:1.4432
## Median :0.01490 Median :0.3170 Median :0.05241 Median :1.7277
## Mean :0.02005 Mean :0.3321 Mean :0.06708 Mean :1.7890
## 3rd Qu.:0.02227 3rd Qu.:0.4033 3rd Qu.:0.07565 3rd Qu.:2.0762
## Max. :0.25552 Max. :0.5862 Max. :1.00000 Max. :3.2950
## count
## Min. : 99.0
## 1st Qu.: 118.0
## Median : 146.5
## Mean : 197.2
## 3rd Qu.: 219.0
## Max. :2513.0
##
## mining info:
## data ntransactions support confidence
## groceryds 9835 0.01 0.2
We can see the top fifteen rules based on lift below. They are mostly associated with root vegetables as a consequent item.
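A sketch of the inspection call, assuming the rules object mined above:
#top 15 rules by lift
inspect(head(sort(rules, by = "lift"), 15))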
## lhs rhs support
## [1] {citrus fruit,other vegetables} => {root vegetables} 0.01037112
## [2] {other vegetables,yogurt} => {whipped/sour cream} 0.01016777
## [3] {other vegetables,tropical fruit} => {root vegetables} 0.01230300
## [4] {beef} => {root vegetables} 0.01738688
## [5] {citrus fruit,root vegetables} => {other vegetables} 0.01037112
## [6] {root vegetables,tropical fruit} => {other vegetables} 0.01230300
## [7] {other vegetables,whole milk} => {root vegetables} 0.02318251
## [8] {curd,whole milk} => {yogurt} 0.01006609
## [9] {other vegetables,yogurt} => {root vegetables} 0.01291307
## [10] {other vegetables,yogurt} => {tropical fruit} 0.01230300
## [11] {other vegetables,root vegetables} => {citrus fruit} 0.01037112
## [12] {other vegetables,rolls/buns} => {root vegetables} 0.01220132
## [13] {tropical fruit,whole milk} => {root vegetables} 0.01199797
## [14] {rolls/buns,root vegetables} => {other vegetables} 0.01220132
## [15] {root vegetables,yogurt} => {other vegetables} 0.01291307
## confidence coverage lift count
## [1] 0.3591549 0.02887646 3.295045 102
## [2] 0.2341920 0.04341637 3.267062 100
## [3] 0.3427762 0.03589222 3.144780 121
## [4] 0.3313953 0.05246568 3.040367 171
## [5] 0.5862069 0.01769192 3.029608 102
## [6] 0.5845411 0.02104728 3.020999 121
## [7] 0.3097826 0.07483477 2.842082 228
## [8] 0.3852140 0.02613116 2.761356 99
## [9] 0.2974239 0.04341637 2.728698 127
## [10] 0.2833724 0.04341637 2.700550 121
## [11] 0.2188841 0.04738180 2.644626 102
## [12] 0.2863962 0.04260295 2.627525 120
## [13] 0.2836538 0.04229792 2.602365 118
## [14] 0.5020921 0.02430097 2.594890 120
## [15] 0.5000000 0.02582613 2.584078 127
We can see the top fifteen rules based on confidence below. They are mostly associated with whole milk as a consequent item.
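Presumably produced the same way, sorted by confidence:
#top 15 rules by confidence
inspect(head(sort(rules, by = "confidence"), 15))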
## lhs rhs support
## [1] {citrus fruit,root vegetables} => {other vegetables} 0.01037112
## [2] {root vegetables,tropical fruit} => {other vegetables} 0.01230300
## [3] {curd,yogurt} => {whole milk} 0.01006609
## [4] {butter,other vegetables} => {whole milk} 0.01148958
## [5] {root vegetables,tropical fruit} => {whole milk} 0.01199797
## [6] {root vegetables,yogurt} => {whole milk} 0.01453991
## [7] {domestic eggs,other vegetables} => {whole milk} 0.01230300
## [8] {whipped/sour cream,yogurt} => {whole milk} 0.01087951
## [9] {rolls/buns,root vegetables} => {whole milk} 0.01270971
## [10] {other vegetables,pip fruit} => {whole milk} 0.01352313
## [11] {tropical fruit,yogurt} => {whole milk} 0.01514997
## [12] {other vegetables,yogurt} => {whole milk} 0.02226741
## [13] {other vegetables,whipped/sour cream} => {whole milk} 0.01464159
## [14] {rolls/buns,root vegetables} => {other vegetables} 0.01220132
## [15] {root vegetables,yogurt} => {other vegetables} 0.01291307
## confidence coverage lift count
## [1] 0.5862069 0.01769192 3.029608 102
## [2] 0.5845411 0.02104728 3.020999 121
## [3] 0.5823529 0.01728521 2.279125 99
## [4] 0.5736041 0.02003050 2.244885 113
## [5] 0.5700483 0.02104728 2.230969 118
## [6] 0.5629921 0.02582613 2.203354 143
## [7] 0.5525114 0.02226741 2.162336 121
## [8] 0.5245098 0.02074225 2.052747 107
## [9] 0.5230126 0.02430097 2.046888 125
## [10] 0.5175097 0.02613116 2.025351 133
## [11] 0.5173611 0.02928317 2.024770 149
## [12] 0.5128806 0.04341637 2.007235 219
## [13] 0.5070423 0.02887646 1.984385 144
## [14] 0.5020921 0.02430097 2.594890 120
## [15] 0.5000000 0.02582613 2.584078 127
Below we will plot the rules:
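A minimal sketch of a plotting call, using arulesViz's default scatter plot of the rules (support vs. confidence, shaded by lift):
#scatter plot of all 232 rules
plot(rules)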