library(AppliedPredictiveModeling)
library(caret)
library(kableExtra)
library(dplyr)
library(elasticnet)
library(naniar)
library(mlbench)
library(earth)
library(kernlab)
library(randomForest)
library(party)
library(tibble)
library(gbm)
library(rpart)
library(arules)
library(arulesViz)
library(tidyverse)
library(readxl)
library(knitr)
library(ggplot2)
library(lubridate)
library(plyr)

A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
(a) Start R and use these commands to load the data: The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
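The commands referenced here are presumably the standard ones from the text; loading the package makes the ChemicalManufacturingProcess data frame (Yield plus the 57 predictors) available:

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)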
(b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values.
missing <- ChemicalManufacturingProcess %>%
miss_var_summary()
kable(missing %>%
filter(pct_miss > 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

| variable | n_miss | pct_miss |
|---|---|---|
| ManufacturingProcess03 | 15 | 8.5227273 |
| ManufacturingProcess11 | 10 | 5.6818182 |
| ManufacturingProcess10 | 9 | 5.1136364 |
| ManufacturingProcess25 | 5 | 2.8409091 |
| ManufacturingProcess26 | 5 | 2.8409091 |
| ManufacturingProcess27 | 5 | 2.8409091 |
| ManufacturingProcess28 | 5 | 2.8409091 |
| ManufacturingProcess29 | 5 | 2.8409091 |
| ManufacturingProcess30 | 5 | 2.8409091 |
| ManufacturingProcess31 | 5 | 2.8409091 |
| ManufacturingProcess33 | 5 | 2.8409091 |
| ManufacturingProcess34 | 5 | 2.8409091 |
| ManufacturingProcess35 | 5 | 2.8409091 |
| ManufacturingProcess36 | 5 | 2.8409091 |
| ManufacturingProcess02 | 3 | 1.7045455 |
| ManufacturingProcess06 | 2 | 1.1363636 |
| ManufacturingProcess01 | 1 | 0.5681818 |
| ManufacturingProcess04 | 1 | 0.5681818 |
| ManufacturingProcess05 | 1 | 0.5681818 |
| ManufacturingProcess07 | 1 | 0.5681818 |
| ManufacturingProcess08 | 1 | 0.5681818 |
| ManufacturingProcess12 | 1 | 0.5681818 |
| ManufacturingProcess14 | 1 | 0.5681818 |
| ManufacturingProcess22 | 1 | 0.5681818 |
| ManufacturingProcess23 | 1 | 0.5681818 |
| ManufacturingProcess24 | 1 | 0.5681818 |
| ManufacturingProcess40 | 1 | 0.5681818 |
| ManufacturingProcess41 | 1 | 0.5681818 |
imputation <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
predict_imputer <- predict(imputation, ChemicalManufacturingProcess)
#compare to above chart
missing2 <- predict_imputer %>%
miss_var_summary()
kable(missing2 %>%
filter(pct_miss > 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

| variable | n_miss | pct_miss |
|---|---|---|

The empty table confirms that the kNN imputation filled in all of the missing values.
(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
near_zero <- nearZeroVar(predict_imputer)
part_c <- predict_imputer[,-near_zero]
training_rows <- createDataPartition(part_c$Yield, p = .80, list= FALSE)
train <- part_c[training_rows, ]
test <- part_c[-training_rows, ]
ridge_model <- train(train[,-1], train$Yield,
method = "ridge",
# tuneGrid = ridge_grid,
# trControl = ctrl,
preProcess = c("center","scale"))
ridge_model

## Ridge Regression
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 8.958523 0.1149972 2.1878232
## 1e-04 7.738035 0.1265474 1.9649354
## 1e-01 1.440014 0.3460875 0.7504608
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.1.
(d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
predict_value <- predict(ridge_model, test[,-1], s = 1,
mode = "raw",
type = "raw")
postResample(pred = predict_value, obs = test$Yield)

## RMSE Rsquared MAE
## 1.3943349 0.2120608 0.6452461
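The second set of predictions below uses ridge_model2, which is not defined in the code shown above; a plausible definition, assuming a direct elasticnet ridge fit at the selected penalty (lambda = 0.1), would be:

# Hypothetical refit of the ridge model with elasticnet at the chosen penalty
ridge_model2 <- elasticnet::enet(x = as.matrix(train[,-1]), y = train$Yield,
                                 lambda = 0.1, normalize = TRUE)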
predict_value2 <- predict(ridge_model2, as.matrix(test[,-1]), s = 1,
mode = "fraction",
type = "fit")
postResample(pred = predict_value2$fit, obs = test$Yield)

## RMSE Rsquared MAE
## 1.3943349 0.2120608 0.6452461
(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
get_coefficients <- predict(ridge_model2, as.matrix(test[,-1]), s = 1,
mode = "fraction",
type = "coefficients")
top5 <- head(sort(get_coefficients$coefficients, decreasing = TRUE), 5)
bottom5 <- tail(sort(get_coefficients$coefficients, decreasing = TRUE), 5)
top5

## ManufacturingProcess32 ManufacturingProcess09 ManufacturingProcess04
## 0.3789046 0.2430406 0.1421202
## ManufacturingProcess34 ManufacturingProcess19
## 0.1363196 0.1206293

bottom5

## ManufacturingProcess37 ManufacturingProcess17 ManufacturingProcess13
## -0.1398039 -0.1429121 -0.1565554
## ManufacturingProcess28 ManufacturingProcess16
## -0.1627970 -0.5709971
(f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
top_predictors <- c('ManufacturingProcess32', 'ManufacturingProcess09', 'ManufacturingProcess04',
'ManufacturingProcess34', 'ManufacturingProcess39', 'ManufacturingProcess36',
'ManufacturingProcess28', 'ManufacturingProcess37', 'ManufacturingProcess17', 'ManufacturingProcess13')
multi_linear_model <- lm(Yield~ManufacturingProcess32+ManufacturingProcess09+ManufacturingProcess04+
ManufacturingProcess34+ManufacturingProcess39+ManufacturingProcess36+ManufacturingProcess28+
ManufacturingProcess37+ManufacturingProcess17+ManufacturingProcess13,
data = predict_imputer)
summary(multi_linear_model)

##
## Call:
## lm(formula = Yield ~ ManufacturingProcess32 + ManufacturingProcess09 +
## ManufacturingProcess04 + ManufacturingProcess34 + ManufacturingProcess39 +
## ManufacturingProcess36 + ManufacturingProcess28 + ManufacturingProcess37 +
## ManufacturingProcess17 + ManufacturingProcess13, data = predict_imputer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4187 -0.3904 0.0430 0.3862 1.6256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.001536 0.044719 -0.034 0.97265
## ManufacturingProcess32 0.540806 0.080931 6.682 3.45e-10 ***
## ManufacturingProcess09 0.361180 0.079144 4.564 9.78e-06 ***
## ManufacturingProcess04 0.108567 0.058421 1.858 0.06490 .
## ManufacturingProcess34 0.096110 0.046815 2.053 0.04166 *
## ManufacturingProcess39 0.145824 0.049084 2.971 0.00341 **
## ManufacturingProcess36 -0.126394 0.075187 -1.681 0.09464 .
## ManufacturingProcess28 -0.035867 0.063876 -0.562 0.57521
## ManufacturingProcess37 -0.079526 0.048249 -1.648 0.10120
## ManufacturingProcess17 -0.111111 0.081515 -1.363 0.17472
## ManufacturingProcess13 -0.096458 0.088673 -1.088 0.27827
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5927 on 165 degrees of freedom
## Multiple R-squared: 0.6688, Adjusted R-squared: 0.6487
## F-statistic: 33.32 on 10 and 165 DF, p-value: < 2.2e-16
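Scatter plots of the top predictors against Yield help show the direction of each relationship; a minimal sketch using caret's featurePlot on the imputed data created above is:

# Sketch: visualize the relationships between the top predictors and Yield
featurePlot(x = predict_imputer[, top_predictors],
            y = predict_imputer$Yield,
            plot = "scatter",
            type = c("p", "smooth"))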
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:
\[ y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2) \]
where the \(x\) values are random variables uniformly distributed between [0, 1] (five other non-informative variables are also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Tune several models on these data. Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
neural_model <- expand.grid(.decay = c(0, 0.01, .1),
.size = c(1:10),
.bag = FALSE)
ctrl <- trainControl(method = "cv")
avNNet_model <- train(trainingData$x, trainingData$y,
method = "avNNet",
tuneGrid = neural_model,
trControl = ctrl,
preProcess = c("center", "scale"),
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1)
avNNet_model

## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.439565 0.7644006 1.906904
## 0.00 2 2.538290 0.7448102 1.996572
## 0.00 3 2.097923 0.8270457 1.651204
## 0.00 4 2.003586 0.8428708 1.608316
## 0.00 5 2.495658 0.7821679 1.925544
## 0.00 6 2.498643 0.7589755 1.910032
## 0.00 7 3.695878 0.6121163 2.352241
## 0.00 8 6.282577 0.4593528 3.759895
## 0.00 9 4.973098 0.5538140 3.236094
## 0.00 10 2.703068 0.7437045 2.148617
## 0.01 1 2.433477 0.7660774 1.899358
## 0.01 2 2.488251 0.7557016 1.962511
## 0.01 3 2.087867 0.8302922 1.661617
## 0.01 4 2.057420 0.8371424 1.617592
## 0.01 5 2.036480 0.8378762 1.644359
## 0.01 6 2.202479 0.8113223 1.718370
## 0.01 7 2.362180 0.7969243 1.854537
## 0.01 8 2.468188 0.7717175 1.905309
## 0.01 9 2.442612 0.7632283 1.936275
## 0.01 10 2.465949 0.7629934 1.926444
## 0.10 1 2.440506 0.7640541 1.909513
## 0.10 2 2.563721 0.7398046 2.021249
## 0.10 3 2.144000 0.8186454 1.672726
## 0.10 4 2.112690 0.8281984 1.697981
## 0.10 5 2.075008 0.8319334 1.645113
## 0.10 6 2.189105 0.8143496 1.733985
## 0.10 7 2.323408 0.7912417 1.824610
## 0.10 8 2.359512 0.7870478 1.824359
## 0.10 9 2.371569 0.7894291 1.886362
## 0.10 10 2.396639 0.7779495 1.895462
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4, decay = 0 and bag = FALSE.
avNNet_predict <- predict(avNNet_model, newdata = testData$x)
avNNet_results <- postResample(pred = avNNet_predict, obs = testData$y)
avNNet_results

## RMSE Rsquared MAE
## 2.5093746 0.7778381 1.8429557
multi_variate <- expand.grid(.degree=1:2, .nprune=2:38)
mars_model <- train(x=trainingData$x, y=trainingData$y,
method="earth",
preProcess = c("center", "scale"),
tuneGrid=multi_variate,
trControl = ctrl)
mars_model

## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.189975 0.2906552 3.462839
## 1 3 3.484994 0.5151055 2.764123
## 1 4 2.587964 0.7410329 2.101983
## 1 5 2.420658 0.7683094 1.985518
## 1 6 2.339561 0.7839667 1.871780
## 1 7 1.731624 0.8788241 1.423116
## 1 8 1.678080 0.8854956 1.311885
## 1 9 1.655259 0.8869872 1.291336
## 1 10 1.652845 0.8850859 1.299452
## 1 11 1.604286 0.8921908 1.256919
## 1 12 1.631147 0.8899063 1.266755
## 1 13 1.633440 0.8881519 1.270749
## 1 14 1.646108 0.8865326 1.279917
## 1 15 1.646108 0.8865326 1.279917
## 1 16 1.646108 0.8865326 1.279917
## 1 17 1.646108 0.8865326 1.279917
## 1 18 1.646108 0.8865326 1.279917
## 1 19 1.646108 0.8865326 1.279917
## 1 20 1.646108 0.8865326 1.279917
## 1 21 1.646108 0.8865326 1.279917
## 1 22 1.646108 0.8865326 1.279917
## 1 23 1.646108 0.8865326 1.279917
## 1 24 1.646108 0.8865326 1.279917
## 1 25 1.646108 0.8865326 1.279917
## 1 26 1.646108 0.8865326 1.279917
## 1 27 1.646108 0.8865326 1.279917
## 1 28 1.646108 0.8865326 1.279917
## 1 29 1.646108 0.8865326 1.279917
## 1 30 1.646108 0.8865326 1.279917
## 1 31 1.646108 0.8865326 1.279917
## 1 32 1.646108 0.8865326 1.279917
## 1 33 1.646108 0.8865326 1.279917
## 1 34 1.646108 0.8865326 1.279917
## 1 35 1.646108 0.8865326 1.279917
## 1 36 1.646108 0.8865326 1.279917
## 1 37 1.646108 0.8865326 1.279917
## 1 38 1.646108 0.8865326 1.279917
## 2 2 4.493713 0.1908055 3.729300
## 2 3 3.709145 0.4504905 2.987843
## 2 4 2.609735 0.7341463 2.103370
## 2 5 2.379342 0.7742456 1.931422
## 2 6 2.334586 0.7830818 1.849111
## 2 7 1.749014 0.8766222 1.427570
## 2 8 1.698294 0.8837462 1.400995
## 2 9 1.497818 0.9080943 1.207548
## 2 10 1.411723 0.9170883 1.129177
## 2 11 1.375422 0.9202096 1.096012
## 2 12 1.324813 0.9271017 1.054678
## 2 13 1.319186 0.9276385 1.043179
## 2 14 1.262835 0.9357576 1.000477
## 2 15 1.262417 0.9342643 1.001733
## 2 16 1.267577 0.9334989 1.014147
## 2 17 1.275619 0.9324509 1.016926
## 2 18 1.275619 0.9324509 1.016926
## 2 19 1.275619 0.9324509 1.016926
## 2 20 1.275619 0.9324509 1.016926
## 2 21 1.275619 0.9324509 1.016926
## 2 22 1.275619 0.9324509 1.016926
## 2 23 1.275619 0.9324509 1.016926
## 2 24 1.275619 0.9324509 1.016926
## 2 25 1.275619 0.9324509 1.016926
## 2 26 1.275619 0.9324509 1.016926
## 2 27 1.275619 0.9324509 1.016926
## 2 28 1.275619 0.9324509 1.016926
## 2 29 1.275619 0.9324509 1.016926
## 2 30 1.275619 0.9324509 1.016926
## 2 31 1.275619 0.9324509 1.016926
## 2 32 1.275619 0.9324509 1.016926
## 2 33 1.275619 0.9324509 1.016926
## 2 34 1.275619 0.9324509 1.016926
## 2 35 1.275619 0.9324509 1.016926
## 2 36 1.275619 0.9324509 1.016926
## 2 37 1.275619 0.9324509 1.016926
## 2 38 1.275619 0.9324509 1.016926
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 15 and degree = 2.
mars_predict <- predict(mars_model, newdata=testData$x)
mars_results <- postResample(pred=mars_predict, obs=testData$y)
mars_results

## RMSE Rsquared MAE
## 1.1908806 0.9428866 0.9496858
svm <- train(x=trainingData$x, y=trainingData$y,
method="svmRadial",
preProcess=c("center", "scale"),
tuneLength=20)
svm

## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.535413 0.7832510 2.010246
## 0.50 2.312221 0.7976533 1.821205
## 1.00 2.179041 0.8121920 1.705301
## 2.00 2.089542 0.8246834 1.630071
## 4.00 2.033124 0.8330785 1.576070
## 8.00 2.011953 0.8360991 1.561024
## 16.00 2.010544 0.8363445 1.562455
## 32.00 2.010544 0.8363445 1.562455
## 64.00 2.010544 0.8363445 1.562455
## 128.00 2.010544 0.8363445 1.562455
## 256.00 2.010544 0.8363445 1.562455
## 512.00 2.010544 0.8363445 1.562455
## 1024.00 2.010544 0.8363445 1.562455
## 2048.00 2.010544 0.8363445 1.562455
## 4096.00 2.010544 0.8363445 1.562455
## 8192.00 2.010544 0.8363445 1.562455
## 16384.00 2.010544 0.8363445 1.562455
## 32768.00 2.010544 0.8363445 1.562455
## 65536.00 2.010544 0.8363445 1.562455
## 131072.00 2.010544 0.8363445 1.562455
##
## Tuning parameter 'sigma' was held constant at a value of 0.05869886
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.05869886 and C = 16.
svm_predict <- predict(svm, newdata=testData$x)
svm_results<- postResample(pred=svm_predict, obs=testData$y)
svm_results

## RMSE Rsquared MAE
## 2.061539 0.827650 1.566485
model_comparison <- data.frame(rbind(avNNet_results, mars_results, svm_results))
#sort by RMSE
model_comparison[order(model_comparison$RMSE),] %>% kable %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = F)

|  | RMSE | Rsquared | MAE |
|---|---|---|---|
| mars_results | 1.190881 | 0.9428866 | 0.9496858 |
| svm_results | 2.061539 | 0.8276500 | 1.5664854 |
| avNNet_results | 2.509375 | 0.7778381 | 1.8429557 |
The Multivariate Adaptive Regression Splines (MARS) model has the best performance based on RMSE, \(R^2\), and MAE. In addition, the MARS model selects the five informative predictors (X1–X5), as seen below.
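The importance ranking below was presumably generated with caret's variable importance function applied to the tuned MARS model:

varImp(mars_model)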
## earth variable importance
##
## Overall
## X1 100.00
## X4 85.07
## X2 69.06
## X5 48.95
## X3 39.50
## X8 0.00
## X7 0.00
## X6 0.00
## X10 0.00
## X9 0.00
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
(a) Which nonlinear regression model gives the optimal resampling and test set performance?
(b) Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
(c) Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
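A minimal sketch of how nonlinear models from Chapter 7 might be tuned on the same imputed data and train/test split used in Exercise 6.3 (hypothetical code, not run here; results depend on the resampling seed):

# Sketch: tune a MARS model and a radial SVM on the chemical manufacturing data
set.seed(101)  # hypothetical seed
mars_chem <- train(x = train[,-1], y = train$Yield,
                   method = "earth",
                   tuneGrid = expand.grid(.degree = 1:2, .nprune = 2:30),
                   trControl = trainControl(method = "cv"))
svm_chem <- train(x = train[,-1], y = train$Yield,
                  method = "svmRadial",
                  preProcess = c("center", "scale"),
                  tuneLength = 10,
                  trControl = trainControl(method = "cv"))

# Resampling comparison, test-set performance, and variable importance for parts (a)-(b)
summary(resamples(list(MARS = mars_chem, SVM = svm_chem)))
postResample(predict(svm_chem, test[,-1]), test$Yield)
varImp(svm_chem)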
Recreate the simulated data from Exercise 7.2:
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

(a) Fit a random forest model to all of the predictors, then estimate the variable importance scores:
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1

## Overall
## V1 8.83890885
## V2 6.49023056
## V3 0.67583163
## V4 7.58822553
## V5 2.27426009
## V6 0.17436781
## V7 0.15136583
## V8 -0.03078937
## V9 -0.02989832
## V10 -0.08529218
Did the random forest model significantly use the uninformative predictors (V6–V10)?
The random forest model made only marginal use of the uninformative predictors (V6–V10); their importance scores are essentially zero (some are slightly negative), so they did not contribute meaningfully to the model.
(b) Now add an additional predictor that is highly correlated with one of the informative predictors. For example:
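The code that adds the correlated predictor is not shown in the rendered output; following the text, it would be:

simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)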
## [1] 0.9396216
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
After adding a predictor that is highly correlated with V1 (duplicate1), the importance score for V1 decreases noticeably, with the importance shared between V1 and duplicate1. Adding a second correlated predictor (duplicate2) dilutes V1's importance even further, as the fits below show.
model2 <- randomForest(y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2

## Overall
## V1 6.29780744
## V2 6.08038134
## V3 0.58410718
## V4 6.93924427
## V5 2.03104094
## V6 0.07947642
## V7 -0.02566414
## V8 -0.11007435
## V9 -0.08839463
## V10 -0.00715093
## duplicate1 3.56411581
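The second correlated predictor is presumably created in the same way before refitting:

simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate2, simulated$V1)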
## [1] 0.9312569
model3 <- randomForest(y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000)
rfImp3 <- varImp(model3, scale = FALSE)
rfImp3

## Overall
## V1 5.656397024
## V2 6.957366954
## V3 0.539700105
## V4 7.280227792
## V5 2.094226861
## V6 0.141163232
## V7 0.092792498
## V8 -0.096325566
## V9 -0.007463533
## V10 0.016839393
## duplicate1 2.566313355
## duplicate2 2.654958084
(c) Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
partc <- simulated[, 1:11]
ctrl_c <- cforest_control(mtry = ncol(partc) - 1)
tree <- cforest(y ~., data = partc, controls = ctrl_c)
imp_partc <- varimp(tree, conditional = TRUE)
imp_partc[order(-abs(imp_partc))]

## V4 V2 V1 V5 V7
## 5.8700671260 4.6425360967 3.4451723147 0.8798145368 0.0333058286
## V3 V10 V9 V6 V8
## 0.0231691852 0.0127178781 -0.0098722305 0.0062508985 -0.0006704277
partc <- simulated[, 1:12]
ctrl_c <- cforest_control(mtry = ncol(partc) - 1)
tree <- cforest(y ~., data = partc, controls = ctrl_c)
imp_partc <- varimp(tree, conditional = TRUE)
imp_partc[order(-abs(imp_partc))]

## V4 V2 V1 V5 duplicate1 V3
## 5.842388468 4.598544222 1.797449359 0.780799053 0.100614076 0.023228320
## V7 V10 V9 V6 V8
## -0.018127549 0.016796764 -0.005541508 0.004369460 -0.002707915
partc <- simulated[, 1:13]
ctrl_c <- cforest_control(mtry = ncol(partc) - 1)
tree <- cforest(y ~., data = partc, controls = ctrl_c)
imp_partc <- varimp(tree, conditional = TRUE)
imp_partc[order(-abs(imp_partc))]

## V4 V2 V1 V5 duplicate1 duplicate2
## 6.119450532 4.781446865 1.250977063 0.693989348 0.068191074 0.052517746
## V6 V10 V3 V8 V7 V9
## 0.017969967 -0.011482455 0.010383519 -0.006283995 0.006260121 -0.002817153
(d) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
boosted_trees <- simulated[, 1:11]
gbm_model <- gbm(y ~ ., data = boosted_trees, distribution = "gaussian")
summary(gbm_model)

## var rel.inf
## V4 V4 30.6474589
## V1 V1 27.2261194
## V2 V2 21.2628689
## V5 V5 11.7734871
## V3 V3 7.7264246
## V6 V6 0.8378481
## V7 V7 0.2165746
## V10 V10 0.1568537
## V9 V9 0.1523647
## V8 V8 0.0000000
boosted_trees2 <- simulated[, 1:12]
gbm_model2 <- gbm(y ~ ., data = boosted_trees2, distribution = "gaussian")
summary(gbm_model2)

## var rel.inf
## V4 V4 29.1688776
## V1 V1 23.4062168
## V2 V2 22.8354612
## V5 V5 11.2618599
## V3 V3 8.0134684
## duplicate1 duplicate1 4.8613122
## V9 V9 0.2615355
## V6 V6 0.1912683
## V7 V7 0.0000000
## V8 V8 0.0000000
## V10 V10 0.0000000
boosted_trees3 <- simulated[, 1:12]
gbm_model3 <- gbm(y ~ ., data = boosted_trees3, distribution = "gaussian")
summary(gbm_model3)

## var rel.inf
## V4 V4 29.0397620
## V2 V2 22.7779799
## V1 V1 20.2779219
## V5 V5 11.0767985
## duplicate1 duplicate1 8.4016151
## V3 V3 8.2755652
## V6 V6 0.1503573
## V7 V7 0.0000000
## V8 V8 0.0000000
## V9 V9 0.0000000
## V10 V10 0.0000000
For the boosted trees, the decline in importance is more gradual than in the traditional random forest model, and more variables receive non-zero importance. The highly correlated duplicate predictor still appears among the influential variables, so the same general pattern of diluted importance for V1 occurs.
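The exercise also suggests Cubist. A minimal sketch of a Cubist fit on the same data (assuming the Cubist package is installed; not run here) would be:

library(Cubist)

# Fit a Cubist model and inspect its variable importance
cubist_model <- cubist(x = simulated[, setdiff(names(simulated), "y")],
                       y = simulated$y)
varImp(cubist_model)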
Use a simulation to show tree bias with different granularities.
set.seed(824)
x1_low <- sample(-1:1, 250, replace = T)
y <- x1_low + rnorm(250, mean=0, sd=1)
x2_mid <- x1_low + round(rnorm(250, mean=0, sd=1), 1)
x3_high <- x1_low + round(rnorm(250, mean=0, sd=1), 2)
df <- data.frame(cbind(x1_low, x2_mid, x3_high, y))
str(df)

## 'data.frame': 250 obs. of 4 variables:
## $ x1_low : num 1 -1 0 1 -1 1 -1 -1 0 0 ...
## $ x2_mid : num -0.2 -1.4 0.4 3.9 -0.3 0.4 -1.2 -0.5 0.2 -1.2 ...
## $ x3_high: num -0.84 -1.34 -0.31 0.55 -1.07 0.07 -1.19 0.53 -0.07 0.69 ...
## $ y : num 1.984 -0.582 2.043 1.96 -2.943 ...
tree_mod <- rpart(y ~ ., data = df)
correlation_to_y <- c(cor(x1_low, y), cor(x2_mid, y), cor(x3_high, y))
unique_values <- c(length(unique(x1_low)),
length(unique(x2_mid)),
length(unique(x3_high)))
var_imp <- varImp(tree_mod)
table_sim <- cbind(correlation_to_y, unique_values, var_imp)
names(table_sim)[3] <- "importance"
table_sim

## correlation_to_y unique_values importance
## x1_low 0.6516570 3 0.5592728
## x2_mid 0.5068903 71 0.7014998
## x3_high 0.4670269 213 0.9166740

Even though x1_low has the strongest correlation with the response, the tree assigns the highest importance to the most granular predictor (x3_high) and the lowest to the least granular one (x1_low), illustrating the selection bias of trees toward predictors with many distinct values.
In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
Figure 8.24
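A sketch of how the two extreme fits in Fig. 8.24 could be reproduced on the solubility data (assuming data(solubility) from the AppliedPredictiveModeling package; parameter values other than bag.fraction and shrinkage are illustrative):

data(solubility)

# Two extreme stochastic gradient boosting fits (cf. Fig. 8.24)
gbm_low <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                   bag.fraction = 0.1, shrinkage = 0.1,
                   n.trees = 100, interaction.depth = 1, verbose = FALSE)
gbm_high <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                    bag.fraction = 0.9, shrinkage = 0.9,
                    n.trees = 100, interaction.depth = 1, verbose = FALSE)

summary(gbm_low, plotit = FALSE)   # importance spread across many predictors
summary(gbm_high, plotit = FALSE)  # importance concentrated in the first few predictors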
(a) Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?
The model on the right uses a bagging fraction of 0.9, so each tree is built on nearly the same large sample of the training data, and a learning rate of 0.9, so each tree contributes heavily to the fit. With little randomness and such large steps, the boosting run quickly commits to the few strongest predictors, concentrating importance on them. The model on the left, with both parameters at 0.1, injects far more tree-to-tree randomness and adds each tree's contribution slowly, so many more predictors enter the ensemble and importance is spread across them.
(b) Which model do you think would be more predictive of other samples?
The left model, with the lower values of the two tuning parameters, is less prone to overfitting the training data and should exhibit smaller variance on new samples, so it would likely be more predictive of other samples.
(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
Increasing the interaction depth allows more predictors to enter each tree, which spreads importance across additional predictors; the top predictors' importance decreases relative to the rest, flattening the slope of the predictor-importance curve for both models.
Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
(a) Which tree-based regression model gives the optimal resampling and test set performance?
imputation <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
predict_imputer <- predict(imputation, ChemicalManufacturingProcess)
near_zero <- nearZeroVar(predict_imputer)
part_a <- predict_imputer[,-near_zero]
training_rows <- createDataPartition(part_a$Yield, p = .80, list= FALSE)
train <- part_a[training_rows, ]
trainx <- train[,-1]
trainy <- train$Yield
test <- part_a[-training_rows, ]
testx <- test[,-1]
testy <- test$Yield

set.seed(999)
randomforest_grid <- expand.grid(mtry=seq(7,49,by=3))
randomforest_model <- train(trainx, trainy, method = "rf",
tuneGrid = randomforest_grid)
rf_imp <- varImp(randomforest_model, scale = FALSE)
rf_imp <- rf_imp$importance %>% rownames_to_column("var")
rf_imp <- rf_imp[order(-rf_imp$Overall), , drop=FALSE] %>% remove_rownames()
plot(randomforest_model)

boosted_trees3 <- simulated[, 1:12]
gbm_model3 <- gbm(y ~ ., data = boosted_trees3, distribution = "gaussian")
summary(gbm_model3)

## var rel.inf
## V4 V4 30.7806455
## V1 V1 26.7938082
## V2 V2 22.3508758
## V5 V5 10.4520161
## V3 V3 8.6315597
## duplicate1 duplicate1 0.5731925
## V6 V6 0.4179020
## V7 V7 0.0000000
## V8 V8 0.0000000
## V9 V9 0.0000000
## V10 V10 0.0000000
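To answer part (a) fully, other tree-based models would need to be tuned on the same chemical manufacturing split; a sketch of a boosted-tree fit and a test-set comparison against the random forest (hypothetical grid, not run here) is:

# Sketch: tune stochastic gradient boosting on the chemical training data
gbm_grid <- expand.grid(n.trees = c(100, 500, 1000),
                        interaction.depth = c(1, 3, 5),
                        shrinkage = c(0.01, 0.1),
                        n.minobsinnode = 10)
gbm_chem <- train(trainx, trainy,
                  method = "gbm",
                  tuneGrid = gbm_grid,
                  trControl = trainControl(method = "cv"),
                  verbose = FALSE)

# Test-set comparison of the tree-based models
postResample(predict(randomforest_model, testx), testy)
postResample(predict(gbm_chem, testx), testy)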
(b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
(c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
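For part (c), a single regression tree could be fit on the training data and plotted so the terminal-node yields are visible; a minimal sketch using rpart and base graphics is:

# Sketch: fit and plot a single regression tree for Yield
single_tree <- rpart(Yield ~ ., data = train)
plot(single_tree, margin = 0.1)
text(single_tree, use.n = TRUE, cex = 0.7)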
Imagine 10000 receipts sitting on your table. Each receipt represents a transaction with items that were purchased. The receipt is a representation of stuff that went into a customer’s basket – and therefore ‘Market Basket Analysis’.
That is exactly what the Groceries Data Set contains: a collection of receipts with each line representing 1 receipt and the items purchased. Each line is called a transaction and each column in a row represents an item.
Your assignment is to use R to mine the data for association rules. You should report support, confidence, and lift, and your top 10 rules by lift. Turn in as you would the other problems from HA and KJ. Due 07/18/20 with the packaged set, HW #2.
First, we will load the data.
groceryds <- read.transactions("https://raw.githubusercontent.com/aaronzalkisps/data624/master/GroceryDataSet.csv", sep=",")
itemFrequencyPlot(groceryds, topN=10, type="absolute", main="Top 10 Items")

We can see that whole milk is the most common item. Next, we will use the Apriori algorithm and display the top 10 rules with their support, confidence and lift.
apriori(groceryds, parameter=list(supp=0.001, conf=0.5) , control=list(verbose=FALSE)) %>%
DATAFRAME() %>%
arrange(desc(lift)) %>%
top_n(10) %>%
kable() %>%
kable_styling()

## Selecting by count
| LHS | RHS | support | confidence | coverage | lift | count |
|---|---|---|---|---|---|---|
| {root vegetables,tropical fruit} | {other vegetables} | 0.0123030 | 0.5845411 | 0.0210473 | 3.020999 | 121 |
| {rolls/buns,root vegetables} | {other vegetables} | 0.0122013 | 0.5020921 | 0.0243010 | 2.594890 | 120 |
| {root vegetables,yogurt} | {other vegetables} | 0.0129131 | 0.5000000 | 0.0258261 | 2.584078 | 127 |
| {root vegetables,yogurt} | {whole milk} | 0.0145399 | 0.5629921 | 0.0258261 | 2.203354 | 143 |
| {domestic eggs,other vegetables} | {whole milk} | 0.0123030 | 0.5525114 | 0.0222674 | 2.162336 | 121 |
| {rolls/buns,root vegetables} | {whole milk} | 0.0127097 | 0.5230126 | 0.0243010 | 2.046888 | 125 |
| {other vegetables,pip fruit} | {whole milk} | 0.0135231 | 0.5175097 | 0.0261312 | 2.025351 | 133 |
| {tropical fruit,yogurt} | {whole milk} | 0.0151500 | 0.5173611 | 0.0292832 | 2.024770 | 149 |
| {other vegetables,yogurt} | {whole milk} | 0.0222674 | 0.5128806 | 0.0434164 | 2.007235 | 219 |
| {other vegetables,whipped/sour cream} | {whole milk} | 0.0146416 | 0.5070423 | 0.0288765 | 1.984385 | 144 |
Below is the summary of a second set of rules, mined with a higher minimum support (0.01) and a lower confidence threshold (0.2). There are 232 association rules, most of which involve 2 or 3 items.
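The summary below was presumably produced with a second apriori call using these thresholds, for example:

rules <- apriori(groceryds, parameter = list(supp = 0.01, conf = 0.2))
summary(rules)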
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.2 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 98
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.01s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [232 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 232 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3
## 1 151 80
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 2.341 3.000 3.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01007 Min. :0.2006 Min. :0.01729 Min. :0.8991
## 1st Qu.:0.01200 1st Qu.:0.2470 1st Qu.:0.03437 1st Qu.:1.4432
## Median :0.01490 Median :0.3170 Median :0.05241 Median :1.7277
## Mean :0.02005 Mean :0.3321 Mean :0.06708 Mean :1.7890
## 3rd Qu.:0.02227 3rd Qu.:0.4033 3rd Qu.:0.07565 3rd Qu.:2.0762
## Max. :0.25552 Max. :0.5862 Max. :1.00000 Max. :3.2950
## count
## Min. : 99.0
## 1st Qu.: 118.0
## Median : 146.5
## Mean : 197.2
## 3rd Qu.: 219.0
## Max. :2513.0
##
## mining info:
## data ntransactions support confidence
## groceryds 9835 0.01 0.2
We can see the rules with the highest lift below (the top 15 are shown). Root vegetables is the most common consequent item among them.
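A call such as the following (using the rules object mined above) would produce this listing:

inspect(head(sort(rules, by = "lift"), 15))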
## lhs rhs support
## [1] {citrus fruit,other vegetables} => {root vegetables} 0.01037112
## [2] {other vegetables,yogurt} => {whipped/sour cream} 0.01016777
## [3] {other vegetables,tropical fruit} => {root vegetables} 0.01230300
## [4] {beef} => {root vegetables} 0.01738688
## [5] {citrus fruit,root vegetables} => {other vegetables} 0.01037112
## [6] {root vegetables,tropical fruit} => {other vegetables} 0.01230300
## [7] {other vegetables,whole milk} => {root vegetables} 0.02318251
## [8] {curd,whole milk} => {yogurt} 0.01006609
## [9] {other vegetables,yogurt} => {root vegetables} 0.01291307
## [10] {other vegetables,yogurt} => {tropical fruit} 0.01230300
## [11] {other vegetables,root vegetables} => {citrus fruit} 0.01037112
## [12] {other vegetables,rolls/buns} => {root vegetables} 0.01220132
## [13] {tropical fruit,whole milk} => {root vegetables} 0.01199797
## [14] {rolls/buns,root vegetables} => {other vegetables} 0.01220132
## [15] {root vegetables,yogurt} => {other vegetables} 0.01291307
## confidence coverage lift count
## [1] 0.3591549 0.02887646 3.295045 102
## [2] 0.2341920 0.04341637 3.267062 100
## [3] 0.3427762 0.03589222 3.144780 121
## [4] 0.3313953 0.05246568 3.040367 171
## [5] 0.5862069 0.01769192 3.029608 102
## [6] 0.5845411 0.02104728 3.020999 121
## [7] 0.3097826 0.07483477 2.842082 228
## [8] 0.3852140 0.02613116 2.761356 99
## [9] 0.2974239 0.04341637 2.728698 127
## [10] 0.2833724 0.04341637 2.700550 121
## [11] 0.2188841 0.04738180 2.644626 102
## [12] 0.2863962 0.04260295 2.627525 120
## [13] 0.2836538 0.04229792 2.602365 118
## [14] 0.5020921 0.02430097 2.594890 120
## [15] 0.5000000 0.02582613 2.584078 127
We can see the rules with the highest confidence below (the top 15 are shown). Whole milk is the most common consequent item among them.
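Similarly, a listing sorted by confidence could be produced with:

inspect(head(sort(rules, by = "confidence"), 15))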
## lhs rhs support
## [1] {citrus fruit,root vegetables} => {other vegetables} 0.01037112
## [2] {root vegetables,tropical fruit} => {other vegetables} 0.01230300
## [3] {curd,yogurt} => {whole milk} 0.01006609
## [4] {butter,other vegetables} => {whole milk} 0.01148958
## [5] {root vegetables,tropical fruit} => {whole milk} 0.01199797
## [6] {root vegetables,yogurt} => {whole milk} 0.01453991
## [7] {domestic eggs,other vegetables} => {whole milk} 0.01230300
## [8] {whipped/sour cream,yogurt} => {whole milk} 0.01087951
## [9] {rolls/buns,root vegetables} => {whole milk} 0.01270971
## [10] {other vegetables,pip fruit} => {whole milk} 0.01352313
## [11] {tropical fruit,yogurt} => {whole milk} 0.01514997
## [12] {other vegetables,yogurt} => {whole milk} 0.02226741
## [13] {other vegetables,whipped/sour cream} => {whole milk} 0.01464159
## [14] {rolls/buns,root vegetables} => {other vegetables} 0.01220132
## [15] {root vegetables,yogurt} => {other vegetables} 0.01291307
## confidence coverage lift count
## [1] 0.5862069 0.01769192 3.029608 102
## [2] 0.5845411 0.02104728 3.020999 121
## [3] 0.5823529 0.01728521 2.279125 99
## [4] 0.5736041 0.02003050 2.244885 113
## [5] 0.5700483 0.02104728 2.230969 118
## [6] 0.5629921 0.02582613 2.203354 143
## [7] 0.5525114 0.02226741 2.162336 121
## [8] 0.5245098 0.02074225 2.052747 107
## [9] 0.5230126 0.02430097 2.046888 125
## [10] 0.5175097 0.02613116 2.025351 133
## [11] 0.5173611 0.02928317 2.024770 149
## [12] 0.5128806 0.04341637 2.007235 219
## [13] 0.5070423 0.02887646 1.984385 144
## [14] 0.5020921 0.02430097 2.594890 120
## [15] 0.5000000 0.02582613 2.584078 127
Below we will plot the rules:
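A typical arulesViz visualization of the strongest rules (one of several available plot methods; assuming the rules object from above) is:

# Plot the ten highest-lift rules as a graph
plot(head(sort(rules, by = "lift"), 10), method = "graph")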