Run SVM models
- Radial SVM: test-set RMSE of 1.17
- The residual plot for the radial model shows non-constant variance in the residuals; there even appears to be a positive correlation between the residuals and the predictions
- Linear SVM: test-set RMSE of 1.16, with residuals that look a little better than the radial model's
- Polynomial SVM: an acceptable residual plot, but a test-set RMSE of 1.23
## Train test split
library(caret)  # train(), trainControl(), varImp(), RMSE()
set.seed(123)
smp_size <- floor(0.75 * nrow(completedData))
train_ind <- sample(seq_len(nrow(completedData)), size = smp_size)
train <- completedData[train_ind, ]
test <- completedData[-train_ind, ]
train_y <- yield[train_ind]
test_y <- yield[-train_ind]
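For reference, caret's createDataPartition() gives a split stratified on the response; a minimal sketch of the equivalent call (the results below use the simple random split above):
## stratified 75/25 split on yield, an alternative to sample()
set.seed(123)
strat_ind <- createDataPartition(yield, p = 0.75, list = FALSE)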
## tune models
## Radial model
svmRTuned <- train(train, train_y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
## plot Radial model
predicted <- predict(svmRTuned, test)
actual <- test_y
axisRange <- extendrange(c(actual, predicted))
plot(actual, predicted, ylim = axisRange, xlim = axisRange)
abline(0, 1, col = "darkgrey", lty = 2)

plot(predicted, (predicted - actual), ylab = "residual")
abline(h = 0, col = "darkgrey", lty = 2)
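
The same observed-vs-predicted and residual plots are repeated for every model below; a small helper (hypothetical name plot_fit, not used in the original runs) captures the pattern:
## draw both diagnostic plots for any caret model
plot_fit <- function(model, newdata, actual) {
  predicted <- predict(model, newdata)
  axisRange <- extendrange(c(actual, predicted))
  ## observed vs. predicted with a 45-degree reference line
  plot(actual, predicted, ylim = axisRange, xlim = axisRange)
  abline(0, 1, col = "darkgrey", lty = 2)
  ## residuals vs. predicted with a zero reference line
  plot(predicted, predicted - actual, ylab = "residual")
  abline(h = 0, col = "darkgrey", lty = 2)
}
## e.g. plot_fit(svmRTuned, test, test_y)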

## Linear model
svmRTuned2 <- train(train, train_y,
                    method = "svmLinear",
                    preProc = c("center", "scale"),
                    tuneLength = 14,
                    trControl = trainControl(method = "cv"))
## plot lin model
predicted <- predict(svmRTuned2, test)
actual <- test_y
axisRange <- extendrange(c(actual, predicted))
plot(actual, predicted, ylim = axisRange, xlim = axisRange)
abline(0, 1, col = "darkgrey", lty = 2)

plot(predicted, (predicted - actual), ylab = "residual")
abline(h = 0, col = "darkgrey", lty = 2)

## poly svm model
svmRTuned3 <- train(train, train_y,
                    method = "svmPoly",
                    preProc = c("center", "scale"),
                    tuneLength = 3,
                    trControl = trainControl(method = "cv"))
## plot poly model
predicted <- predict(svmRTuned3, test)
actual <- test_y
axisRange <- extendrange(c(actual, predicted))
plot(actual, predicted, ylim = axisRange, xlim = axisRange)
abline(0, 1, col = "darkgrey", lty = 2)

plot(predicted, (predicted - actual), ylab = "residual")
abline(h = 0, col = "darkgrey", lty = 2)

## get test-set RMSE for each model
predictions1 <- predict(svmRTuned, test)
radial_rmse <- RMSE(predictions1, test_y)
predictions2 <- predict(svmRTuned2, test)
linear_rmse <- RMSE(predictions2, test_y)
predictions3 <- predict(svmRTuned3, test)
poly_rmse <- RMSE(predictions3, test_y)
cbind(radial_rmse, linear_rmse, poly_rmse)
## radial_rmse linear_rmse poly_rmse
## [1,] 1.17302 1.165471 1.233217
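caret's RMSE() is just the root mean squared error; a one-line equivalent for reference:
## RMSE by hand, equivalent to caret::RMSE(pred, obs)
rmse_manual <- function(pred, obs) sqrt(mean((pred - obs)^2))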
## final models
svmRTuned$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 2
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0221305795559405
##
## Number of Support Vectors : 122
##
## Objective Function Value : -66.0452
## Training error : 0.102862
svmRTuned2$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 125
##
## Objective Function Value : -44.7484
## Training error : 0.336602
svmRTuned3$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 0.25
##
## Polynomial kernel function.
## Hyperparameters : degree = 3 scale = 0.01 offset = 1
##
## Number of Support Vectors : 117
##
## Objective Function Value : -14.6279
## Training error : 0.357631
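The sigma = 0.0221 reported for the radial model was not tuned: caret fixes it at an estimate from kernlab's sigest() and tunes only the cost C. The estimate can be reproduced directly (a sketch):
library(kernlab)
## sigest() scales internally and returns the 0.1, 0.5 and 0.9 quantile estimates of sigma
sigest(as.matrix(train))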
varImp(svmRTuned)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 36)
##
## Overall
## BiologicalMaterial03 100.00
## ManufacturingProcess17 93.51
## ManufacturingProcess36 86.62
## ManufacturingProcess33 66.87
## BiologicalMaterial11 66.25
## ManufacturingProcess06 63.65
## BiologicalMaterial09 50.78
## ManufacturingProcess11 45.97
## ManufacturingProcess30 39.92
## ManufacturingProcess12 36.89
## ManufacturingProcess28 31.69
## ManufacturingProcess01 26.22
## ManufacturingProcess27 26.04
## BiologicalMaterial10 23.40
## BiologicalMaterial05 22.94
## ManufacturingProcess16 21.21
## ManufacturingProcess35 19.28
## ManufacturingProcess04 18.70
## ManufacturingProcess20 18.02
## ManufacturingProcess02 16.06
(The varImp output for the linear and polynomial SVMs is omitted: the loess r-squared importance is computed from the predictors and response alone, so it is identical for all three models.)
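That model-independence can be verified directly: caret falls back to filterVarImp() with a loess fit when a model has no built-in importance measure. A sketch:
## model-free importance: R-squared of a loess fit of yield on each predictor
loess_imp <- filterVarImp(x = train, y = train_y, nonpara = TRUE)
head(loess_imp[order(-loess_imp$Overall), , drop = FALSE])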
MARS
- RMSE on the test set is 1.71
- The MARS model fits poorly: only 2 of 45 terms survive pruning and GRSq is only about 0.26 (see the summary call sketched below)
set.seed(123)
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:32)
marsTuned <- train(train, train_y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marsTuned$finalModel
## Selected 2 of 45 terms, and 2 of 36 predictors
## Termination condition: RSq changed by less than 0.001 at 45 terms
## Importance: ManufacturingProcess17, ManufacturingProcess28, ...
## Number of terms at each degree of interaction: 1 0 1
## GCV 2.548561 RSS 318.8066 GRSq 0.2586737 RSq 0.2866986
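To see the actual hinge functions and coefficients of the selected two-term model, summary() on the earth object prints the full equation (output not shown here):
## print hinge terms and coefficients of the pruned MARS model
summary(marsTuned$finalModel)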
mars_pred <- predict(marsTuned, test)
RMSE(test_y, mars_pred)
## [1] 1.714526
predicted <- predict(marsTuned, test)
actual <- test_y
axisRange <- extendrange(c(actual, predicted))
plot(actual, predicted, ylim = axisRange, xlim = axisRange)
abline(0, 1, col = "darkgrey", lty = 2)

plot(predicted, (predicted - actual), ylab = "residual")
abline(h = 0, col = "darkgrey", lty = 2)

varImp(marsTuned)
## earth variable importance
##
## only 20 most important variables shown (out of 36)
##
## Overall
## ManufacturingProcess28 100
## ManufacturingProcess17 100
## ManufacturingProcess21 0
## ManufacturingProcess24 0
## ManufacturingProcess05 0
## BiologicalMaterial03 0
## ManufacturingProcess37 0
## BiologicalMaterial09 0
## ManufacturingProcess12 0
## ManufacturingProcess35 0
## ManufacturingProcess19 0
## ManufacturingProcess34 0
## ManufacturingProcess22 0
## ManufacturingProcess08 0
## ManufacturingProcess01 0
## BiologicalMaterial11 0
## ManufacturingProcess20 0
## ManufacturingProcess11 0
## ManufacturingProcess03 0
## ManufacturingProcess38 0
KNN
- RMSE on the test set is 1.51
knnModel <- train(x = train,
                  y = train_y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
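No trControl is given here, so caret defaults to 25 bootstrap resamples (visible in the output below). A sketch of the same fit with 10-fold CV to match the other models (hypothetical object name knnModelCV):
knnModelCV <- train(x = train, y = train_y,
                    method = "knn",
                    preProc = c("center", "scale"),
                    tuneLength = 10,
                    trControl = trainControl(method = "cv"))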
knnModel
## k-Nearest Neighbors
##
## 132 samples
## 36 predictor
##
## Pre-processing: centered (36), scaled (36)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.472477 0.3282891 1.152186
## 7 1.463268 0.3209371 1.160692
## 9 1.459681 0.3179446 1.157638
## 11 1.461165 0.3151395 1.158253
## 13 1.463401 0.3112594 1.159268
## 15 1.463532 0.3076825 1.158154
## 17 1.460605 0.3115393 1.155990
## 19 1.457138 0.3182375 1.152340
## 21 1.457895 0.3214083 1.147691
## 23 1.465486 0.3178753 1.153840
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 19.
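The RMSE profile across k is easier to read as a plot; caret's plot method on the train object draws it directly:
## RMSE vs. number of neighbors for the tuned KNN model
plot(knnModel)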
(The varImp output for KNN is the same model-free loess r-squared ranking shown above, so it is omitted here.)
knn_pred <- predict(knnModel, test)
RMSE(test_y, knn_pred)
## [1] 1.517846
predicted <- predict(knnModel, test)
actual <- test_y
axisRange <- extendrange(c(actual, predicted))
plot(actual, predicted, ylim = axisRange, xlim = axisRange)
abline(0, 1, col = "darkgrey", lty = 2)

plot(predicted, (predicted - actual), ylab = "residual")
abline(h = 0, col = "darkgrey", lty = 2)

caret ensemble on entire dataframe
- xgbTree gives the best cross-validated results, with an RMSE of about 1.07; the plain lm blows up (CV RMSE of 12.9) because the fit is rank-deficient
library(caretEnsemble)  # caretList(), caretEnsemble()
library(doParallel)     # registerDoParallel(), getDoParWorkers()
completedData <- read.csv("imputed.csv")
complete_df_imputed <- completedData
complete_df_imputed <- complete_df_imputed[, -1]  # drop the first (row-index) column
set.seed(123)
smp_size <- floor(0.75 * nrow(complete_df_imputed))
train_ind <- sample(seq_len(nrow(complete_df_imputed)), size = smp_size)
train <- complete_df_imputed[train_ind, ]
test <- complete_df_imputed[-train_ind, ]
train_y <- yield[train_ind]
test_y <- yield[-train_ind]
registerDoParallel(4)
getDoParWorkers()
## [1] 4
set.seed(123)
my_control <- trainControl(method = 'cv',        # cross-validation
                           number = 5,           # number of folds
                           savePredictions = 'final',
                           allowParallel = TRUE)
model_list <- caretList(train,
                        train_y,
                        trControl = my_control,
                        methodList = c('lm', 'svmRadial', 'rf', 'pls',
                                       'xgbTree', 'xgbLinear', 'bagEarth', 'glmnet', 'knn'),
                        tuneList = NULL,
                        continue_on_fail = FALSE,
                        preProcess = c('center', 'scale'))
## Warning in trControlCheck(x = trControl, y = target): indexes not defined
## in trControl. Attempting to set them ourselves, so each model in the
## ensemble will have the same resampling indexes.
options(digits = 3)
model_results <- data.frame(LM = min(model_list$lm$results$RMSE),
                            SVM = min(model_list$svmRadial$results$RMSE),
                            RF = min(model_list$rf$results$RMSE),
                            XGBT = min(model_list$xgbTree$results$RMSE),
                            XGBL = min(model_list$xgbLinear$results$RMSE),
                            Mars = min(model_list$bagEarth$results$RMSE),
                            GLMNET = min(model_list$glmnet$results$RMSE),
                            PLS = min(model_list$pls$results$RMSE),
                            KNN = min(model_list$knn$results$RMSE))
print(model_results)
## LM SVM RF XGBT XGBL Mars GLMNET PLS KNN
## 1 12.9 1.32 1.17 1.07 1.15 1.39 1.31 1.58 1.37
options(digits = 3)
LM = predict(model_list$lm, test)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
RMSE(test_y, LM)
## [1] 1.52
svmRadial = predict(model_list$svmRadial, test)
RMSE(test_y, svmRadial)
## [1] 1.14
rf = predict(model_list$rf, test)
RMSE(test_y, rf)
## [1] 1.23
model_resamples <- resamples(model_list)  # avoid shadowing the resamples() function
dotplot(model_resamples, metric = 'RMSE')

ensemble_1 <- caretEnsemble(model_list,
                            metric = 'RMSE',
                            trControl = my_control)
summary(ensemble_1)
## The following models were ensembled: lm, svmRadial, rf, pls, xgbTree, xgbLinear, bagEarth, glmnet, knn
## They were weighted:
## -8.925 0.0044 0.0684 0.2258 -0.2042 0.5689 0.0924 -0.0561 0.3611 0.162
## The resulting RMSE is: 1.2986
## The fit for each individual model on the RMSE is:
## method RMSE RMSESD
## lm 12.91 22.828
## svmRadial 1.32 0.128
## rf 1.17 0.232
## pls 1.58 0.293
## xgbTree 1.07 0.175
## xgbLinear 1.15 0.255
## bagEarth 1.39 0.160
## glmnet 1.31 0.224
## knn 1.37 0.187
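caretEnsemble fits a linear combination of the base models (the weights above). caretStack() from the same package allows a richer meta-learner, and the doParallel workers registered earlier can be released once training is done. A sketch of both (hypothetical object name stack_glm, not part of the original run):
## stack the base models with a glm meta-learner instead of the linear blend
stack_glm <- caretStack(model_list, method = "glm",
                        metric = "RMSE", trControl = my_control)
## release the four parallel workers registered earlier
stopImplicitCluster()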
C
- Just like in our previous homework, the biological materials all act as some sort of accelerant: every one of them is positively correlated with yield
- For the manufacturing processes, I added ManufacturingProcess31, 01, and 37 because our best model (xgbTree) identified them as important
- Strangely enough, those added variables show the least correlation with yield
- Otherwise, just as in our linear model, certain manufacturing processes can hinder or help our yield
- ManufacturingProcess32 looks very helpful, which my GLS model on the dataset with all predictors also showed
- ManufacturingProcess13 looks very detrimental to yield, which the same model from last week's linear homework also identified
corr_df <- as.data.frame(cbind(complete_df_imputed, yield))
## correlation of the selected biological materials with yield (column 58)
corrplot::corrplot(cor(corr_df[, 1:58])[c(3, 12, 10, 5, 9), 58, drop = FALSE], cl.pos = 'n')

## correlation of the selected manufacturing processes with yield
corrplot::corrplot(cor(corr_df[, 1:58])[c(25, 18, 29, 45, 21, 43, 13, 49, 44), 58, drop = FALSE], cl.pos = 'n')
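To rank every predictor's correlation with yield at once instead of hand-picking columns, something like the following works (a sketch; object names are illustrative):
## all predictor-yield correlations, sorted by absolute magnitude
cors <- cor(corr_df[, 1:57], corr_df$yield)
head(cors[order(-abs(cors)), , drop = FALSE], 10)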
