suppressMessages(suppressWarnings(library(fpp2)))
suppressMessages(suppressWarnings(library(readxl)))
suppressMessages(suppressWarnings(library(seasonal)))
suppressMessages(suppressWarnings(library(rdatamarket)))
suppressMessages(suppressWarnings(library(tseries)))
suppressMessages(suppressWarnings(library(AppliedPredictiveModeling)))
suppressMessages(suppressWarnings(library(fma)))
suppressMessages(suppressWarnings(library(corrplot)))
suppressMessages(suppressWarnings(library(caret)))
suppressMessages(suppressWarnings(library(pls)))
suppressMessages(suppressWarnings(library(glmnet)))
suppressMessages(suppressWarnings(library(missForest)))
suppressMessages(suppressWarnings(library(mlbench)))
suppressMessages(suppressWarnings(library(nnet)))
suppressMessages(suppressWarnings(library(earth)))
suppressMessages(suppressWarnings(library(kernlab)))
suppressMessages(suppressWarnings(library(MASS)))
suppressMessages(suppressWarnings(library(forecast)))
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(Metrics)))
suppressMessages(suppressWarnings(library(Amelia)))
suppressMessages(suppressWarnings(library(doParallel)))
suppressMessages(suppressWarnings(library(cwhmisc)))
\[y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\]
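As a cross-check on the formula above, here is a minimal hand-rolled sketch of the same simulation; mlbench.friedman1() below follows this recipe, with ten uniform [0, 1] predictors of which only the first five enter the response.
# Hand-rolled Friedman (1991) simulator (sketch; mirrors mlbench.friedman1)
friedman1_manual <- function(n, sd = 1) {
  x <- matrix(runif(n * 10), ncol = 10)          # 10 uniform predictors on [0, 1]
  y <- 10 * sin(pi * x[, 1] * x[, 2]) +          # interaction term
       20 * (x[, 3] - 0.5)^2 +                   # quadratic term
       10 * x[, 4] + 5 * x[, 5] +                # linear terms
       rnorm(n, sd = sd)                         # Gaussian noise
  list(x = data.frame(x), y = y)
}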
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
#featurePlot(trainingData$x, trainingData$y)
## or other methods.
## mlbench.friedman1() returns a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
knnModel <- train(x = trainingData$x, y = trainingData$y, method = "knn", preProc = c("center", "scale"), tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.565620 0.4887976 2.886629
## 7 3.422420 0.5300524 2.752964
## 9 3.368072 0.5536927 2.715310
## 11 3.323010 0.5779056 2.669375
## 13 3.275835 0.6030846 2.628663
## 15 3.261864 0.6163510 2.621192
## 17 3.261973 0.6267032 2.616956
## 19 3.286299 0.6281075 2.640585
## 21 3.280950 0.6390386 2.643807
## 23 3.292397 0.6440392 2.656080
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
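The resampled RMSE profile over k can also be inspected visually straight from the train object; an optional sketch:
# Optional: visualize resampled RMSE as a function of k
plot(knnModel)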
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.1750657 0.6785946 2.5443169
set.seed(123)
mars_grid = expand.grid(degree = 1:2, nprune = seq(2, 14, by = 2))
marsTune = train(x = trainingData$x, y = trainingData$y, method = "earth", tuneGrid = mars_grid, trControl = trainControl(method = "cv"))
marsTune
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.456357 0.2147829 3.763412
## 1 4 2.600345 0.7422959 2.090504
## 1 6 2.203236 0.8204569 1.775677
## 1 8 1.729425 0.8852780 1.370689
## 1 10 1.683149 0.8887776 1.328496
## 1 12 1.634706 0.8956391 1.283077
## 1 14 1.645803 0.8934351 1.296689
## 2 2 4.456357 0.2147829 3.763412
## 2 4 2.600345 0.7422959 2.090504
## 2 6 2.237171 0.8118249 1.753283
## 2 8 1.652901 0.8930780 1.264622
## 2 10 1.447568 0.9203813 1.142897
## 2 12 1.293865 0.9377882 1.013377
## 2 14 1.251764 0.9421335 1.003794
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
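To see which hinge functions the winning MARS model actually kept, the final earth fit can be summarized directly; a sketch (the exact terms retained depend on the resampling seed):
# Optional: inspect the hinge functions retained by the final MARS model
summary(marsTune$finalModel)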
varImp(marsTune)
## earth variable importance
##
## Overall
## X1 100.00
## X4 85.13
## X2 69.22
## X5 49.28
## X3 39.95
## X8 0.00
## X10 0.00
## X7 0.00
## X6 0.00
## X9 0.00
mars_pred = predict(marsTune, testData$x)
postResample(pred = mars_pred, obs = testData$y)
## RMSE Rsquared MAE
## 1.1722635 0.9448890 0.9324923
set.seed(124)
svmTune = train(x = trainingData$x, y = trainingData$y, method = "svmRadial", tuneLength = 14, trControl = trainControl(method = "cv"))
svmTune$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 16
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0646229473798898
##
## Number of Support Vectors : 151
##
## Objective Function Value : -70.893
## Training error : 0.00852
varImp(svmTune)
## loess r-squared variable importance
##
## Overall
## X4 100.0000
## X1 95.5047
## X2 89.6186
## X5 45.2170
## X3 29.9330
## X9 6.3299
## X10 5.5182
## X8 3.2527
## X6 0.8884
## X7 0.0000
svmPred <- predict(svmTune, newdata = testData$x)
postResample(pred = svmPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.0776847 0.8250301 1.5783076
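Collecting the three test-set results in one table makes the comparison explicit; a small sketch using the predictions already computed above:
# Side-by-side test-set metrics for the three Friedman1 models
rbind(KNN  = postResample(pred = knnPred,   obs = testData$y),
      MARS = postResample(pred = mars_pred, obs = testData$y),
      SVM  = postResample(pred = svmPred,   obs = testData$y))
MARS gives the lowest test RMSE and the highest R-squared of the three models on this simulated data.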
data(ChemicalManufacturingProcess)
df = ChemicalManufacturingProcess
#summary(df)
df_imp1 = missForest(df)
## missForest iteration 1 in progress...done!
## missForest iteration 2 in progress...done!
## missForest iteration 3 in progress...done!
## missForest iteration 4 in progress...done!
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry =
## mtry, : The response has five or fewer unique values. Are you sure you want
## to do regression? (warning repeated several times per iteration)
df_imp = df_imp1$ximp
# Yield (column 1) is the target; columns 2:58 are the predictors
data = df_imp[, 2:58]
target = df_imp[, 1]
# 75/25 train/test split
training = createDataPartition(target, p = 0.75)
predictor_training = data[training$Resample1, ]
target_training = target[training$Resample1]
predictor_testing = data[-training$Resample1, ]
target_testing = target[-training$Resample1]
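A couple of quick sanity checks on the imputation and the split; a sketch (with 176 rows in total, the 75% partition gives roughly 132 training and 44 test samples):
# Sanity checks: no missing values should remain after imputation
sum(is.na(df_imp))
dim(predictor_training)
dim(predictor_testing)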
knnModel1 <- train(x = predictor_training, y = target_training, method = "knn", preProc = c("center", "scale"), tuneLength = 10)
knnModel1
## k-Nearest Neighbors
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.453045 0.3846283 1.150241
## 7 1.428844 0.4034515 1.147851
## 9 1.425711 0.4108613 1.147274
## 11 1.403578 0.4325944 1.130706
## 13 1.409387 0.4289528 1.137463
## 15 1.400754 0.4404230 1.136332
## 17 1.403127 0.4411012 1.145331
## 19 1.414829 0.4349747 1.156591
## 21 1.421582 0.4343973 1.164287
## 23 1.423165 0.4387631 1.167686
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred1 <- predict(knnModel1, newdata = predictor_testing)
postResample(pred = knnPred1, obs = target_testing)
## RMSE Rsquared MAE
## 1.4284096 0.4920472 1.0870170
mars1 = expand.grid(degree = 1:2, nprune = seq(2, 14, by = 2))
mars2 = train(x = predictor_training, y = target_training, method = "earth", tuneGrid = mars1, trControl = trainControl(method = "cv"))
mars2
## Multivariate Adaptive Regression Spline
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 118, 118, 118, 120, 119, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.364681 0.4585905 1.0829231
## 1 4 1.158041 0.6160310 0.9236491
## 1 6 1.179410 0.6167058 0.9184239
## 1 8 1.156381 0.6228721 0.9230851
## 1 10 1.116534 0.6632506 0.8977107
## 1 12 1.180115 0.6426455 0.9332636
## 1 14 1.158161 0.6490796 0.9310004
## 2 2 1.378750 0.4476384 1.0889202
## 2 4 1.245372 0.5489994 1.0113678
## 2 6 1.250174 0.5475575 1.0159313
## 2 8 1.259254 0.5487133 0.9912936
## 2 10 1.300642 0.5315020 1.0289910
## 2 12 1.413606 0.5430414 1.0841638
## 2 14 1.453732 0.5376685 1.1181972
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.
marspred = predict(mars2, predictor_testing)
postResample(pred = marspred, obs = target_testing)
## RMSE Rsquared MAE
## 1.2696435 0.5787261 1.0359571
svm1 = train(x = predictor_training, y = target_training, method = "svmRadial", tuneLength = 14, trControl = trainControl(method = "cv"))
svm1
## Support Vector Machines with Radial Basis Function Kernel
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 119, 120, 119, 119, 118, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.394172 0.5192581 1.1378896
## 0.50 1.303427 0.5364552 1.0639061
## 1.00 1.255315 0.5511798 1.0128133
## 2.00 1.240910 0.5612984 0.9909936
## 4.00 1.249447 0.5606367 0.9851349
## 8.00 1.262441 0.5513856 0.9972423
## 16.00 1.253929 0.5565101 0.9895903
## 32.00 1.253929 0.5565101 0.9895903
## 64.00 1.253929 0.5565101 0.9895903
## 128.00 1.253929 0.5565101 0.9895903
## 256.00 1.253929 0.5565101 0.9895903
## 512.00 1.253929 0.5565101 0.9895903
## 1024.00 1.253929 0.5565101 0.9895903
## 2048.00 1.253929 0.5565101 0.9895903
##
## Tuning parameter 'sigma' was held constant at a value of 0.01267093
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01267093 and C = 2.
svmPred <- predict(svm1, newdata = predictor_testing)
postResample(pred = svmPred, obs = target_testing)
## RMSE Rsquared MAE
## 1.0739452 0.7611615 0.8975338
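As before, the chemical-process test-set metrics can be gathered into one table for comparison; a sketch using the predictions computed above:
# Side-by-side test-set metrics for the chemical manufacturing models
rbind(KNN  = postResample(pred = knnPred1, obs = target_testing),
      MARS = postResample(pred = marspred, obs = target_testing),
      SVM  = postResample(pred = svmPred,  obs = target_testing))
The radial SVM gives the lowest test RMSE and the highest R-squared of the three models here.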
v_imp <- varImp(svm1)
plot(v_imp)
v_imp
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 91.24
## BiologicalMaterial03 82.81
## ManufacturingProcess13 81.60
## ManufacturingProcess36 74.52
## BiologicalMaterial02 72.85
## ManufacturingProcess33 66.93
## BiologicalMaterial12 64.92
## ManufacturingProcess17 63.51
## BiologicalMaterial04 60.09
## ManufacturingProcess31 56.14
## ManufacturingProcess09 53.86
## BiologicalMaterial01 53.63
## ManufacturingProcess29 44.00
## ManufacturingProcess02 43.34
## BiologicalMaterial11 38.87
## ManufacturingProcess20 37.71
## BiologicalMaterial08 37.09
## ManufacturingProcess06 32.90
## ManufacturingProcess27 30.15
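To quantify how the importance splits between process and biological variables, the ranked list can be tabulated; a sketch using the v_imp object computed above:
# Count process vs. biological predictors among the top 10 by importance
imp_df <- v_imp$importance
imp_df$Variable <- rownames(imp_df)
top10 <- head(imp_df[order(-imp_df$Overall), ], 10)
table(ifelse(grepl("^Manufacturing", top10$Variable), "Process", "Biological"))
From the printed list, the top ten split evenly between the two groups, with ManufacturingProcess32 clearly on top.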
plot(data$ManufacturingProcess32, target)
abline(lm(target ~ data$ManufacturingProcess32), col = "red", lwd = 1.5)
plot(data$BiologicalMaterial06, target)
abline(lm(target ~ data$BiologicalMaterial06), col = "red", lwd = 1.5)
Both predictors have a positive correlation with yield. Let's check the exact correlation values:
cor(data$ManufacturingProcess32, target)
## [1] 0.6083321
cor(data$BiologicalMaterial06, target)
## [1] 0.4781634