Hyperparameter Tuning

I’ll take an example from Kuhn’s caret documentation on GitHub. It uses a support vector machine with a radial kernel to distinguish between mines and rocks using sonar data. The particulars of the example are unimportant. What I’m interested in is the significance of the way that hyperparameter tuning is specified. In this example, there are two hyperparameters: C and sigma.

It is always a good idea to look at the dataframe of results which is one of the elements in the list produced by train.

library(mlbench)
library(kernlab)
data(Sonar)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:kernlab':
## 
##     alpha
library(tictoc)
# Split Sonar into training and testing sets: createDataPartition() picks
# 75% of the rows with respect to Sonar$Class; the remaining rows are held out.
set.seed(998)
inTraining <- createDataPartition(Sonar[["Class"]], p = 0.75, list = FALSE)
training <- Sonar[inTraining, ]
testing <- Sonar[-inTraining, ]


# Resampling scheme for the SVM fits: 10-fold cross-validation repeated
# 10 times, with class probabilities requested.
svmControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  classProbs = TRUE
)

# Time a radial-kernel SVM fit that relies on train()'s default tuning grid
# (neither tuneLength nor tuneGrid is supplied).
tic()
set.seed(825)
svmFit <- train(
  Class ~ .,
  data = training,
  method = "svmRadial",
  trControl = svmControl,
  preProc = c("center", "scale"),
  metric = "Accuracy"
)
toc()
## 9.762 sec elapsed
# Compact structure of the resampling summary for the default grid.
str(svmFit[["results"]])
## 'data.frame':    3 obs. of  6 variables:
##  $ sigma     : num  0.0118 0.0118 0.0118
##  $ C         : num  0.25 0.5 1
##  $ Accuracy  : num  0.737 0.773 0.787
##  $ Kappa     : num  0.471 0.543 0.571
##  $ AccuracySD: num  0.115 0.11 0.104
##  $ KappaSD   : num  0.231 0.221 0.208
# Full table: one row per (sigma, C) combination that was evaluated.
svmFit[["results"]]
##        sigma    C  Accuracy     Kappa AccuracySD   KappaSD
## 1 0.01181293 0.25 0.7365466 0.4710495  0.1149270 0.2307519
## 2 0.01181293 0.50 0.7732328 0.5426548  0.1098938 0.2214216
## 3 0.01181293 1.00 0.7865343 0.5712726  0.1036692 0.2084029

Looking at the dataframe of results, we can see that only one of the two hyperparameters is being varied in tuning.

Because we did not specify anything about tuning, train used one value of sigma and three values of C. The highest value of C performed best, which is concerning because even higher values of C might have performed better still. Let’s rerun this example with a tuneLength of 4.

# Re-fit with tuneLength = 4 so that train() evaluates one more value of C.
tic()
# Use the same seed as the default-grid fit. train() draws its resampling
# folds from the RNG, and caret's svmRadial grid estimates sigma with
# kernlab::sigest(), which samples the data. Without a fixed seed this run
# differs from the previous one in folds and in sigma, not only in the C
# values tried, so the two results tables would not be directly comparable.
# NOTE(review): output echoed after this chunk was produced without the seed;
# re-knit to refresh it.
set.seed(825)
svmFit4 <- train(Class ~ ., data = training,
                 method = "svmRadial",
                 trControl = svmControl,
                 preProc = c("center", "scale"),
                 metric = "Accuracy",
                 tuneLength = 4
                 )
toc()
## 11.952 sec elapsed
# Compact structure of the resampling summary for the tuneLength = 4 fit.
str(svmFit4[["results"]])
## 'data.frame':    4 obs. of  6 variables:
##  $ sigma     : num  0.0136 0.0136 0.0136 0.0136
##  $ C         : num  0.25 0.5 1 2
##  $ Accuracy  : num  0.737 0.769 0.793 0.812
##  $ Kappa     : num  0.471 0.535 0.583 0.621
##  $ AccuracySD: num  0.105 0.115 0.102 0.102
##  $ KappaSD   : num  0.213 0.233 0.206 0.206
# Full table: one row per (sigma, C) combination that was evaluated.
svmFit4[["results"]]
##        sigma    C  Accuracy     Kappa AccuracySD   KappaSD
## 1 0.01360214 0.25 0.7366740 0.4708012  0.1054291 0.2126107
## 2 0.01360214 0.50 0.7694020 0.5347933  0.1153425 0.2329351
## 3 0.01360214 1.00 0.7925343 0.5830024  0.1024355 0.2055742
## 4 0.01360214 2.00 0.8117279 0.6205051  0.1021609 0.2061736

Let’s do an explicit grid search and look at the results.

# Same repeated 10-fold CV scheme as before, now with the search strategy
# stated explicitly as a grid search.
svmControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  classProbs = TRUE,
  search = "grid"
)

# Candidate grid: three sigma values (half, equal to, and double 0.0133)
# crossed with ten C values that double from 0.25 up to 128.
myGrid <- expand.grid(
  sigma = c(0.0133 * 0.5, 0.0133, 0.0133 * 2),
  C = 0.25 * 2^(0:9)
)
set.seed(825)

# Time the fit over every row of myGrid.
tic()
svmFitGrid <- train(
  Class ~ .,
  data = training,
  method = "svmRadial",
  trControl = svmControl,
  preProc = c("center", "scale"),
  metric = "Accuracy",
  tuneGrid = myGrid
)
toc()
## 87.248 sec elapsed
# Hyperparameter combination selected by train().
svmFitGrid[["bestTune"]]
##     sigma  C
## 17 0.0133 16
# Compact structure of the resampling summary for the 3 x 10 grid.
str(svmFitGrid[["results"]])
## 'data.frame':    30 obs. of  6 variables:
##  $ sigma     : num  0.00665 0.00665 0.00665 0.00665 0.00665 0.00665 0.00665 0.00665 0.00665 0.00665 ...
##  $ C         : num  0.25 0.5 1 2 4 8 16 32 64 128 ...
##  $ Accuracy  : num  0.711 0.768 0.769 0.787 0.813 ...
##  $ Kappa     : num  0.423 0.532 0.536 0.573 0.626 ...
##  $ AccuracySD: num  0.1113 0.1048 0.1058 0.0947 0.0905 ...
##  $ KappaSD   : num  0.223 0.212 0.212 0.19 0.181 ...
# Full table: one row per (sigma, C) combination in the grid.
svmFitGrid[["results"]]
##      sigma      C  Accuracy     Kappa AccuracySD   KappaSD
## 1  0.00665   0.25 0.7109069 0.4230469 0.11133899 0.2232574
## 2  0.00665   0.50 0.7680245 0.5322562 0.10475012 0.2117049
## 3  0.00665   1.00 0.7689730 0.5359981 0.10583579 0.2122770
## 4  0.00665   2.00 0.7870074 0.5728198 0.09472244 0.1896162
## 5  0.00665   4.00 0.8134583 0.6256687 0.09051800 0.1810306
## 6  0.00665   8.00 0.8370809 0.6722456 0.08215304 0.1644265
## 7  0.00665  16.00 0.8409142 0.6791553 0.08183504 0.1643256
## 8  0.00665  32.00 0.8410392 0.6793486 0.08129754 0.1633458
## 9  0.00665  64.00 0.8403775 0.6777328 0.08009262 0.1617632
## 10 0.00665 128.00 0.8398309 0.6773044 0.08423262 0.1689661
## 11 0.01330   0.25 0.7432598 0.4836304 0.11180238 0.2257788
## 12 0.01330   0.50 0.7794461 0.5553285 0.10813333 0.2175859
## 13 0.01330   1.00 0.7897059 0.5779083 0.09907841 0.1985945
## 14 0.01330   2.00 0.8242230 0.6462055 0.09074958 0.1818238
## 15 0.01330   4.00 0.8404044 0.6785791 0.08195699 0.1643656
## 16 0.01330   8.00 0.8511324 0.6997508 0.07810380 0.1569551
## 17 0.01330  16.00 0.8541422 0.7056982 0.08144626 0.1636939
## 18 0.01330  32.00 0.8480025 0.6935477 0.07913595 0.1589081
## 19 0.01330  64.00 0.8531176 0.7035801 0.08008044 0.1610547
## 20 0.01330 128.00 0.8517157 0.7008115 0.08102466 0.1626427
## 21 0.02660   0.25 0.7551127 0.5098603 0.11684780 0.2349180
## 22 0.02660   0.50 0.7988873 0.5939222 0.09515513 0.1924806
## 23 0.02660   1.00 0.8232034 0.6426913 0.08795617 0.1772456
## 24 0.02660   2.00 0.8295515 0.6558810 0.08479592 0.1699307
## 25 0.02660   4.00 0.8468946 0.6905084 0.08735424 0.1756529
## 26 0.02660   8.00 0.8466544 0.6900878 0.08488057 0.1705708
## 27 0.02660  16.00 0.8391863 0.6755100 0.08304815 0.1665383
## 28 0.02660  32.00 0.8442598 0.6852795 0.08393297 0.1686732
## 29 0.02660  64.00 0.8379681 0.6723321 0.08646423 0.1740743
## 30 0.02660 128.00 0.8403113 0.6776502 0.08646401 0.1733022

What we might think we know now is that the best set of hyperparameters has sigma at or above .0266 and C somewhere between 1 and 4. This would be true with only one independent variable, but we have two.

That leads me to do the following grid search.

# Time a finer grid search: four sigma values from 0.0266 to 0.0276 crossed
# with eight C values from 1 to 4. The timer also covers building the grid.
tic()
myGrid <- expand.grid(
  sigma = seq(0.0266, 0.0276, length.out = 4),
  C = seq(1, 4, length.out = 8)
)
set.seed(825)

# Overwrites the svmFitGrid object from the coarser grid search.
svmFitGrid <- train(
  Class ~ .,
  data = training,
  method = "svmRadial",
  trControl = svmControl,
  preProc = c("center", "scale"),
  metric = "Accuracy",
  tuneGrid = myGrid
)
toc()
## 94.855 sec elapsed
# Hyperparameter combination selected by train() on the finer grid.
svmFitGrid[["bestTune"]]
##     sigma        C
## 31 0.0276 3.571429
# Compact structure of the resampling summary for the 4 x 8 grid.
str(svmFitGrid[["results"]])
## 'data.frame':    32 obs. of  6 variables:
##  $ sigma     : num  0.0266 0.0266 0.0266 0.0266 0.0266 ...
##  $ C         : num  1 1.43 1.86 2.29 2.71 ...
##  $ Accuracy  : num  0.826 0.831 0.833 0.838 0.84 ...
##  $ Kappa     : num  0.649 0.659 0.663 0.674 0.677 ...
##  $ AccuracySD: num  0.09 0.0876 0.0866 0.0882 0.0888 ...
##  $ KappaSD   : num  0.182 0.176 0.174 0.176 0.179 ...
# Full table: one row per (sigma, C) combination in the grid.
svmFitGrid[["results"]]
##         sigma        C  Accuracy     Kappa AccuracySD   KappaSD
## 1  0.02660000 1.000000 0.8263799 0.6490607 0.09003923 0.1816266
## 2  0.02660000 1.428571 0.8308799 0.6587729 0.08756742 0.1755309
## 3  0.02660000 1.857143 0.8334265 0.6633080 0.08661472 0.1743122
## 4  0.02660000 2.285714 0.8384363 0.6736964 0.08816600 0.1764460
## 5  0.02660000 2.714286 0.8403480 0.6772519 0.08876830 0.1785556
## 6  0.02660000 3.142857 0.8404314 0.6773341 0.08692482 0.1748227
## 7  0.02660000 3.571429 0.8367966 0.6704482 0.08653823 0.1736352
## 8  0.02660000 4.000000 0.8424314 0.6816488 0.08400518 0.1687550
## 9  0.02693333 1.000000 0.8208554 0.6381496 0.09220819 0.1850470
## 10 0.02693333 1.428571 0.8303750 0.6573057 0.08644883 0.1739429
## 11 0.02693333 1.857143 0.8316299 0.6597775 0.08308980 0.1670940
## 12 0.02693333 2.285714 0.8380417 0.6730233 0.08621315 0.1728872
## 13 0.02693333 2.714286 0.8423848 0.6817877 0.08262121 0.1658771
## 14 0.02693333 3.142857 0.8404314 0.6774919 0.08605374 0.1721686
## 15 0.02693333 3.571429 0.8443113 0.6856300 0.08407102 0.1686504
## 16 0.02693333 4.000000 0.8412132 0.6790216 0.08942519 0.1797663
## 17 0.02726667 1.000000 0.8269632 0.6496067 0.08815440 0.1785573
## 18 0.02726667 1.428571 0.8284583 0.6532718 0.08718159 0.1757422
## 19 0.02726667 1.857143 0.8267402 0.6503130 0.08480416 0.1701740
## 20 0.02726667 2.285714 0.8321299 0.6609488 0.08842804 0.1769413
## 21 0.02726667 2.714286 0.8419167 0.6804144 0.08520826 0.1711493
## 22 0.02726667 3.142857 0.8442279 0.6853908 0.08522277 0.1709135
## 23 0.02726667 3.571429 0.8397598 0.6763245 0.08698414 0.1745599
## 24 0.02726667 4.000000 0.8408578 0.6787131 0.08639852 0.1726964
## 25 0.02760000 1.000000 0.8252917 0.6469534 0.08478907 0.1705061
## 26 0.02760000 1.428571 0.8333480 0.6636296 0.08413660 0.1690398
## 27 0.02760000 1.857143 0.8283431 0.6537053 0.08731862 0.1749327
## 28 0.02760000 2.285714 0.8364828 0.6703201 0.08833746 0.1767534
## 29 0.02760000 2.714286 0.8366348 0.6697244 0.08436073 0.1693820
## 30 0.02760000 3.142857 0.8399265 0.6767220 0.08610751 0.1726456
## 31 0.02760000 3.571429 0.8443529 0.6855205 0.08669758 0.1739750
## 32 0.02760000 4.000000 0.8441863 0.6849635 0.08282918 0.1662078

Rather than searching manually, we can let caret search more systematically by using its adaptive resampling (method = "adaptive_cv").

# Time an adaptive-resampling fit; the timer also covers building the
# control object.
tic()
adControl <- trainControl(
  method = "adaptive_cv",
  number = 10,
  repeats = 10
)
set.seed(825)

# tuneLength = 10 asks train() for a ten-point default grid.
svmFitad <- train(
  Class ~ .,
  data = training,
  method = "svmRadial",
  trControl = adControl,
  preProc = c("center", "scale"),
  metric = "Accuracy",
  tuneLength = 10
)
toc()
## 5.124 sec elapsed
# Hyperparameter combination selected by the adaptive search.
svmFitad[["bestTune"]]
##         sigma C
## 10 0.01181293 8
# Compact structure of the resampling summary; the extra .B column is only
# present for the adaptive fits.
str(svmFitad[["results"]])
## 'data.frame':    10 obs. of  7 variables:
##  $ sigma     : num  0.0118 0.0118 0.0118 0.0118 0.0118 ...
##  $ C         : num  0.25 0.5 1 2 4 8 16 32 64 128
##  $ Accuracy  : num  0.718 0.756 0.757 0.782 0.838 ...
##  $ Kappa     : num  0.415 0.499 0.502 0.558 0.675 ...
##  $ AccuracySD: num  0.0689 0.0706 0.0509 0.0559 0.0815 ...
##  $ KappaSD   : num  0.154 0.155 0.114 0.114 0.161 ...
##  $ .B        : int  5 5 5 5 32 100 6 6 6 6
# Full table: one row per value of C that was evaluated.
svmFitad[["results"]]
##         sigma      C  Accuracy     Kappa AccuracySD   KappaSD  .B
## 1  0.01181293   0.25 0.7183333 0.4146668 0.06888245 0.1544630   5
## 2  0.01181293   0.50 0.7558333 0.4991819 0.07056321 0.1553820   5
## 3  0.01181293   1.00 0.7566667 0.5019550 0.05091182 0.1142971   5
## 6  0.01181293   2.00 0.7825000 0.5583937 0.05593275 0.1141370   5
## 8  0.01181293   4.00 0.8383502 0.6745518 0.08152624 0.1614193  32
## 10 0.01181293   8.00 0.8508824 0.6985281 0.07880221 0.1593479 100
## 5  0.01181293  16.00 0.8722222 0.7384152 0.08052202 0.1656383   6
## 7  0.01181293  32.00 0.8722222 0.7384152 0.08052202 0.1656383   6
## 9  0.01181293  64.00 0.8722222 0.7384152 0.08052202 0.1656383   6
## 4  0.01181293 128.00 0.8722222 0.7384152 0.08052202 0.1656383   6

Now I want to explore the ability of caret to do parallel processing.

library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Register the doMC parallel backend with 8 workers, matching the 8 cores
# of my new MacBook Pro. Adjust `cores` to suit the machine running this.
registerDoMC(cores = 8) 

We also tell caret explicitly that it may use the registered parallel backend by setting allowParallel = TRUE.

# Adaptive resampling control. allowParallel is an option of trainControl(),
# not an argument of train(): passed to train() it lands in `...` and is
# forwarded to the model fitting routine instead of controlling resampling.
# It defaults to TRUE, so setting it here only makes the intent explicit.
adControl <- trainControl(method = "adaptive_cv",
                          number = 10, repeats = 10,
                          allowParallel = TRUE)
set.seed(825)

# Time the same adaptive fit as before, now with the parallel backend
# registered.
tic()
svmFitad <- train(Class ~ ., data = training,
                  method = "svmRadial",
                  trControl = adControl,
                  preProc = c("center", "scale"),
                  metric = "Accuracy",
                  tuneLength = 10
                  )
toc()
toc()
## 5.533 sec elapsed
# Hyperparameter combination selected by the adaptive search (parallel run).
svmFitad[["bestTune"]]
##         sigma C
## 10 0.01181293 8
# Compact structure of the resampling summary for the parallel run.
str(svmFitad[["results"]])
## 'data.frame':    10 obs. of  7 variables:
##  $ sigma     : num  0.0118 0.0118 0.0118 0.0118 0.0118 ...
##  $ C         : num  0.25 0.5 1 2 4 8 16 32 64 128
##  $ Accuracy  : num  0.718 0.756 0.757 0.782 0.838 ...
##  $ Kappa     : num  0.415 0.499 0.502 0.558 0.675 ...
##  $ AccuracySD: num  0.0689 0.0706 0.0509 0.0559 0.0815 ...
##  $ KappaSD   : num  0.154 0.155 0.114 0.114 0.161 ...
##  $ .B        : int  5 5 5 5 32 100 6 6 6 6
# Full table: one row per value of C that was evaluated.
svmFitad[["results"]]
##         sigma      C  Accuracy     Kappa AccuracySD   KappaSD  .B
## 1  0.01181293   0.25 0.7183333 0.4146668 0.06888245 0.1544630   5
## 2  0.01181293   0.50 0.7558333 0.4991819 0.07056321 0.1553820   5
## 3  0.01181293   1.00 0.7566667 0.5019550 0.05091182 0.1142971   5
## 6  0.01181293   2.00 0.7825000 0.5583937 0.05593275 0.1141370   5
## 8  0.01181293   4.00 0.8383502 0.6745518 0.08152624 0.1614193  32
## 10 0.01181293   8.00 0.8508824 0.6985281 0.07880221 0.1593479 100
## 5  0.01181293  16.00 0.8722222 0.7384152 0.08052202 0.1656383   6
## 7  0.01181293  32.00 0.8722222 0.7384152 0.08052202 0.1656383   6
## 9  0.01181293  64.00 0.8722222 0.7384152 0.08052202 0.1656383   6
## 4  0.01181293 128.00 0.8722222 0.7384152 0.08052202 0.1656383   6