library(kernlab)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:kernlab':
## 
##     alpha

Spam Data Set

# Load the `spam` data set into the workspace (kernlab is attached above).
data(spam)
# Number of observations in the data set.
nrow(spam)
## [1] 4601
# Preview the first rows; the last column, `type`, holds the class label.
head(spam)
##   make address  all num3d  our over remove internet order mail receive
## 1 0.00    0.64 0.64     0 0.32 0.00   0.00     0.00  0.00 0.00    0.00
## 2 0.21    0.28 0.50     0 0.14 0.28   0.21     0.07  0.00 0.94    0.21
## 3 0.06    0.00 0.71     0 1.23 0.19   0.19     0.12  0.64 0.25    0.38
## 4 0.00    0.00 0.00     0 0.63 0.00   0.31     0.63  0.31 0.63    0.31
## 5 0.00    0.00 0.00     0 0.63 0.00   0.31     0.63  0.31 0.63    0.31
## 6 0.00    0.00 0.00     0 1.85 0.00   0.00     1.85  0.00 0.00    0.00
##   will people report addresses free business email  you credit your font
## 1 0.64   0.00   0.00      0.00 0.32     0.00  1.29 1.93   0.00 0.96    0
## 2 0.79   0.65   0.21      0.14 0.14     0.07  0.28 3.47   0.00 1.59    0
## 3 0.45   0.12   0.00      1.75 0.06     0.06  1.03 1.36   0.32 0.51    0
## 4 0.31   0.31   0.00      0.00 0.31     0.00  0.00 3.18   0.00 0.31    0
## 5 0.31   0.31   0.00      0.00 0.31     0.00  0.00 3.18   0.00 0.31    0
## 6 0.00   0.00   0.00      0.00 0.00     0.00  0.00 0.00   0.00 0.00    0
##   num000 money hp hpl george num650 lab labs telnet num857 data num415
## 1   0.00  0.00  0   0      0      0   0    0      0      0    0      0
## 2   0.43  0.43  0   0      0      0   0    0      0      0    0      0
## 3   1.16  0.06  0   0      0      0   0    0      0      0    0      0
## 4   0.00  0.00  0   0      0      0   0    0      0      0    0      0
## 5   0.00  0.00  0   0      0      0   0    0      0      0    0      0
## 6   0.00  0.00  0   0      0      0   0    0      0      0    0      0
##   num85 technology num1999 parts pm direct cs meeting original project
## 1     0          0    0.00     0  0   0.00  0       0     0.00       0
## 2     0          0    0.07     0  0   0.00  0       0     0.00       0
## 3     0          0    0.00     0  0   0.06  0       0     0.12       0
## 4     0          0    0.00     0  0   0.00  0       0     0.00       0
## 5     0          0    0.00     0  0   0.00  0       0     0.00       0
## 6     0          0    0.00     0  0   0.00  0       0     0.00       0
##     re  edu table conference charSemicolon charRoundbracket
## 1 0.00 0.00     0          0          0.00            0.000
## 2 0.00 0.00     0          0          0.00            0.132
## 3 0.06 0.06     0          0          0.01            0.143
## 4 0.00 0.00     0          0          0.00            0.137
## 5 0.00 0.00     0          0          0.00            0.135
## 6 0.00 0.00     0          0          0.00            0.223
##   charSquarebracket charExclamation charDollar charHash capitalAve
## 1                 0           0.778      0.000    0.000      3.756
## 2                 0           0.372      0.180    0.048      5.114
## 3                 0           0.276      0.184    0.010      9.821
## 4                 0           0.137      0.000    0.000      3.537
## 5                 0           0.135      0.000    0.000      3.537
## 6                 0           0.000      0.000    0.000      3.000
##   capitalLong capitalTotal type
## 1          61          278 spam
## 2         101         1028 spam
## 3         485         2259 spam
## 4          40          191 spam
## 5          40          191 spam
## 6          15           54 spam

Split the data into training and test sets

# Fix the RNG seed so the partition below is reproducible.
set.seed(998)
# Stratified 75/25 split on the class label: sampling happens within each
# level of `type`. `list = FALSE` returns the row indices as a matrix.
inTraining <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
spam_train <- spam[inTraining, ]
spam_test <- spam[-inTraining, ]

Classification Trees

Hint: Model: type ~ .

Bagging

Random Forests

Boosting

SVM

Promotergene Data Set

# Load the `promotergene` data set into the workspace.
data("promotergene")
# Open the help page for the data set (interactive side effect: this starts
# the local help server, as the output below shows).
?promotergene
## starting httpd help server ... done
# Preview the first rows: a `Class` label column followed by columns V2..V58.
head(promotergene)
##   Class V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19
## 1     +  g  c  c  t  t  c  t  c   c   a   a   a   a   c   g   t   g   t
## 2     +  a  t  g  c  a  a  t  t   t   t   t   t   a   g   t   t   g   c
## 3     +  c  c  g  t  t  t  a  t   t   t   t   t   t   c   t   a   c   c
## 4     +  t  c  t  c  a  a  c  g   t   a   a   c   a   c   t   t   t   a
## 5     +  t  a  g  g  c  a  c  c   c   c   a   g   g   c   t   t   t   a
## 6     +  a  t  a  t  a  a  a  a   a   a   g   t   t   c   t   t   g   c
##   V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37
## 1   t   t   t   t   t   g   t   t   g   t   t   a   a   t   t   c   g   g
## 2   a   t   g   a   a   c   t   c   g   c   a   t   g   t   c   t   c   c
## 3   c   a   t   a   t   c   c   t   t   g   a   a   g   c   g   g   t   g
## 4   c   a   g   c   g   g   c   g   c   g   t   c   a   t   t   t   g   a
## 5   c   a   c   t   t   t   a   t   g   c   t   t   c   c   g   g   c   t
## 6   t   t   t   c   t   a   a   c   g   t   g   a   a   a   g   t   g   g
##   V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53 V54 V55
## 1   t   g   t   a   g   a   c   t   t   g   t   a   a   a   c   c   t   a
## 2   a   t   a   g   a   a   t   g   c   g   c   g   c   t   a   c   t   t
## 3   t   t   a   t   a   a   t   g   c   c   g   c   g   c   c   c   t   c
## 4   t   a   t   g   a   t   g   c   g   c   c   c   c   g   c   t   t   c
## 5   c   g   t   a   t   g   t   t   g   t   g   t   g   g   a   a   t   t
## 6   t   t   t   a   g   g   t   t   a   a   a   a   g   a   c   a   t   c
##   V56 V57 V58
## 1   a   a   t
## 2   g   a   t
## 3   g   a   t
## 4   c   c   g
## 5   g   t   g
## 6   a   g   t
# Total number of observations.
nrow(promotergene)
## [1] 106
# Class distribution of the label column.
table(promotergene$Class)
## 
##  +  - 
## 53 53

Split the data into training and test sets

# Fix the RNG seed so the partition below is reproducible.
set.seed(123)
# Stratified 80/20 split on the class label: sampling happens within each
# level of `Class`. `list = FALSE` returns the row indices as a matrix.
inTraining <- createDataPartition(y = promotergene$Class, p = 0.8, list = FALSE)
promotergeneTrain <- promotergene[inTraining, ]
promotergeneTest <- promotergene[-inTraining, ]
# Check the class balance of the held-out set.
table(promotergeneTest$Class)
## 
##  +  - 
## 10 10

Classification Trees

Hint: Model: Class ~ .

Bagging

Random Forests

Boosting

SVM