Can I use quantitative characteristics of the emails to classify them as SPAM/HAM?
# One-time setup: uncomment to install kernlab, the package the `spam` data is loaded from
#install.packages('kernlab')
library(kernlab)
# Load the `spam` data frame into the workspace
data(spam)
# Inspect the structure: dimensions, column names, column types and leading values
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ charRoundbracket : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ charDollar : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ charHash : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capitalAve : num 3.76 5.11 9.82 3.54 3.54 ...
## $ capitalLong : num 61 101 485 40 40 15 4 11 445 43 ...
## $ capitalTotal : num 278 1028 2259 191 191 ...
## $ type : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
# Perform the subsampling: flag each email for the training set with
# probability 0.5 (1 = training, 0 = test)
set.seed(3435)  # fixed seed so the random split is reproducible
# One Bernoulli draw per row of `spam`. nrow(spam) replaces the hard-coded
# 4601 so the indicator always has the same length as the data.
trainIndicator = rbinom(nrow(spam), size = 1, prob = 0.5)
# Size of each half of the split
table(trainIndicator)
## trainIndicator
## 0 1
## 2314 2287
# Split the data with the indicator drawn above: rows flagged 1 form the
# training set, rows flagged 0 form the test set
trainSpam = spam[which(trainIndicator == 1), ]
testSpam = spam[which(trainIndicator == 0), ]
# List the columns available in the training set
colnames(trainSpam)
## [1] "make" "address" "all"
## [4] "num3d" "our" "over"
## [7] "remove" "internet" "order"
## [10] "mail" "receive" "will"
## [13] "people" "report" "addresses"
## [16] "free" "business" "email"
## [19] "you" "credit" "your"
## [22] "font" "num000" "money"
## [25] "hp" "hpl" "george"
## [28] "num650" "lab" "labs"
## [31] "telnet" "num857" "data"
## [34] "num415" "num85" "technology"
## [37] "num1999" "parts" "pm"
## [40] "direct" "cs" "meeting"
## [43] "original" "project" "re"
## [46] "edu" "table" "conference"
## [49] "charSemicolon" "charRoundbracket" "charSquarebracket"
## [52] "charExclamation" "charDollar" "charHash"
## [55] "capitalAve" "capitalLong" "capitalTotal"
## [58] "type"
# Per-column summaries of the training set: min, quartiles, mean and max for
# the numeric columns, and level counts for the factor `type`
summary(trainSpam)
## make address all num3d
## Min. :0.00000 Min. : 0.0000 Min. :0.0000 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.0000
## Median :0.00000 Median : 0.0000 Median :0.0000 Median : 0.0000
## Mean :0.09756 Mean : 0.1866 Mean :0.2607 Mean : 0.1072
## 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.3800 3rd Qu.: 0.0000
## Max. :4.34000 Max. :14.2800 Max. :4.0000 Max. :42.8100
## our over remove internet
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. : 0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median : 0.0000
## Mean :0.3222 Mean :0.0997 Mean :0.1195 Mean : 0.1076
## 3rd Qu.:0.3900 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 0.0000
## Max. :7.6900 Max. :5.8800 Max. :7.2700 Max. :11.1100
## order mail receive will
## Min. :0.00000 Min. : 0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.:0.000
## Median :0.00000 Median : 0.0000 Median :0.00000 Median :0.150
## Mean :0.08517 Mean : 0.2424 Mean :0.05683 Mean :0.565
## 3rd Qu.:0.00000 3rd Qu.: 0.1600 3rd Qu.:0.00000 3rd Qu.:0.840
## Max. :5.26000 Max. :18.1800 Max. :2.06000 Max. :7.690
## people report addresses free
## Min. :0.00000 Min. : 0.00000 Min. :0.00000 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.: 0.00000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median :0.00000 Median : 0.00000 Median :0.00000 Median : 0.0000
## Mean :0.09383 Mean : 0.06453 Mean :0.04173 Mean : 0.2588
## 3rd Qu.:0.00000 3rd Qu.: 0.00000 3rd Qu.:0.00000 3rd Qu.: 0.1100
## Max. :5.55000 Max. :10.00000 Max. :2.31000 Max. :20.0000
## business email you credit
## Min. :0.0000 Min. :0.0000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median : 1.290 Median :0.0000
## Mean :0.1526 Mean :0.1767 Mean : 1.654 Mean :0.0723
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 2.630 3rd Qu.:0.0000
## Max. :7.1400 Max. :6.6600 Max. :14.280 Max. :5.3300
## your font num000 money
## Min. : 0.0000 Min. : 0.0000 Min. :0.00000 Min. : 0.00000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.00000
## Median : 0.1700 Median : 0.0000 Median :0.00000 Median : 0.00000
## Mean : 0.7861 Mean : 0.1277 Mean :0.09328 Mean : 0.09315
## 3rd Qu.: 1.2500 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.: 0.00000
## Max. :11.1100 Max. :17.1000 Max. :4.76000 Max. :12.50000
## hp hpl george num650
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median : 0.00 Median :0.0000
## Mean : 0.5902 Mean : 0.2818 Mean : 0.71 Mean :0.1252
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.00 3rd Qu.:0.0000
## Max. :20.8300 Max. :16.6600 Max. :33.33 Max. :8.3300
## lab labs telnet num857
## Min. : 0.00000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.: 0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median : 0.00000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean : 0.09829 Mean :0.1053 Mean :0.05837 Mean :0.04167
## 3rd Qu.: 0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :11.11000 Max. :5.8800 Max. :4.76000 Max. :4.76000
## data num415 num85 technology
## Min. : 0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 0.1061 Mean :0.0428 Mean :0.1047 Mean :0.09569
## 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :18.1800 Max. :4.7600 Max. :5.8800 Max. :4.76000
## num1999 parts pm direct
## Min. :0.0000 Min. :0.00000 Min. : 0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.: 0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median : 0.00000 Median :0.00000
## Mean :0.1458 Mean :0.01597 Mean : 0.08409 Mean :0.06068
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.: 0.00000 3rd Qu.:0.00000
## Max. :5.0500 Max. :7.40000 Max. :11.11000 Max. :4.76000
## cs meeting original project
## Min. :0.00000 Min. : 0.0000 Min. :0.00000 Min. : 0.00000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.00000
## Median :0.00000 Median : 0.0000 Median :0.00000 Median : 0.00000
## Mean :0.04053 Mean : 0.1517 Mean :0.04646 Mean : 0.09815
## 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.: 0.00000
## Max. :7.14000 Max. :14.2800 Max. :3.57000 Max. :20.00000
## re edu table conference
## Min. : 0.000 Min. : 0.0000 Min. :0.000000 Min. : 0.00000
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.:0.000000 1st Qu.: 0.00000
## Median : 0.000 Median : 0.0000 Median :0.000000 Median : 0.00000
## Mean : 0.315 Mean : 0.2057 Mean :0.006017 Mean : 0.03967
## 3rd Qu.: 0.120 3rd Qu.: 0.0000 3rd Qu.:0.000000 3rd Qu.: 0.00000
## Max. :21.420 Max. :22.0500 Max. :1.910000 Max. :10.00000
## charSemicolon charRoundbracket charSquarebracket charExclamation
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. : 0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.: 0.0000
## Median :0.00000 Median :0.0650 Median :0.00000 Median : 0.0000
## Mean :0.04362 Mean :0.1401 Mean :0.01658 Mean : 0.2714
## 3rd Qu.:0.00000 3rd Qu.:0.1850 3rd Qu.:0.00000 3rd Qu.: 0.3100
## Max. :4.38500 Max. :9.7520 Max. :4.08100 Max. :32.4780
## charDollar charHash capitalAve capitalLong
## Min. :0.00000 Min. : 0.00000 Min. : 1.000 Min. : 1.00
## 1st Qu.:0.00000 1st Qu.: 0.00000 1st Qu.: 1.600 1st Qu.: 6.00
## Median :0.00000 Median : 0.00000 Median : 2.236 Median : 15.00
## Mean :0.07403 Mean : 0.05405 Mean : 4.810 Mean : 51.61
## 3rd Qu.:0.04600 3rd Qu.: 0.00000 3rd Qu.: 3.676 3rd Qu.: 42.00
## Max. :6.00300 Max. :19.82900 Max. :664.000 Max. :9989.00
## capitalTotal type
## Min. : 1.0 nonspam:1381
## 1st Qu.: 34.5 spam : 906
## Median : 95.0
## Mean : 284.3
## 3rd Qu.: 263.0
## Max. :15841.0
# Show the first rows of the training set; row names are the original row
# numbers from `spam`
head(trainSpam)
## make address all num3d our over remove internet order mail receive will
## 1 0.00 0.64 0.64 0 0.32 0.00 0.00 0 0.00 0.00 0.00 0.64
## 7 0.00 0.00 0.00 0 1.92 0.00 0.00 0 0.00 0.64 0.96 1.28
## 9 0.15 0.00 0.46 0 0.61 0.00 0.30 0 0.92 0.76 0.76 0.92
## 12 0.00 0.00 0.25 0 0.38 0.25 0.25 0 0.00 0.00 0.12 0.12
## 14 0.00 0.00 0.00 0 0.90 0.00 0.90 0 0.00 0.90 0.90 0.00
## 16 0.00 0.42 0.42 0 1.27 0.00 0.42 0 0.00 1.27 0.00 0.00
## people report addresses free business email you credit your font num000
## 1 0.00 0 0 0.32 0 1.29 1.93 0.00 0.96 0 0
## 7 0.00 0 0 0.96 0 0.32 3.85 0.00 0.64 0 0
## 9 0.00 0 0 0.00 0 0.15 1.23 3.53 2.00 0 0
## 12 0.12 0 0 0.00 0 0.00 1.16 0.00 0.77 0 0
## 14 0.90 0 0 0.00 0 0.00 2.72 0.00 0.90 0 0
## 16 0.00 0 0 1.27 0 0.00 1.70 0.42 1.27 0 0
## money hp hpl george num650 lab labs telnet num857 data num415 num85
## 1 0.00 0 0 0 0 0 0 0 0 0.00 0 0
## 7 0.00 0 0 0 0 0 0 0 0 0.00 0 0
## 9 0.15 0 0 0 0 0 0 0 0 0.15 0 0
## 12 0.00 0 0 0 0 0 0 0 0 0.00 0 0
## 14 0.00 0 0 0 0 0 0 0 0 0.00 0 0
## 16 0.42 0 0 0 0 0 0 0 0 0.00 0 0
## technology num1999 parts pm direct cs meeting original project re edu table
## 1 0 0.00 0 0 0.00 0 0 0.0 0 0 0 0
## 7 0 0.00 0 0 0.00 0 0 0.0 0 0 0 0
## 9 0 0.00 0 0 0.00 0 0 0.3 0 0 0 0
## 12 0 0.00 0 0 0.00 0 0 0.0 0 0 0 0
## 14 0 0.00 0 0 0.00 0 0 0.0 0 0 0 0
## 16 0 1.27 0 0 0.42 0 0 0.0 0 0 0 0
## conference charSemicolon charRoundbracket charSquarebracket charExclamation
## 1 0 0.000 0.000 0 0.778
## 7 0 0.000 0.054 0 0.164
## 9 0 0.000 0.271 0 0.181
## 12 0 0.022 0.044 0 0.663
## 14 0 0.000 0.000 0 0.000
## 16 0 0.000 0.063 0 0.572
## charDollar charHash capitalAve capitalLong capitalTotal type
## 1 0.000 0.000 3.756 61 278 spam
## 7 0.054 0.000 1.671 4 112 spam
## 9 0.203 0.022 9.744 445 1257 spam
## 12 0.000 0.000 1.243 11 184 spam
## 14 0.000 0.000 2.083 7 25 spam
## 16 0.063 0.000 5.659 55 249 spam
- Check for missing data and data types
# Tabulate the NA flag over every cell of the training set; a table with only
# FALSE means there are no missing values
table(is.na(trainSpam))
##
## FALSE
## 132646
# Count the training emails in each level of the outcome `type`
table(trainSpam$type)
##
## nonspam spam
## 1381 906
- Create exploratory plots
# capitalAve by email type on the raw scale (`type` is a factor, so the
# formula method draws one box per level)
plot(trainSpam$capitalAve ~ trainSpam$type)
# Same comparison on a log10(x + 1) scale, because the raw values are too
# compressed to compare
plot(log10(trainSpam$capitalAve + 1) ~ trainSpam$type)
# Relationships between predictors: pairwise scatterplots of the first four
# columns on the log10(x + 1) scale (the + 1 keeps zero entries finite)
plot(log10(trainSpam[, 1:4] + 1))
# Hierarchical clustering of columns 1:57; the data are transposed so that the
# variables, not the emails, are clustered
hCluster = hclust(dist(t(trainSpam[, 1:57])))
# Not very informative on the raw scale
plot(hCluster)
# Repeat the clustering after the log10(x + 1) transformation
# NOTE(review): this uses columns 1:55 whereas the raw clustering above uses
# 1:57 — confirm the two trailing columns are excluded on purpose
hClusterUpdated = hclust(dist(t(log10(trainSpam[, 1:55] + 1))))
plot(hClusterUpdated)
# install.packages("boot")
# Numeric 0/1 copy of the outcome for the logistic fits: 1 for "spam",
# 0 for "nonspam"
trainSpam$numType = as.numeric(trainSpam$type == "spam")
# Misclassification count, passed to cv.glm as its cost function.
# x: 0/1 outcomes; y: numeric scores, classed as positive when above 0.5.
# Returns the number of positions where x disagrees with the thresholded y.
costFunction = function(x, y) {
  predictedPositive = y > 0.5
  sum(x != predictedPositive)
}
library(boot)
# For each of the first 55 columns, fit a logistic regression of numType on
# that single predictor and record its 2-fold cross-validated cost.
cvError = vapply(1:55, function(j) {
  oneVarFormula = reformulate(names(trainSpam)[j], response = "numType")
  oneVarFit = glm(oneVarFormula, family = "binomial", data = trainSpam)
  # Keep the second component of cv.glm's delta for this predictor
  cv.glm(trainSpam, oneVarFit, costFunction, 2)$delta[2]
}, numeric(1))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Which predictor has minimum cross-validated error?
# which.min gives the position of the smallest cvError entry; cvError[i] was
# computed from column i, so the same position indexes the column names
names(trainSpam)[which.min(cvError)]
## [1] "charDollar"
## Use the best model from the group
# Fit the single-predictor logistic regression of numType on charDollar using
# the whole training set
predictionModel = glm(numType ~ charDollar, family = "binomial", data = trainSpam)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Get predictions on the test set
# type = "response" returns predicted probabilities; the default is the
# log-odds scale, for which 0.5 is not the intended cut-off.
predictionTest = predict(
  predictionModel,
  newdata = testSpam,
  type = "response"
)
# Start with every test email labelled "nonspam"
predictedSpam = rep("nonspam", nrow(testSpam))
## Classify as `spam' for those with prob > 0.5
# Threshold the test-set predictions. The earlier version indexed with
# predictionModel$fitted, which holds fitted values for the training rows and
# does not correspond row-by-row to testSpam.
predictedSpam[predictionTest > 0.5] = "spam"
## Classification table
# NOTE: any table output recorded before this fix was produced from the
# training-set fitted values and must be regenerated, along with any error
# rate derived from it.
table(predictedSpam, testSpam$type)
##
## predictedSpam nonspam spam
## nonspam 1346 458
## spam 61 449
##
## predictedSpam nonspam spam
## nonspam 1346 458
## spam 61 449
## Error rate
# Share of test emails whose predicted label differs from the true type.
# Computed from the predictions instead of hand-copied table counts, so it
# cannot drift out of sync with the classification table.
mean(predictedSpam != testSpam$type)
## [1] 0.2242869
Discussion: The fraction of characters that are dollar signs can be used to predict whether an email is spam: anything with more than 6.6% dollar signs is classified as spam. Under our prediction, more dollar signs always means more spam. The test set error rate was 22.4%.