R Markdown for Reproducible Research:

Can I use quantitative characteristics of the emails to classify them as SPAM/HAM?

# One-time setup: uncomment to install kernlab, which provides the spam data.
#install.packages('kernlab')
library(kernlab)
# Load the `spam` data frame into the workspace.
data(spam)
# Show each column's name, type, and first few values.
str(spam)
## 'data.frame':    4601 obs. of  58 variables:
##  $ make             : num  0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
##  $ address          : num  0.64 0.28 0 0 0 0 0 0 0 0.12 ...
##  $ all              : num  0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
##  $ num3d            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ our              : num  0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
##  $ over             : num  0 0.28 0.19 0 0 0 0 0 0 0.32 ...
##  $ remove           : num  0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
##  $ internet         : num  0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
##  $ order            : num  0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
##  $ mail             : num  0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
##  $ receive          : num  0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
##  $ will             : num  0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
##  $ people           : num  0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
##  $ report           : num  0 0.21 0 0 0 0 0 0 0 0 ...
##  $ addresses        : num  0 0.14 1.75 0 0 0 0 0 0 0.12 ...
##  $ free             : num  0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
##  $ business         : num  0 0.07 0.06 0 0 0 0 0 0 0 ...
##  $ email            : num  1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
##  $ you              : num  1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
##  $ credit           : num  0 0 0.32 0 0 0 0 0 3.53 0.06 ...
##  $ your             : num  0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
##  $ font             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num000           : num  0 0.43 1.16 0 0 0 0 0 0 0.19 ...
##  $ money            : num  0 0.43 0.06 0 0 0 0 0 0.15 0 ...
##  $ hp               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ hpl              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ george           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num650           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lab              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ labs             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ telnet           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num857           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data             : num  0 0 0 0 0 0 0 0 0.15 0 ...
##  $ num415           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num85            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ technology       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ num1999          : num  0 0.07 0 0 0 0 0 0 0 0 ...
##  $ parts            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pm               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ direct           : num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ cs               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ meeting          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ original         : num  0 0 0.12 0 0 0 0 0 0.3 0 ...
##  $ project          : num  0 0 0 0 0 0 0 0 0 0.06 ...
##  $ re               : num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ edu              : num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ table            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ conference       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ charSemicolon    : num  0 0 0.01 0 0 0 0 0 0 0.04 ...
##  $ charRoundbracket : num  0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
##  $ charSquarebracket: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ charExclamation  : num  0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
##  $ charDollar       : num  0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
##  $ charHash         : num  0 0.048 0.01 0 0 0 0 0 0.022 0 ...
##  $ capitalAve       : num  3.76 5.11 9.82 3.54 3.54 ...
##  $ capitalLong      : num  61 101 485 40 40 15 4 11 445 43 ...
##  $ capitalTotal     : num  278 1028 2259 191 191 ...
##  $ type             : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
# Perform the subsampling
# Fix the RNG seed so the random train/test assignment is reproducible.
set.seed(3435)
# One Bernoulli(0.5) draw per email: 1 = training set, 0 = test set.
# The number of draws is taken from the data (nrow(spam)) instead of a
# hard-coded row count, so the indicator always matches the data frame.
trainIndicator = rbinom(nrow(spam), size = 1, prob = 0.5)
# Sizes of the two resulting subsets.
table(trainIndicator)
## trainIndicator
##    0    1 
## 2314 2287

Exploratory data analysis

# Split the emails using the 0/1 indicator drawn above:
# rows flagged 1 form the training set, rows flagged 0 the test set.
trainSpam = spam[as.logical(trainIndicator), ]
testSpam = spam[!trainIndicator, ]
# List the column names of the training set.
names(trainSpam)
##  [1] "make"              "address"           "all"              
##  [4] "num3d"             "our"               "over"             
##  [7] "remove"            "internet"          "order"            
## [10] "mail"              "receive"           "will"             
## [13] "people"            "report"            "addresses"        
## [16] "free"              "business"          "email"            
## [19] "you"               "credit"            "your"             
## [22] "font"              "num000"            "money"            
## [25] "hp"                "hpl"               "george"           
## [28] "num650"            "lab"               "labs"             
## [31] "telnet"            "num857"            "data"             
## [34] "num415"            "num85"             "technology"       
## [37] "num1999"           "parts"             "pm"               
## [40] "direct"            "cs"                "meeting"          
## [43] "original"          "project"           "re"               
## [46] "edu"               "table"             "conference"       
## [49] "charSemicolon"     "charRoundbracket"  "charSquarebracket"
## [52] "charExclamation"   "charDollar"        "charHash"         
## [55] "capitalAve"        "capitalLong"       "capitalTotal"     
## [58] "type"
# Per-column summary statistics for the training set (counts per level for the factor column).
summary(trainSpam)
##       make            address             all             num3d        
##  Min.   :0.00000   Min.   : 0.0000   Min.   :0.0000   Min.   : 0.0000  
##  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.: 0.0000  
##  Median :0.00000   Median : 0.0000   Median :0.0000   Median : 0.0000  
##  Mean   :0.09756   Mean   : 0.1866   Mean   :0.2607   Mean   : 0.1072  
##  3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.:0.3800   3rd Qu.: 0.0000  
##  Max.   :4.34000   Max.   :14.2800   Max.   :4.0000   Max.   :42.8100  
##       our              over            remove          internet      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median : 0.0000  
##  Mean   :0.3222   Mean   :0.0997   Mean   :0.1195   Mean   : 0.1076  
##  3rd Qu.:0.3900   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.: 0.0000  
##  Max.   :7.6900   Max.   :5.8800   Max.   :7.2700   Max.   :11.1100  
##      order              mail            receive             will      
##  Min.   :0.00000   Min.   : 0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.:0.000  
##  Median :0.00000   Median : 0.0000   Median :0.00000   Median :0.150  
##  Mean   :0.08517   Mean   : 0.2424   Mean   :0.05683   Mean   :0.565  
##  3rd Qu.:0.00000   3rd Qu.: 0.1600   3rd Qu.:0.00000   3rd Qu.:0.840  
##  Max.   :5.26000   Max.   :18.1800   Max.   :2.06000   Max.   :7.690  
##      people            report           addresses            free        
##  Min.   :0.00000   Min.   : 0.00000   Min.   :0.00000   Min.   : 0.0000  
##  1st Qu.:0.00000   1st Qu.: 0.00000   1st Qu.:0.00000   1st Qu.: 0.0000  
##  Median :0.00000   Median : 0.00000   Median :0.00000   Median : 0.0000  
##  Mean   :0.09383   Mean   : 0.06453   Mean   :0.04173   Mean   : 0.2588  
##  3rd Qu.:0.00000   3rd Qu.: 0.00000   3rd Qu.:0.00000   3rd Qu.: 0.1100  
##  Max.   :5.55000   Max.   :10.00000   Max.   :2.31000   Max.   :20.0000  
##     business          email             you             credit      
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 0.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median : 1.290   Median :0.0000  
##  Mean   :0.1526   Mean   :0.1767   Mean   : 1.654   Mean   :0.0723  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.: 2.630   3rd Qu.:0.0000  
##  Max.   :7.1400   Max.   :6.6600   Max.   :14.280   Max.   :5.3300  
##       your              font             num000            money         
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :0.00000   Min.   : 0.00000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.: 0.00000  
##  Median : 0.1700   Median : 0.0000   Median :0.00000   Median : 0.00000  
##  Mean   : 0.7861   Mean   : 0.1277   Mean   :0.09328   Mean   : 0.09315  
##  3rd Qu.: 1.2500   3rd Qu.: 0.0000   3rd Qu.:0.00000   3rd Qu.: 0.00000  
##  Max.   :11.1100   Max.   :17.1000   Max.   :4.76000   Max.   :12.50000  
##        hp               hpl              george          num650      
##  Min.   : 0.0000   Min.   : 0.0000   Min.   : 0.00   Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.: 0.00   1st Qu.:0.0000  
##  Median : 0.0000   Median : 0.0000   Median : 0.00   Median :0.0000  
##  Mean   : 0.5902   Mean   : 0.2818   Mean   : 0.71   Mean   :0.1252  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.00   3rd Qu.:0.0000  
##  Max.   :20.8300   Max.   :16.6600   Max.   :33.33   Max.   :8.3300  
##       lab                labs            telnet            num857       
##  Min.   : 0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.: 0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median : 0.00000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   : 0.09829   Mean   :0.1053   Mean   :0.05837   Mean   :0.04167  
##  3rd Qu.: 0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :11.11000   Max.   :5.8800   Max.   :4.76000   Max.   :4.76000  
##       data             num415           num85          technology     
##  Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median : 0.0000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   : 0.1061   Mean   :0.0428   Mean   :0.1047   Mean   :0.09569  
##  3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :18.1800   Max.   :4.7600   Max.   :5.8800   Max.   :4.76000  
##     num1999           parts               pm               direct       
##  Min.   :0.0000   Min.   :0.00000   Min.   : 0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.: 0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median : 0.00000   Median :0.00000  
##  Mean   :0.1458   Mean   :0.01597   Mean   : 0.08409   Mean   :0.06068  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.: 0.00000   3rd Qu.:0.00000  
##  Max.   :5.0500   Max.   :7.40000   Max.   :11.11000   Max.   :4.76000  
##        cs             meeting           original          project        
##  Min.   :0.00000   Min.   : 0.0000   Min.   :0.00000   Min.   : 0.00000  
##  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.00000   1st Qu.: 0.00000  
##  Median :0.00000   Median : 0.0000   Median :0.00000   Median : 0.00000  
##  Mean   :0.04053   Mean   : 0.1517   Mean   :0.04646   Mean   : 0.09815  
##  3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.:0.00000   3rd Qu.: 0.00000  
##  Max.   :7.14000   Max.   :14.2800   Max.   :3.57000   Max.   :20.00000  
##        re              edu              table            conference      
##  Min.   : 0.000   Min.   : 0.0000   Min.   :0.000000   Min.   : 0.00000  
##  1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.:0.000000   1st Qu.: 0.00000  
##  Median : 0.000   Median : 0.0000   Median :0.000000   Median : 0.00000  
##  Mean   : 0.315   Mean   : 0.2057   Mean   :0.006017   Mean   : 0.03967  
##  3rd Qu.: 0.120   3rd Qu.: 0.0000   3rd Qu.:0.000000   3rd Qu.: 0.00000  
##  Max.   :21.420   Max.   :22.0500   Max.   :1.910000   Max.   :10.00000  
##  charSemicolon     charRoundbracket charSquarebracket charExclamation  
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   : 0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.: 0.0000  
##  Median :0.00000   Median :0.0650   Median :0.00000   Median : 0.0000  
##  Mean   :0.04362   Mean   :0.1401   Mean   :0.01658   Mean   : 0.2714  
##  3rd Qu.:0.00000   3rd Qu.:0.1850   3rd Qu.:0.00000   3rd Qu.: 0.3100  
##  Max.   :4.38500   Max.   :9.7520   Max.   :4.08100   Max.   :32.4780  
##    charDollar         charHash          capitalAve       capitalLong     
##  Min.   :0.00000   Min.   : 0.00000   Min.   :  1.000   Min.   :   1.00  
##  1st Qu.:0.00000   1st Qu.: 0.00000   1st Qu.:  1.600   1st Qu.:   6.00  
##  Median :0.00000   Median : 0.00000   Median :  2.236   Median :  15.00  
##  Mean   :0.07403   Mean   : 0.05405   Mean   :  4.810   Mean   :  51.61  
##  3rd Qu.:0.04600   3rd Qu.: 0.00000   3rd Qu.:  3.676   3rd Qu.:  42.00  
##  Max.   :6.00300   Max.   :19.82900   Max.   :664.000   Max.   :9989.00  
##   capitalTotal          type     
##  Min.   :    1.0   nonspam:1381  
##  1st Qu.:   34.5   spam   : 906  
##  Median :   95.0                 
##  Mean   :  284.3                 
##  3rd Qu.:  263.0                 
##  Max.   :15841.0
# Peek at the first rows of the training set.
head(trainSpam)
##    make address  all num3d  our over remove internet order mail receive will
## 1  0.00    0.64 0.64     0 0.32 0.00   0.00        0  0.00 0.00    0.00 0.64
## 7  0.00    0.00 0.00     0 1.92 0.00   0.00        0  0.00 0.64    0.96 1.28
## 9  0.15    0.00 0.46     0 0.61 0.00   0.30        0  0.92 0.76    0.76 0.92
## 12 0.00    0.00 0.25     0 0.38 0.25   0.25        0  0.00 0.00    0.12 0.12
## 14 0.00    0.00 0.00     0 0.90 0.00   0.90        0  0.00 0.90    0.90 0.00
## 16 0.00    0.42 0.42     0 1.27 0.00   0.42        0  0.00 1.27    0.00 0.00
##    people report addresses free business email  you credit your font num000
## 1    0.00      0         0 0.32        0  1.29 1.93   0.00 0.96    0      0
## 7    0.00      0         0 0.96        0  0.32 3.85   0.00 0.64    0      0
## 9    0.00      0         0 0.00        0  0.15 1.23   3.53 2.00    0      0
## 12   0.12      0         0 0.00        0  0.00 1.16   0.00 0.77    0      0
## 14   0.90      0         0 0.00        0  0.00 2.72   0.00 0.90    0      0
## 16   0.00      0         0 1.27        0  0.00 1.70   0.42 1.27    0      0
##    money hp hpl george num650 lab labs telnet num857 data num415 num85
## 1   0.00  0   0      0      0   0    0      0      0 0.00      0     0
## 7   0.00  0   0      0      0   0    0      0      0 0.00      0     0
## 9   0.15  0   0      0      0   0    0      0      0 0.15      0     0
## 12  0.00  0   0      0      0   0    0      0      0 0.00      0     0
## 14  0.00  0   0      0      0   0    0      0      0 0.00      0     0
## 16  0.42  0   0      0      0   0    0      0      0 0.00      0     0
##    technology num1999 parts pm direct cs meeting original project re edu table
## 1           0    0.00     0  0   0.00  0       0      0.0       0  0   0     0
## 7           0    0.00     0  0   0.00  0       0      0.0       0  0   0     0
## 9           0    0.00     0  0   0.00  0       0      0.3       0  0   0     0
## 12          0    0.00     0  0   0.00  0       0      0.0       0  0   0     0
## 14          0    0.00     0  0   0.00  0       0      0.0       0  0   0     0
## 16          0    1.27     0  0   0.42  0       0      0.0       0  0   0     0
##    conference charSemicolon charRoundbracket charSquarebracket charExclamation
## 1           0         0.000            0.000                 0           0.778
## 7           0         0.000            0.054                 0           0.164
## 9           0         0.000            0.271                 0           0.181
## 12          0         0.022            0.044                 0           0.663
## 14          0         0.000            0.000                 0           0.000
## 16          0         0.000            0.063                 0           0.572
##    charDollar charHash capitalAve capitalLong capitalTotal type
## 1       0.000    0.000      3.756          61          278 spam
## 7       0.054    0.000      1.671           4          112 spam
## 9       0.203    0.022      9.744         445         1257 spam
## 12      0.000    0.000      1.243          11          184 spam
## 14      0.000    0.000      2.083           7           25 spam
## 16      0.063    0.000      5.659          55          249 spam

-Check for missing data & data types

# Tabulate NA vs. non-NA over every cell of the training data frame.
table(is.na(trainSpam))
## 
##  FALSE 
## 132646
# Class balance of the outcome (nonspam vs. spam) in the training set.
table(trainSpam$type)
## 
## nonspam    spam 
##    1381     906

-Create exploratory plots

# Compare capitalAve between the two email classes on the raw scale.
plot(trainSpam$capitalAve ~ trainSpam$type)

# Same comparison on the log10(x + 1) scale, because the raw values are too
# compressed to read.
plot(log10(trainSpam$capitalAve + 1) ~ trainSpam$type)

# Relationships between predictors: pairwise plots of the first four
# columns, again on the log10(x + 1) scale (the + 1 keeps zeros finite).
plot(log10(trainSpam[, 1:4] + 1))

# Hierarchical clustering of the predictors: t() puts the variables in rows
# so that dist() measures distances between columns 1:57 rather than emails.
hCluster = hclust(dist(t(trainSpam[, 1:57])))
# On the raw scale the dendrogram is not very informative.
plot(hCluster)

# Repeat the clustering after the log10(x + 1) transformation.
# NOTE(review): this uses columns 1:55 only, leaving out capitalLong and
# capitalTotal (columns 56 and 57) that the raw-scale clustering included;
# presumably intentional, confirm.
hClusterUpdated = hclust(dist(t(log10(trainSpam[, 1:55] + 1))))
plot(hClusterUpdated)

# One-time setup: uncomment to install boot, which provides cv.glm().
#install.packages('boot')
library(boot)

# Recode the two-level outcome factor as 0/1 (first level -> 0, second -> 1).
trainSpam$numType = as.numeric(trainSpam$type) - 1

# Cost passed to cv.glm(): the number of observations whose observed 0/1
# label differs from the prediction thresholded at 0.5.
costFunction = function(x, y) sum(x != (y > 0.5))

# For each of the first 55 columns, fit a logistic regression with that
# column as the only predictor and keep the second component of cv.glm()'s
# `delta` from a 2-fold cross-validation.
cvError = vapply(1:55, function(i) {
  lmFormula = reformulate(names(trainSpam)[i], response = "numType")
  glmFit = glm(lmFormula, family = "binomial", data = trainSpam)
  cv.glm(trainSpam, glmFit, costFunction, 2)$delta[2]
}, numeric(1))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Which predictor has minimum cross-validated error?
names(trainSpam)[which.min(cvError)]
## [1] "charDollar"
## Use the best model from the group
predictionModel = glm(numType ~ charDollar, family = "binomial", data = trainSpam)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Get predictions on the test set
## type = "response" returns predicted probabilities; the default
## (type = "link") returns log-odds, for which 0.5 is not the 50% cut-off.
predictionTest = predict(predictionModel, newdata = testSpam, type = "response")
predictedSpam = rep("nonspam", nrow(testSpam))
## Classify as `spam' for those with prob > 0.5
## Index with the test-set predictions. Indexing with predictionModel$fitted
## would use the fitted values of the *training* rows, which have a different
## length and get silently recycled across the test rows.
predictedSpam[predictionTest > 0.5] = "spam"
## Classification table
## NOTE: any recorded output shown after this call predates the switch to
## test-set predictions; re-run the analysis to refresh it.
table(predictedSpam, testSpam$type)
##              
## predictedSpam nonspam spam
##       nonspam    1346  458
##       spam         61  449
##
## predictedSpam nonspam spam
##       nonspam   1346   458
##          spam     61   449
## Error rate
## Share of test emails whose predicted label differs from the true label.
## Computed from the predictions instead of hand-copied table counts, so it
## stays correct whenever the split, the model, or the predictions change.
mean(predictedSpam != as.character(testSpam$type))
## [1] 0.2242869

Discussion: The fraction of characters that are dollar signs can be used to predict whether an email is spam: any email in which more than 6.6% of the characters are dollar signs is classified as spam. Under this model, more dollar signs always means a higher predicted probability of spam. The test set error rate was 22.4%.