Identify Spam Mail

Read the dataset and divide it into a 70% training sample and a 30% validation sample

library(nutshell)

## Warning: package 'nutshell' was built under R version 3.1.2

## Loading required package: nutshell.bbdb

## Warning: package 'nutshell.bbdb' was built under R version 3.1.2

## Loading required package: nutshell.audioscrobbler

## Warning: package 'nutshell.audioscrobbler' was built under R version 3.1.2

# Load the spambase data set into the workspace (provided by one of the
# attached packages; presumably nutshell -- TODO confirm).
data(spambase)
# caret is attached for createDataPartition(), which is used below to split
# the data into training and validation samples.
library(caret)

## Warning: package 'caret' was built under R version 3.1.2

## Loading required package: lattice
## Loading required package: ggplot2

# Fix the RNG seed so the random train/validation split is reproducible.
set.seed(100)
# Row indices of the training sample: p = 0.7 requests 70% of the rows, drawn
# with respect to the levels of is_spam. list = FALSE returns the indices in a
# form that can be used directly for row subsetting. FALSE is spelled out
# because the shorthand F is an ordinary variable that can be reassigned.
inTrain <- createDataPartition(y = spambase$is_spam, p = 0.7, list = FALSE)
spambase.training <- spambase[inTrain, ]
# Every row that was not drawn for training goes to the validation sample.
spambase.validation <- spambase[-inTrain, ]

Quadratic Discriminant Analysis

# qda() is provided by MASS.
library(MASS)
# Fit a quadratic discriminant model of is_spam on all remaining columns of
# the training sample.
spam.qda <- qda(
  formula = is_spam ~ .,
  data = spambase.training
)
# List the components stored in the fitted object.
summary(spam.qda)

##         Length Class  Mode     
## prior      2   -none- numeric  
## counts     2   -none- numeric  
## means    114   -none- numeric  
## scaling 6498   -none- numeric  
## ldet       2   -none- numeric  
## lev        2   -none- character
## N          1   -none- numeric  
## call       3   -none- call     
## terms      3   terms  call     
## xlevels    0   -none- list

Checking Predicted Results

# Confusion matrix of the QDA model on the training sample
# (rows: true class, columns: predicted class).
table(
  actual = spambase.training$is_spam,
  predicted = predict(spam.qda, newdata = spambase.training)$class
)

##       predicted
## actual    0    1
##      0 1445  507
##      1   61 1209

# Confusion matrix of the QDA model on the held-out validation sample
# (rows: true class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.qda, newdata = spambase.validation)$class
)

##       predicted
## actual   0   1
##      0 647 189
##      1  31 512

Flexible discriminant analysis
fits a model by nonparametric regression

library(mda)

## Warning: package 'mda' was built under R version 3.1.2

## Loading required package: class
## Loaded mda 0.4-4

# Fit a flexible discriminant analysis model on the training sample.
spam.fda <- fda(
  formula = is_spam ~ .,
  data = spambase.training
)
# Confusion matrix on the validation sample; type = "class" asks predict()
# for class labels.
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.fda, newdata = spambase.validation, type = "class")
)

##       predicted
## actual   0   1
##      0 796  40
##      1 118 425

Mixture discriminant analysis
represents each class with a Gaussian mixture

# Fit a mixture discriminant analysis model on the training sample.
spam.mda <- mda(
  formula = is_spam ~ .,
  data = spambase.training
)
# Confusion matrix on the validation sample; type = "class" asks predict()
# for class labels.
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.mda, newdata = spambase.validation, type = "class")
)

##       predicted
## actual   0   1
##      0 796  40
##      1 115 428

k Nearest Neighbors
1. Specifies a training data set.
2. To predict the class of a new value, the algorithm looks for the k observations in the training set that are closest to the new value.
3. The prediction for the new value is the class of the majority of the k nearest neighbors.

# k-nearest-neighbour classification of the validation sample with k = 5.
# knn() measures distances over every column it is handed, so the response
# column must be removed from both `train` and `test`. Leaving is_spam in
# would let the true validation labels take part in the neighbour search.
# NOTE(review): results recorded from the earlier call, which included
# is_spam among the features, will differ -- re-run to refresh them.
spambase.knn <- knn(
  train = spambase.training[, names(spambase.training) != "is_spam"],
  test = spambase.validation[, names(spambase.validation) != "is_spam"],
  cl = spambase.training$is_spam,
  k = 5
)
# Number of validation messages assigned to each class.
summary(spambase.knn)

##   0   1 
## 856 523

# Cross-tabulate the kNN predictions (rows) against the true validation
# labels (columns). The column label is spelled "actual".
table(predicted = spambase.knn, actual = spambase.validation$is_spam)

##          acttual
## predicted   0   1
##         0 717 139
##         1 119 404

Classification and regression trees model (CART)

Grow the tree using the following (recursive) method:

Start with a single set containing all the training data.
If the number of observations is less than the minimum required for a split, stop splitting the tree. Output the average of all the y-values in the training data as the predicted value for the terminal node.
Find a variable \(x_j\) and a value \(s\) that minimize the RMS error when you split the data into two sets, then repeat the splitting process on each of the two resulting sets.

Prune the tree using the following (iterative) method:

Stop if there is only one node in the tree.
Measure the cost/complexity of the overall tree.
Try collapsing each internal node on the tree and measure which subtree has the best cost/complexity.
Repeat the process on the subtree with the best cost/complexity.

Output the tree with the lowest cost/complexity.

# rpart() and printcp() are provided by rpart.
library(rpart)
# Grow a tree for is_spam from all other columns of the training sample,
# using rpart's default settings.
spam.tree <- rpart(is_spam ~ ., data = spambase.training)
# Print the complexity-parameter table of the fitted tree.
printcp(spam.tree)

## 
## Classification tree:
## rpart(formula = is_spam ~ ., data = spambase.training)
## 
## Variables actually used in tree construction:
## [1] capital_run_length_average char_freq_dollar          
## [3] char_freq_exclamation      word_freq_free            
## [5] word_freq_hp               word_freq_internet        
## [7] word_freq_remove          
## 
## Root node error: 1270/3222 = 0.39417
## 
## n= 3222 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.492126      0   1.00000 1.00000 0.021841
## 2 0.069291      1   0.50787 0.50945 0.017905
## 3 0.065354      2   0.43858 0.46457 0.017286
## 4 0.058268      3   0.37323 0.40866 0.016430
## 5 0.025197      4   0.31496 0.33622 0.015154
## 6 0.022047      5   0.28976 0.32126 0.014864
## 7 0.019685      6   0.26772 0.28504 0.014115
## 8 0.011811      7   0.24803 0.26378 0.013642
## 9 0.010000      8   0.23622 0.25276 0.013386

Plotting Classification Tree

library(maptree)

## Loading required package: cluster

# Plot the fitted tree with per-node information, small text, and a
# nine-step gray palette. TRUE is spelled out because the shorthand T is an
# ordinary variable that can be reassigned.
draw.tree(
  spam.tree,
  nodeinfo = TRUE,
  cex = 0.5,
  col = gray((0:8) / 8)
)

# Confusion matrix of the classification tree on the validation sample
# (rows: true class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  prediction = predict(spam.tree, newdata = spambase.validation, type = "class")
)

##       prediction
## actual   0   1
##      0 772  64
##      1  88 455

Bagging (bootstrap aggregation)

Bagging builds predictive models based on other models (most commonly trees).
Use bootstrapping to build a number of different models and then average the results.

library(adabag)

## Warning: package 'adabag' was built under R version 3.1.2

## Loading required package: mlbench

## Warning: package 'mlbench' was built under R version 3.1.2

# Fit a bagged ensemble for is_spam on the training sample, using the
# default settings of bagging().
spam.bag <- bagging(
  formula = is_spam ~ .,
  data = spambase.training
)
# List the components stored in the fitted ensemble.
summary(spam.bag)

##            Length Class   Mode     
## formula         3 formula call     
## trees         100 -none-  list     
## votes        6444 -none-  numeric  
## prob         6444 -none-  numeric  
## class        3222 -none-  character
## samples    322200 -none-  numeric  
## importance     57 -none-  numeric

# Confusion matrix of the bagged model on the training sample
# (rows: true class, columns: predicted class).
table(
  actual = spambase.training$is_spam,
  predicted = predict(spam.bag, newdata = spambase.training)$class
)

##       predicted
## actual    0    1
##      0 1834  118
##      1  164 1106

# Confusion matrix of the bagged model on the validation sample
# (rows: true class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.bag, newdata = spambase.validation)$class
)

##       predicted
## actual   0   1
##      0 783  53
##      1  85 458

Boosting

Individual models don’t all have equal votes. Better models are given stronger votes.

library(ada)

## Warning: package 'ada' was built under R version 3.1.2

# Fit a boosted model for is_spam on the training sample with logistic loss.
spam.ada <- ada(
  formula = is_spam ~ .,
  data = spambase.training,
  loss = "logistic"
)
# Display the fitted model.
spam.ada

## Call:
## ada(is_spam ~ ., data = spambase.training, loss = "logistic")
## 
## Loss: logistic Method: discrete   Iteration: 50 
## 
## Final Confusion Matrix for Data:
##           Final Prediction
## True value    0    1
##          0 1936   16
##          1   37 1233
## 
## Train Error: 0.016 
## 
## Out-Of-Bag Error:  0.034  iteration= 50 
## 
## Additional Estimates of number of iterations:
## 
## train.err1 train.kap1 
##         50         50

# Confusion matrix of the boosted model on the training sample
# (rows: true class, columns: predicted class).
table(
  actual = spambase.training$is_spam,
  predicted = predict(spam.ada, newdata = spambase.training)
)

##       predicted
## actual    0    1
##      0 1936   16
##      1   37 1233

# Confusion matrix of the boosted model on the validation sample
# (rows: true class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.ada, newdata = spambase.validation)
)

##       predicted
## actual   0   1
##      0 809  27
##      1  38 505

Neural Networks

# nnet() is provided by nnet.
library(nnet)
# Fit a neural network for is_spam on the training sample, with size = 10 and
# decay = 0.1 passed to nnet().
spam.nnet <- nnet(
  is_spam ~ .,
  data = spambase.training,
  size = 10,
  decay = 0.1
)

## # weights:  591
## initial  value 2354.815458 
## iter  10 value 1812.585455
## iter  20 value 1156.545027
## iter  30 value 798.502404
## iter  40 value 589.741218
## iter  50 value 539.109435
## iter  60 value 475.549209
## iter  70 value 465.069186
## iter  80 value 439.510277
## iter  90 value 412.013948
## iter 100 value 405.223837
## final  value 405.223837 
## stopped after 100 iterations

# Confusion matrix of the network on the training sample. predict() is called
# without newdata here, so only the fitted model is passed.
table(
  actual = spambase.training$is_spam,
  predicted = predict(spam.nnet, type = "class")
)

##       predicted
## actual    0    1
##      0 1905   47
##      1   74 1196

# Confusion matrix of the network on the validation sample
# (rows: true class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.nnet, newdata = spambase.validation, type = "class")
)

##       predicted
## actual   0   1
##      0 803  33
##      1  41 502

Support Vector Machine

# svm() is provided by e1071.
library(e1071)
# Fit a support vector machine for is_spam on the training sample, using the
# default settings of svm().
spam.svm <- svm(is_spam ~ ., data = spambase.training)
# Display the fitted model.
spam.svm

## 
## Call:
## svm(formula = is_spam ~ ., data = spambase.training)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.01754386 
## 
## Number of Support Vectors:  973

# Confusion matrix of the SVM on the validation sample
# (rows: true class, columns: predicted class).
# predict() on an svm fit has no `type` argument; the former type = "class"
# was silently absorbed by `...`, so it is dropped here. The predictions are
# unchanged.
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.svm, newdata = spambase.validation)
)

##       predicted
## actual   0   1
##      0 803  33
##      1  63 480

Random Forest

library(randomForest)

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

# Fit a random forest for is_spam on the training sample, using the default
# settings of randomForest().
spam.rf <- randomForest(is_spam ~ ., data = spambase.training)
# Display the fitted forest.
spam.rf

## 
## Call:
##  randomForest(formula = is_spam ~ ., data = spambase.training) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 4.81%
## Confusion matrix:
##      0    1 class.error
## 0 1899   53  0.02715164
## 1  102 1168  0.08031496

# Confusion matrix of the random forest on the validation sample
# (rows: true class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.rf, newdata = spambase.validation, type = "class")
)

##       predicted
## actual   0   1
##      0 813  23
##      1  44 499