Read the dataset and divide the data into a 70% training sample and a 30% validation sample
library(nutshell)
## Warning: package 'nutshell' was built under R version 3.1.2
## Loading required package: nutshell.bbdb
## Warning: package 'nutshell.bbdb' was built under R version 3.1.2
## Loading required package: nutshell.audioscrobbler
## Warning: package 'nutshell.audioscrobbler' was built under R version 3.1.2
# Load the `spambase` data set into the workspace.
data(spambase)
library(caret)
## Warning: package 'caret' was built under R version 3.1.2
## Loading required package: lattice
## Loading required package: ggplot2
# Split spambase into a 70% training sample and a 30% validation sample,
# partitioning on the response `is_spam`. The seed is fixed so that the
# same rows are drawn on every run.
set.seed(100)
# `list = FALSE` is spelled out in full: `F` is an ordinary variable that
# can be reassigned, whereas `FALSE` is a reserved word.
inTrain <- createDataPartition(y = spambase$is_spam, p = 0.7, list = FALSE)
spambase.training <- spambase[inTrain, ]
# Negative indexing keeps every row that was not drawn for training.
spambase.validation <- spambase[-inTrain, ]
Quadratic Discriminant Analysis
library(MASS)
# Fit a quadratic discriminant model of `is_spam` on all other columns of
# the training sample, then list the components of the fitted object.
spam.qda <- qda(
  formula = is_spam ~ .,
  data = spambase.training
)
summary(spam.qda)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 114 -none- numeric
## scaling 6498 -none- numeric
## ldet 2 -none- numeric
## lev 2 -none- character
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
Checking Predicted Results
# Confusion matrix of the QDA model on the training sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.training$is_spam,
  predicted = predict(spam.qda, newdata = spambase.training)$class
)
## predicted
## actual 0 1
## 0 1445 507
## 1 61 1209
# Confusion matrix of the QDA model on the held-out validation sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.qda, newdata = spambase.validation)$class
)
## predicted
## actual 0 1
## 0 647 189
## 1 31 512
Flexible discriminant analysis
Fits a model by nonparametric regression.
library(mda)
## Warning: package 'mda' was built under R version 3.1.2
## Loading required package: class
## Loaded mda 0.4-4
# Fit the flexible discriminant model on the training sample, then
# cross-tabulate actual against predicted classes on the validation sample.
spam.fda <- fda(
  formula = is_spam ~ .,
  data = spambase.training
)
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.fda, newdata = spambase.validation, type = "class")
)
## predicted
## actual 0 1
## 0 796 40
## 1 118 425
Mixture discriminant analysis
Represents each class with a Gaussian mixture.
# Fit the mixture discriminant model on the training sample, then
# cross-tabulate actual against predicted classes on the validation sample.
spam.mda <- mda(
  formula = is_spam ~ .,
  data = spambase.training
)
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.mda, newdata = spambase.validation, type = "class")
)
## predicted
## actual 0 1
## 0 796 40
## 1 115 428
k Nearest Neighbors
1. Specify a training data set.
2. To predict the class of a new value, the algorithm looks for the k observations in the training set that are closest to the new value.
3. The prediction for the new value is the class of the majority of the k nearest neighbors.
# Classify each validation row by majority vote among its k = 5 nearest
# training rows.
#
# The response column is removed from both feature sets. Passing the full
# data frames as `train`/`test` made `is_spam` itself one of the distance
# features, which leaks the label being predicted into the classifier.
#
# NOTE: the output recorded after this chunk was produced by the earlier
# call that still included `is_spam`; re-run the document to refresh it.
knn.features <- setdiff(names(spambase.training), "is_spam")
spambase.knn <- knn(
  train = spambase.training[, knn.features],
  test = spambase.validation[, knn.features],
  cl = spambase.training$is_spam,
  k = 5
)
# Count of validation rows assigned to each class.
summary(spambase.knn)
## 0 1
## 856 523
# Confusion matrix of the k-NN predictions on the validation sample.
# The dimension label is now spelled `actual` (it was `acttual`), and the
# arguments follow the same order as the other confusion matrices in this
# document (rows: actual class, columns: predicted class).
# NOTE: the output recorded below predates this change and shows the
# transposed layout; re-run the document to refresh it.
table(actual = spambase.validation$is_spam, predicted = spambase.knn)
## acttual
## predicted 0 1
## 0 717 139
## 1 119 404
Classification and regression trees model (CART)
Start with a single set containing all the training data.
If the number of observations is less than the minimum required for a split, stop splitting the tree. Output the average of all the y-values in the training data as the predicted value for the terminal node.
Find a variable \(x_{j}\) and a value \(s\) that minimize the RMS error when you split the data into two sets.
Stop if there is only one node in the tree.
Measure the cost/complexity of the overall tree.
Try collapsing each internal node on the tree and measure which subtree has the best cost/complexity.
Repeat the process on the subtree with the best cost/complexity.
library(rpart)
# Grow a tree for `is_spam` from every other column of the training sample.
spam.tree <- rpart(
  is_spam ~ .,
  data = spambase.training
)
# Print the complexity-parameter table of the fitted tree.
printcp(spam.tree)
##
## Classification tree:
## rpart(formula = is_spam ~ ., data = spambase.training)
##
## Variables actually used in tree construction:
## [1] capital_run_length_average char_freq_dollar
## [3] char_freq_exclamation word_freq_free
## [5] word_freq_hp word_freq_internet
## [7] word_freq_remove
##
## Root node error: 1270/3222 = 0.39417
##
## n= 3222
##
## CP nsplit rel error xerror xstd
## 1 0.492126 0 1.00000 1.00000 0.021841
## 2 0.069291 1 0.50787 0.50945 0.017905
## 3 0.065354 2 0.43858 0.46457 0.017286
## 4 0.058268 3 0.37323 0.40866 0.016430
## 5 0.025197 4 0.31496 0.33622 0.015154
## 6 0.022047 5 0.28976 0.32126 0.014864
## 7 0.019685 6 0.26772 0.28504 0.014115
## 8 0.011811 7 0.24803 0.26378 0.013642
## 9 0.010000 8 0.23622 0.25276 0.013386
library(maptree)
## Loading required package: cluster
# Plot the fitted tree with per-node information, a reduced font size and
# a nine-step grey palette. `nodeinfo = TRUE` is spelled out in full:
# `T` is an ordinary variable that can be reassigned.
draw.tree(spam.tree, nodeinfo = TRUE, cex = 0.5, col = gray(0:8 / 8))
# Confusion matrix of the tree on the validation sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  prediction = predict(spam.tree, newdata = spambase.validation, type = "class")
)
## prediction
## actual 0 1
## 0 772 64
## 1 88 455
Bagging: build predictive models based on other models (most commonly trees).
Use bootstrapping to build a number of different models and then average the results.
library(adabag)
## Warning: package 'adabag' was built under R version 3.1.2
## Loading required package: mlbench
## Warning: package 'mlbench' was built under R version 3.1.2
# Fit a bagged ensemble for `is_spam` on the training sample, then list
# the components of the fitted object.
spam.bag <- bagging(
  formula = is_spam ~ .,
  data = spambase.training
)
summary(spam.bag)
## Length Class Mode
## formula 3 formula call
## trees 100 -none- list
## votes 6444 -none- numeric
## prob 6444 -none- numeric
## class 3222 -none- character
## samples 322200 -none- numeric
## importance 57 -none- numeric
# Confusion matrix of the bagged ensemble on the training sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.training$is_spam,
  predicted = predict(spam.bag, newdata = spambase.training)$class
)
## predicted
## actual 0 1
## 0 1834 118
## 1 164 1106
# Confusion matrix of the bagged ensemble on the validation sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.bag, newdata = spambase.validation)$class
)
## predicted
## actual 0 1
## 0 783 53
## 1 85 458
Boosting: individual models don’t all have equal votes. Better models are given stronger votes.
library(ada)
## Warning: package 'ada' was built under R version 3.1.2
# Fit a boosted model for `is_spam` with logistic loss on the training
# sample, then print the fitted object.
spam.ada <- ada(
  formula = is_spam ~ .,
  data = spambase.training,
  loss = "logistic"
)
spam.ada
## Call:
## ada(is_spam ~ ., data = spambase.training, loss = "logistic")
##
## Loss: logistic Method: discrete Iteration: 50
##
## Final Confusion Matrix for Data:
## Final Prediction
## True value 0 1
## 0 1936 16
## 1 37 1233
##
## Train Error: 0.016
##
## Out-Of-Bag Error: 0.034 iteration= 50
##
## Additional Estimates of number of iterations:
##
## train.err1 train.kap1
## 50 50
# Confusion matrix of the boosted model on the training sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.training$is_spam,
  predicted = predict(spam.ada, newdata = spambase.training)
)
## predicted
## actual 0 1
## 0 1936 16
## 1 37 1233
# Confusion matrix of the boosted model on the validation sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.ada, newdata = spambase.validation)
)
## predicted
## actual 0 1
## 0 809 27
## 1 38 505
library(nnet)
# Fit a neural network for `is_spam` on the training sample, passing
# size = 10 and decay = 0.1 to nnet().
spam.nnet <- nnet(
  is_spam ~ .,
  data = spambase.training,
  size = 10,
  decay = 0.1
)
## # weights: 591
## initial value 2354.815458
## iter 10 value 1812.585455
## iter 20 value 1156.545027
## iter 30 value 798.502404
## iter 40 value 589.741218
## iter 50 value 539.109435
## iter 60 value 475.549209
## iter 70 value 465.069186
## iter 80 value 439.510277
## iter 90 value 412.013948
## iter 100 value 405.223837
## final value 405.223837
## stopped after 100 iterations
# Confusion matrix of the neural network on the training sample. No
# `newdata` is passed to predict() here, unlike the validation table.
table(
  actual = spambase.training$is_spam,
  predicted = predict(spam.nnet, type = "class")
)
## predicted
## actual 0 1
## 0 1905 47
## 1 74 1196
# Confusion matrix of the neural network on the validation sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.nnet, newdata = spambase.validation, type = "class")
)
## predicted
## actual 0 1
## 0 803 33
## 1 41 502
library(e1071)
# Fit a support vector machine for `is_spam` on the training sample with
# the default settings, then print the fitted object.
spam.svm <- svm(
  is_spam ~ .,
  data = spambase.training
)
print(spam.svm)
##
## Call:
## svm(formula = is_spam ~ ., data = spambase.training)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01754386
##
## Number of Support Vectors: 973
# Confusion matrix of the support vector machine on the validation sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.svm, newdata = spambase.validation, type = "class")
)
## predicted
## actual 0 1
## 0 803 33
## 1 63 480
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Fit a random forest for `is_spam` on the training sample with the
# default settings, then print the fitted object.
spam.rf <- randomForest(
  is_spam ~ .,
  data = spambase.training
)
print(spam.rf)
##
## Call:
## randomForest(formula = is_spam ~ ., data = spambase.training)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 4.81%
## Confusion matrix:
## 0 1 class.error
## 0 1899 53 0.02715164
## 1 102 1168 0.08031496
# Confusion matrix of the random forest on the validation sample
# (rows: actual class, columns: predicted class).
table(
  actual = spambase.validation$is_spam,
  predicted = predict(spam.rf, newdata = spambase.validation, type = "class")
)
## predicted
## actual 0 1
## 0 813 23
## 1 44 499