# Download the UCI Spambase data set over HTTPS. The raw file has no header
# row, so it is read with header = FALSE and the columns are named explicitly
# below.
data <- read.csv(
  url("https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"),
  header = FALSE
)

# Without explicit names the columns would be V1..V58 and every later
# reference by name (data$spam, spam ~ ., char_freq_4) would fail.
# The special-character columns use the numeric suffixes 1-6 rather than the
# raw symbols (; ( [ ! $ #) so that all names are syntactic in formulas.
stopifnot(ncol(data) == 58L)
names(data) <- c(
  paste0("word_freq_", c(
    "make", "address", "all", "3d", "our", "over", "remove", "internet",
    "order", "mail", "receive", "will", "people", "report", "addresses",
    "free", "business", "email", "you", "credit", "your", "font", "000",
    "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
    "data", "415", "85", "technology", "1999", "parts", "pm", "direct",
    "cs", "meeting", "original", "project", "re", "edu", "table",
    "conference"
  )),
  paste0("char_freq_", 1:6),
  paste0("capital_run_length_", c("average", "longest", "total")),
  "spam"
)
The following variables, "char_freq_;", "char_freq_(", "char_freq_[", "char_freq_!", "char_freq_$", and "char_freq_#", were renamed to "char_freq_1", "char_freq_2", "char_freq_3", "char_freq_4", "char_freq_5", and "char_freq_6", respectively, to avoid a bug in the randomForest package.
# Recode the label column as a factor: a value of 0 becomes "nonspam" and
# anything else becomes "spam". Then tabulate the class counts.
data$spam <- factor(ifelse(data$spam == 0, "nonspam", "spam"))
table(data[["spam"]])
##
## nonspam spam
## 2788 1813
# Draw a partition id for every row: 1 (training) with probability 0.7 or
# 2 (testing) with probability 0.3. The fixed seed makes the split repeatable.
set.seed(seed = 120)
ind <- sample(x = 2, size = nrow(data), replace = TRUE, prob = c(0.7, 0.3))

# Materialise the two partitions and inspect the training set's structure.
train <- data[which(ind == 1), ]
test <- data[which(ind == 2), ]
str(train)
## 'data.frame': 3284 obs. of 58 variables:
## $ word_freq_make : num 0 0.21 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_address : num 0.64 0.28 0 0 0 0 0 0.69 0 0.42 ...
## $ word_freq_all : num 0.64 0.5 0.71 0 0 0 0 0.34 0 0.42 ...
## $ word_freq_3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_our : num 0.32 0.14 1.23 0.63 1.85 1.88 0 0.34 0.9 1.27 ...
## $ word_freq_over : num 0 0.28 0.19 0 0 0 0 0 0 0 ...
## $ word_freq_remove : num 0 0.21 0.19 0.31 0 0 0.96 0 0.9 0.42 ...
## $ word_freq_internet : num 0 0.07 0.12 0.63 1.85 1.88 0 0 0 0 ...
## $ word_freq_order : num 0 0 0.64 0.31 0 0 0 0 0 0 ...
## $ word_freq_mail : num 0 0.94 0.25 0.63 0 0 1.92 0 0.9 1.27 ...
## $ word_freq_receive : num 0 0.21 0.38 0.31 0 0 0.96 0 0.9 0 ...
## $ word_freq_will : num 0.64 0.79 0.45 0.31 0 0 0 0.69 0 0 ...
## $ word_freq_people : num 0 0.65 0.12 0.31 0 0 0 0 0.9 0 ...
## $ word_freq_report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ word_freq_addresses : num 0 0.14 1.75 0 0 0 0 0 0 0 ...
## $ word_freq_free : num 0.32 0.14 0.06 0.31 0 0 0 0.34 0 1.27 ...
## $ word_freq_business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_email : num 1.29 0.28 1.03 0 0 0 0.96 1.39 0 0 ...
## $ word_freq_you : num 1.93 3.47 1.36 3.18 0 0 3.84 2.09 2.72 1.7 ...
## $ word_freq_credit : num 0 0 0.32 0 0 0 0 0 0 0.42 ...
## $ word_freq_your : num 0.96 1.59 0.51 0.31 0 0 0.96 1.04 0.9 1.27 ...
## $ word_freq_font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_000 : num 0 0.43 1.16 0 0 0 0 0 0 0 ...
## $ word_freq_money : num 0 0.43 0.06 0 0 0 0 0 0 0.42 ...
## $ word_freq_hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_data : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_1999 : num 0 0.07 0 0 0 0 0 0 0 1.27 ...
## $ word_freq_parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_direct : num 0 0 0.06 0 0 0 0.96 0 0 0.42 ...
## $ word_freq_cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_original : num 0 0 0.12 0 0 0 0 0 0 0 ...
## $ word_freq_project : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ char_freq_1 : num 0 0 0.01 0 0 0 0 0 0 0 ...
## $ char_freq_2 : num 0 0.132 0.143 0.135 0.223 0.206 0 0.056 0 0.063 ...
## $ char_freq_3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ char_freq_4 : num 0.778 0.372 0.276 0.135 0 0 0.462 0.786 0 0.572 ...
## $ char_freq_5 : num 0 0.18 0.184 0 0 0 0 0 0 0.063 ...
## $ char_freq_6 : num 0 0.048 0.01 0 0 0 0 0 0 0 ...
## $ capital_run_length_average: num 3.76 5.11 9.82 3.54 3 ...
## $ capital_run_length_longest: int 61 101 485 40 15 11 6 61 7 55 ...
## $ capital_run_length_total : int 278 1028 2259 191 54 49 21 261 25 249 ...
## $ spam : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
The data was divided into training (about 70%) and testing (about 30%) datasets.
# Seed the random number generator before any model fitting so that the
# random forest results can be reproduced.
set.seed(seed = 230)
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit a classification forest predicting spam from every other column, using
# the package defaults, and print the fit summary (OOB error, confusion matrix).
rf <- randomForest(formula = spam ~ ., data = train)
print(x = rf)
##
## Call:
## randomForest(formula = spam ~ ., data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 4.75%
## Confusion matrix:
## nonspam spam class.error
## nonspam 1966 55 0.02721425
## spam 101 1162 0.07996833
We can see that the out-of-bag (OOB) estimate of the prediction error on unseen samples is 4.75% for the random forest.
# Score the training rows with the fitted forest and compare the predictions
# against the known training labels.
p1 <- predict(object = rf, newdata = train)
confusionMatrix(data = p1, reference = train$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 2021 8
## spam 0 1255
##
## Accuracy : 0.9976
## 95% CI : (0.9952, 0.9989)
## No Information Rate : 0.6154
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9948
##
## Mcnemar's Test P-Value : 0.01333
##
## Sensitivity : 1.0000
## Specificity : 0.9937
## Pos Pred Value : 0.9961
## Neg Pred Value : 1.0000
## Prevalence : 0.6154
## Detection Rate : 0.6154
## Detection Prevalence : 0.6178
## Balanced Accuracy : 0.9968
##
## 'Positive' Class : nonspam
##
We can see that the accuracy of the random forest on the training set is 99.76%, with all of the errors coming from spam mail labeled as nonspam.
# Score the held-out test rows and compare the predictions against the true
# test labels.
p2 <- predict(object = rf, newdata = test)
confusionMatrix(data = p2, reference = test$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 742 42
## spam 25 508
##
## Accuracy : 0.9491
## 95% CI : (0.9358, 0.9604)
## No Information Rate : 0.5824
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.895
##
## Mcnemar's Test P-Value : 0.05062
##
## Sensitivity : 0.9674
## Specificity : 0.9236
## Pos Pred Value : 0.9464
## Neg Pred Value : 0.9531
## Prevalence : 0.5824
## Detection Rate : 0.5634
## Detection Prevalence : 0.5953
## Balanced Accuracy : 0.9455
##
## 'Positive' Class : nonspam
##
We observe that for the unseen test set the accuracy drops to 94.91%. The resulting error rate (5.09%) is not significantly larger than the out-of-bag error rate (4.75%), meaning that this drop is expected.
# Plot the forest's error rates against the number of trees grown.
plot(x = rf)
We can see that the error rate of the random forest levels off somewhere between 100 and 200 trees.
# Search for a better mtry (variables tried per split) using OOB error.
# Column 58 is the spam response; the remaining 57 columns are predictors.
# Each trial grows 300 trees, and the search only continues while the
# relative OOB improvement is at least 0.01.
t <- tuneRF(
  x = train[, -58],
  y = train[, 58],
  ntreeTry = 300,
  stepFactor = 0.5,
  improve = 0.01,
  trace = TRUE,
  plot = TRUE
)
## mtry = 7 OOB error = 4.6%
## Searching left ...
## mtry = 14 OOB error = 4.78%
## -0.0397351 0.01
## Searching right ...
## mtry = 3 OOB error = 4.78%
## -0.0397351 0.01
My choice of 300 trees was based on computing time and accuracy. Running the tuneRF command, we can see that trying 7 variables at each split (mtry = 7) is the best choice for the random forest.
# Refit the forest with the tuned settings (300 trees, 7 variables per split),
# additionally requesting variable-importance measures and the proximity
# matrix.
# NOTE: the argument must be the lowercase `proximity`. R argument names are
# case-sensitive, so a capitalised `Proximity` matches no randomForest()
# parameter, is silently absorbed by `...`, and no proximity matrix is built.
rf <- randomForest(
  spam ~ .,
  data = train,
  ntree = 300,
  mtry = 7,
  importance = TRUE,
  proximity = TRUE
)
print(rf)
##
## Call:
## randomForest(formula = spam ~ ., data = train, ntree = 300, mtry = 7, importance = TRUE, Proximity = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 4.57%
## Confusion matrix:
## nonspam spam class.error
## nonspam 1964 57 0.02820386
## spam 93 1170 0.07363420
We can see that the out-of-bag error rate goes down slightly, from 4.75% to 4.57%.
# Re-score the held-out test rows with the tuned forest and compare the
# predictions against the true test labels.
p2 <- predict(object = rf, newdata = test)
confusionMatrix(data = p2, reference = test$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 740 42
## spam 27 508
##
## Accuracy : 0.9476
## 95% CI : (0.9342, 0.959)
## No Information Rate : 0.5824
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8919
##
## Mcnemar's Test P-Value : 0.09191
##
## Sensitivity : 0.9648
## Specificity : 0.9236
## Pos Pred Value : 0.9463
## Neg Pred Value : 0.9495
## Prevalence : 0.5824
## Detection Rate : 0.5619
## Detection Prevalence : 0.5938
## Balanced Accuracy : 0.9442
##
## 'Positive' Class : nonspam
##
Unexpectedly, the test error rate goes up slightly (accuracy falls from 94.91% to 94.76%). However, the accuracy of the random forest will most likely only fluctuate within the confidence interval depending on the number of trees. Therefore, the tuned random forest performs similarly to the untuned one.
# Histogram of the size of each tree in the forest, as reported by treesize().
hist(
  x = treesize(rf),
  col = "blue",
  main = "Number of Nodes for the Trees"
)
This histogram shows how many trees have a given number of nodes. As we can see, about 230 nodes appears to be the most frequent tree size.
# Plot the ten most important predictors, sorted by importance.
# Use the literal TRUE rather than T: T is an ordinary variable that can be
# reassigned, which would silently change the meaning of the call.
varImpPlot(
  rf,
  sort = TRUE,
  n.var = 10,
  main = "Top 10- Variable Importance"
)
We can see from the importance plot that char_freq_4 (originally char_freq_!) appears to be the most important variable in predicting spam.
# Show the structure of the first tree in the forest, with split variables
# and predicted classes labelled by name rather than by index.
getTree(rfobj = rf, k = 1, labelVar = TRUE)