The dataset is based on real examples of spam emails from the UCI Spambase data set (https://archive.ics.uci.edu/ml/datasets/Spambase). The dataset contains 57 predictor attributes plus a class label. Most attributes give the frequency of occurrence of a specific word in the Spambase emails.
It also contains attributes that give the frequency of certain characters in the email, such as “;”, “$”, and “!”.
The last column denotes whether the email was considered spam (1) or ham, i.e. not spam (0).
# Read the pipe-delimited spam data set from the author's GitHub repository
file <- "https://raw.githubusercontent.com/isrini/SI_IS607/master/data.csv"
data <- read.csv(file, header = TRUE, sep = "|")
# Make the last column 'y' (spam yes or no) a factor variable for binary
# classification
data$y <- as.factor(data$y)
# Fix the random seed so that the random draws in this script are reproducible
set.seed(607)
# Keep a random sample of 500 rows (or every row, shuffled, if the file has
# fewer than 500 rows, which would otherwise make sample() fail)
data <- data[sample(nrow(data), min(500, nrow(data))), ]
colnames(data)
## [1] "word_freq_make" "word_freq_address"
## [3] "word_freq_all" "word_freq_3d"
## [5] "word_freq_our" "word_freq_over"
## [7] "word_freq_remove" "word_freq_internet"
## [9] "word_freq_order" "word_freq_mail"
## [11] "word_freq_receive" "word_freq_will"
## [13] "word_freq_people" "word_freq_report"
## [15] "word_freq_addresses" "word_freq_free"
## [17] "word_freq_business" "word_freq_email"
## [19] "word_freq_you" "word_freq_credit"
## [21] "word_freq_your" "word_freq_font"
## [23] "word_freq_000" "word_freq_money"
## [25] "word_freq_hp" "word_freq_hpl"
## [27] "word_freq_george" "word_freq_650"
## [29] "word_freq_lab" "word_freq_labs"
## [31] "word_freq_telnet" "word_freq_857"
## [33] "word_freq_data" "word_freq_415"
## [35] "word_freq_85" "word_freq_technology"
## [37] "word_freq_1999" "word_freq_parts"
## [39] "word_freq_pm" "word_freq_direct"
## [41] "word_freq_cs" "word_freq_meeting"
## [43] "word_freq_original" "word_freq_project"
## [45] "word_freq_re" "word_freq_edu"
## [47] "word_freq_table" "word_freq_conference"
## [49] "char_freq_." "char_freq_..1"
## [51] "char_freq_..2" "char_freq_..3"
## [53] "char_freq_" "char_freq_..4"
## [55] "capital_run_length_average" "capital_run_length_longest"
## [57] "capital_run_length_total" "y"
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(e1071)
# Split data into a test set (a random quarter of the rows) and a training
# set (all remaining rows).
n_rows <- nrow(data)
# Row positions that go into the TEST set
test_index <- sample(seq_len(n_rows), trunc(n_rows / 4))
data_test <- data[test_index, ]
# Use setdiff() instead of negative indexing: data[-integer(0), ] would
# return zero rows if the test set happened to be empty.
data_train <- data[setdiff(seq_len(n_rows), test_index), ]
# Fit an SVM and a recursive-partitioning tree (rpart) on the training set,
# then predict the class of every test-set row with each model.
svm.model <- svm(y ~ ., data = data_train)
rpart.model <- rpart(y ~ ., data = data_train)
# Select the predictor columns by name rather than by position, so the
# response 'y' is dropped correctly even if the column layout changes.
test_features <- data_test[, setdiff(names(data_test), "y"), drop = FALSE]
svm.pred <- predict(svm.model, test_features)
# type = "class" asks rpart for predicted class labels
rpart.pred <- predict(rpart.model, test_features, type = "class")
# Compute the SVM confusion matrix: predicted class (rows) against the true
# class of the test set (columns). The response is looked up by name instead
# of by its column position.
svm.cm <- table(pred = svm.pred, true = data_test[["y"]])
svm.cm
## true
## pred 0 1
## 0 69 5
## 1 3 48
# Compute the rpart confusion matrix: predicted class (rows) against the true
# class of the test set (columns). The response is looked up by name instead
# of by its column position.
rpart.cm <- table(pred = rpart.pred, true = data_test[["y"]])
rpart.cm
## true
## pred 0 1
## 0 66 10
## 1 6 43
# Compare the two methods through agreement measures computed from each
# confusion matrix: $diag is the accuracy rate and $kappa the kappa index.
# SVM first:
classAgreement(tab = svm.cm)
## $diag
## [1] 0.936
##
## $kappa
## [1] 0.8683171
##
## $rand
## [1] 0.8792258
##
## $crand
## [1] 0.7583557
# Agreement measures (accuracy in $diag, kappa index in $kappa) for rpart
classAgreement(tab = rpart.cm)
## $diag
## [1] 0.872
##
## $kappa
## [1] 0.7353097
##
## $rand
## [1] 0.7749677
##
## $crand
## [1] 0.5496683
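The results are broadly similar for the SVM and rpart models, although on this test split the SVM scores somewhat higher (accuracy 0.936 vs. 0.872; kappa 0.868 vs. 0.735).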