Document Classification

Load the data file into R:

The dataset is based on real examples of spam emails, available at https://archive.ics.uci.edu/ml/datasets/Spambase. The dataset contains 57 attributes. Most attributes refer to the occurrence or frequency of a specific word in the Spambase email files.

It also contains attributes that give the frequency of certain characters in the email, such as “;”, “$”, and “!”, as well as attributes describing runs of capital letters.

The last column, y, denotes whether the email was considered spam (1) or ham, i.e. not spam (0).

# Read the pipe-delimited Spambase data from the GitHub repository into R.
file <- "https://raw.githubusercontent.com/isrini/SI_IS607/master/data.csv"
data <- read.csv(file, header = TRUE, sep = "|")
# Make the last column 'y' (spam yes or no) a factor variable for binary
# classification.
data$y <- as.factor(data$y)

# Work on a random sample of (at most) 500 rows. The seed is fixed so that
# the sample, and everything computed from it, is reproducible between runs;
# min() guards against a file that has fewer than 500 rows.
set.seed(607)
data <- data[sample(nrow(data), min(500, nrow(data))), ]
colnames(data)
##  [1] "word_freq_make"             "word_freq_address"         
##  [3] "word_freq_all"              "word_freq_3d"              
##  [5] "word_freq_our"              "word_freq_over"            
##  [7] "word_freq_remove"           "word_freq_internet"        
##  [9] "word_freq_order"            "word_freq_mail"            
## [11] "word_freq_receive"          "word_freq_will"            
## [13] "word_freq_people"           "word_freq_report"          
## [15] "word_freq_addresses"        "word_freq_free"            
## [17] "word_freq_business"         "word_freq_email"           
## [19] "word_freq_you"              "word_freq_credit"          
## [21] "word_freq_your"             "word_freq_font"            
## [23] "word_freq_000"              "word_freq_money"           
## [25] "word_freq_hp"               "word_freq_hpl"             
## [27] "word_freq_george"           "word_freq_650"             
## [29] "word_freq_lab"              "word_freq_labs"            
## [31] "word_freq_telnet"           "word_freq_857"             
## [33] "word_freq_data"             "word_freq_415"             
## [35] "word_freq_85"               "word_freq_technology"      
## [37] "word_freq_1999"             "word_freq_parts"           
## [39] "word_freq_pm"               "word_freq_direct"          
## [41] "word_freq_cs"               "word_freq_meeting"         
## [43] "word_freq_original"         "word_freq_project"         
## [45] "word_freq_re"               "word_freq_edu"             
## [47] "word_freq_table"            "word_freq_conference"      
## [49] "char_freq_."                "char_freq_..1"             
## [51] "char_freq_..2"              "char_freq_..3"             
## [53] "char_freq_"                 "char_freq_..4"             
## [55] "capital_run_length_average" "capital_run_length_longest"
## [57] "capital_run_length_total"   "y"

Load the library packages

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(e1071)

Split the data into two sets: train and test

# Split data into a train and a test set.
# NOTE: despite its name, `trainIndex` holds the row positions of the
# held-out TEST set (a random quarter of the rows); all remaining rows form
# the training set.
i <- seq_len(nrow(data))  # unlike 1:nrow(data), correct for zero rows
trainIndex <- sample(i, trunc(length(i) / 4))

data_test <- data[trainIndex, ]
# setdiff() instead of negative indexing: data[-integer(0), ] would return
# zero rows if the test sample happened to be empty.
data_train <- data[setdiff(i, trainIndex), ]

Create the SVM and rpart models

# Fit the two classifiers on the training set only: a support vector machine
# (svm()) and a partitioning tree (rpart()). In both formulas `y` is the
# response and every other column of data_train is a predictor; the fitted
# models are used afterwards to predict the test set values.
svm.model   <- svm(y ~ ., data = data_train)
rpart.model <- rpart(y ~ ., data = data_train)

Evaluate the model for prediction

# Predict the class of every test-set row with both models. The response
# column is dropped by name rather than by position (58), so the code keeps
# working if columns are reordered or the number of attributes changes.
test_predictors <- data_test[, setdiff(names(data_test), "y"), drop = FALSE]
svm.pred <- predict(svm.model, test_predictors)
# type = "class" makes rpart return predicted labels, not class probabilities.
rpart.pred <- predict(rpart.model, test_predictors, type = "class")

A cross-tabulation of the true versus the predicted values yields the following confusion matrices:

# Compute the SVM confusion matrix (rows: predicted class, columns: true
# class). The true labels are taken by column name instead of position 58.
svm.cm <- table(pred = svm.pred, true = data_test[["y"]])
svm.cm
##     true
## pred  0  1
##    0 69  5
##    1  3 48
# Compute the rpart confusion matrix (rows: predicted class, columns: true
# class). The true labels are taken by column name instead of position 58.
rpart.cm <- table(pred = rpart.pred, true = data_test[["y"]])
rpart.cm
##     true
## pred  0  1
##    0 66 10
##    1  6 43
# Compare the performance of the two methods by computing the respective
# accuracy rates and the kappa indices. classAgreement() takes a confusion
# matrix and reports $diag (share of cases on the main diagonal, i.e. the
# accuracy), $kappa, $rand and $crand. First the SVM:

classAgreement(svm.cm)
## $diag
## [1] 0.936
## 
## $kappa
## [1] 0.8683171
## 
## $rand
## [1] 0.8792258
## 
## $crand
## [1] 0.7583557
# Agreement measures for the rpart confusion matrix.
classAgreement(rpart.cm)
## $diag
## [1] 0.872
## 
## $kappa
## [1] 0.7353097
## 
## $rand
## [1] 0.7749677
## 
## $crand
## [1] 0.5496683

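The results are broadly similar for the SVM and rpart models, although the SVM performs somewhat better on this sample (accuracy 0.936 vs. 0.872; kappa 0.868 vs. 0.735).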