Load the libraries

library(caret)

## Warning: package 'caret' was built under R version 3.5.3

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.5.3

library(MASS)

## Warning: package 'MASS' was built under R version 3.5.3

library(pROC)

## Warning: package 'pROC' was built under R version 3.5.3

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Read the data set

# Load the raw Spambase file; it has no header row, so read.csv assigns
# the default column names V1, V2, ...
spambase <- read.csv(file = "spambase.data.txt", header = FALSE)

# Preview the first rows of the data
head(x = spambase)

##     V1   V2   V3 V4   V5   V6   V7   V8   V9  V10  V11  V12  V13  V14  V15
## 1 0.00 0.64 0.64  0 0.32 0.00 0.00 0.00 0.00 0.00 0.00 0.64 0.00 0.00 0.00
## 2 0.21 0.28 0.50  0 0.14 0.28 0.21 0.07 0.00 0.94 0.21 0.79 0.65 0.21 0.14
## 3 0.06 0.00 0.71  0 1.23 0.19 0.19 0.12 0.64 0.25 0.38 0.45 0.12 0.00 1.75
## 4 0.00 0.00 0.00  0 0.63 0.00 0.31 0.63 0.31 0.63 0.31 0.31 0.31 0.00 0.00
## 5 0.00 0.00 0.00  0 0.63 0.00 0.31 0.63 0.31 0.63 0.31 0.31 0.31 0.00 0.00
## 6 0.00 0.00 0.00  0 1.85 0.00 0.00 1.85 0.00 0.00 0.00 0.00 0.00 0.00 0.00
##    V16  V17  V18  V19  V20  V21 V22  V23  V24 V25 V26 V27 V28 V29 V30 V31
## 1 0.32 0.00 1.29 1.93 0.00 0.96   0 0.00 0.00   0   0   0   0   0   0   0
## 2 0.14 0.07 0.28 3.47 0.00 1.59   0 0.43 0.43   0   0   0   0   0   0   0
## 3 0.06 0.06 1.03 1.36 0.32 0.51   0 1.16 0.06   0   0   0   0   0   0   0
## 4 0.31 0.00 0.00 3.18 0.00 0.31   0 0.00 0.00   0   0   0   0   0   0   0
## 5 0.31 0.00 0.00 3.18 0.00 0.31   0 0.00 0.00   0   0   0   0   0   0   0
## 6 0.00 0.00 0.00 0.00 0.00 0.00   0 0.00 0.00   0   0   0   0   0   0   0
##   V32 V33 V34 V35 V36  V37 V38 V39  V40 V41 V42  V43 V44  V45  V46 V47 V48
## 1   0   0   0   0   0 0.00   0   0 0.00   0   0 0.00   0 0.00 0.00   0   0
## 2   0   0   0   0   0 0.07   0   0 0.00   0   0 0.00   0 0.00 0.00   0   0
## 3   0   0   0   0   0 0.00   0   0 0.06   0   0 0.12   0 0.06 0.06   0   0
## 4   0   0   0   0   0 0.00   0   0 0.00   0   0 0.00   0 0.00 0.00   0   0
## 5   0   0   0   0   0 0.00   0   0 0.00   0   0 0.00   0 0.00 0.00   0   0
## 6   0   0   0   0   0 0.00   0   0 0.00   0   0 0.00   0 0.00 0.00   0   0
##    V49   V50 V51   V52   V53   V54   V55 V56  V57 V58
## 1 0.00 0.000   0 0.778 0.000 0.000 3.756  61  278   1
## 2 0.00 0.132   0 0.372 0.180 0.048 5.114 101 1028   1
## 3 0.01 0.143   0 0.276 0.184 0.010 9.821 485 2259   1
## 4 0.00 0.137   0 0.137 0.000 0.000 3.537  40  191   1
## 5 0.00 0.135   0 0.135 0.000 0.000 3.537  40  191   1
## 6 0.00 0.223   0 0.000 0.000 0.000 3.000  15   54   1

Assign column names

# Build the 58 descriptive column names from their parts: 48 word
# frequencies, 6 character frequencies, 3 capital-run-length statistics
# and the class label.
word_tokens <- c(
  "make", "address", "all", "3d", "our", "over", "remove", "internet",
  "order", "mail", "receive", "will", "people", "report", "addresses",
  "free", "business", "email", "you", "credit", "your", "font", "000",
  "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet", "857",
  "data", "415", "85", "technology", "1999", "parts", "pm", "direct",
  "cs", "meeting", "original", "project", "re", "edu", "table",
  "conference"
)
char_tokens <- c(";", "(", "[", "!", "$", "#")
run_length_stats <- c("average", "longest", "total")

colnames(spambase) <- c(
  paste0("word_freq_", word_tokens),
  paste0("char_freq_ch", char_tokens),
  paste0("capital_run_length_", run_length_stats),
  "spam"
)

# Preview the data again, now with the descriptive names
head(spambase)

##   word_freq_make word_freq_address word_freq_all word_freq_3d
## 1           0.00              0.64          0.64            0
## 2           0.21              0.28          0.50            0
## 3           0.06              0.00          0.71            0
## 4           0.00              0.00          0.00            0
## 5           0.00              0.00          0.00            0
## 6           0.00              0.00          0.00            0
##   word_freq_our word_freq_over word_freq_remove word_freq_internet
## 1          0.32           0.00             0.00               0.00
## 2          0.14           0.28             0.21               0.07
## 3          1.23           0.19             0.19               0.12
## 4          0.63           0.00             0.31               0.63
## 5          0.63           0.00             0.31               0.63
## 6          1.85           0.00             0.00               1.85
##   word_freq_order word_freq_mail word_freq_receive word_freq_will
## 1            0.00           0.00              0.00           0.64
## 2            0.00           0.94              0.21           0.79
## 3            0.64           0.25              0.38           0.45
## 4            0.31           0.63              0.31           0.31
## 5            0.31           0.63              0.31           0.31
## 6            0.00           0.00              0.00           0.00
##   word_freq_people word_freq_report word_freq_addresses word_freq_free
## 1             0.00             0.00                0.00           0.32
## 2             0.65             0.21                0.14           0.14
## 3             0.12             0.00                1.75           0.06
## 4             0.31             0.00                0.00           0.31
## 5             0.31             0.00                0.00           0.31
## 6             0.00             0.00                0.00           0.00
##   word_freq_business word_freq_email word_freq_you word_freq_credit
## 1               0.00            1.29          1.93             0.00
## 2               0.07            0.28          3.47             0.00
## 3               0.06            1.03          1.36             0.32
## 4               0.00            0.00          3.18             0.00
## 5               0.00            0.00          3.18             0.00
## 6               0.00            0.00          0.00             0.00
##   word_freq_your word_freq_font word_freq_000 word_freq_money word_freq_hp
## 1           0.96              0          0.00            0.00            0
## 2           1.59              0          0.43            0.43            0
## 3           0.51              0          1.16            0.06            0
## 4           0.31              0          0.00            0.00            0
## 5           0.31              0          0.00            0.00            0
## 6           0.00              0          0.00            0.00            0
##   word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 1             0                0             0             0
## 2             0                0             0             0
## 3             0                0             0             0
## 4             0                0             0             0
## 5             0                0             0             0
## 6             0                0             0             0
##   word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 1              0                0             0              0
## 2              0                0             0              0
## 3              0                0             0              0
## 4              0                0             0              0
## 5              0                0             0              0
## 6              0                0             0              0
##   word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 1             0            0                    0           0.00
## 2             0            0                    0           0.07
## 3             0            0                    0           0.00
## 4             0            0                    0           0.00
## 5             0            0                    0           0.00
## 6             0            0                    0           0.00
##   word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 1               0            0             0.00            0
## 2               0            0             0.00            0
## 3               0            0             0.06            0
## 4               0            0             0.00            0
## 5               0            0             0.00            0
## 6               0            0             0.00            0
##   word_freq_meeting word_freq_original word_freq_project word_freq_re
## 1                 0               0.00                 0         0.00
## 2                 0               0.00                 0         0.00
## 3                 0               0.12                 0         0.06
## 4                 0               0.00                 0         0.00
## 5                 0               0.00                 0         0.00
## 6                 0               0.00                 0         0.00
##   word_freq_edu word_freq_table word_freq_conference char_freq_ch;
## 1          0.00               0                    0          0.00
## 2          0.00               0                    0          0.00
## 3          0.06               0                    0          0.01
## 4          0.00               0                    0          0.00
## 5          0.00               0                    0          0.00
## 6          0.00               0                    0          0.00
##   char_freq_ch( char_freq_ch[ char_freq_ch! char_freq_ch$ char_freq_ch#
## 1         0.000             0         0.778         0.000         0.000
## 2         0.132             0         0.372         0.180         0.048
## 3         0.143             0         0.276         0.184         0.010
## 4         0.137             0         0.137         0.000         0.000
## 5         0.135             0         0.135         0.000         0.000
## 6         0.223             0         0.000         0.000         0.000
##   capital_run_length_average capital_run_length_longest
## 1                      3.756                         61
## 2                      5.114                        101
## 3                      9.821                        485
## 4                      3.537                         40
## 5                      3.537                         40
## 6                      3.000                         15
##   capital_run_length_total spam
## 1                      278    1
## 2                     1028    1
## 3                     2259    1
## 4                      191    1
## 5                      191    1
## 6                       54    1

# Bar chart of how many rows fall into each value of the spam label
class_counts <- table(spambase$spam)
barplot(class_counts, main = "Spam vs Non Spam")

In the data, most of the emails are non-spam.

Split into training and test set

# Fix the RNG seed so the shuffle and the partition below are reproducible
set.seed(123)

# Randomly permute the rows of the data set
shuffled_rows <- sample(nrow(spambase))
spambase <- spambase[shuffled_rows, ]

# Convert the spam label to a factor
spambase$spam <- as.factor(spambase$spam)

# Partition on the label with p = 0.70 for training; every row not
# selected goes to the test set
index <- createDataPartition(y = spambase$spam, p = 0.70, list = FALSE)
training <- spambase[index, ]
test <- spambase[-index, ]

Build model on Training Data

I’ve used SVM, KNN and LDA for spambase classification.

# Resampling scheme shared by all models: 10-fold cross-validation,
# repeated 10 times
train_ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# Fit each classifier on the training set, using every other column as a
# predictor of `spam`
model_svm <- train(
  spam ~ .,
  data = training,
  method = "svmLinear",
  trControl = train_ctrl
)
model_lda <- train(
  spam ~ .,
  data = training,
  method = "lda",
  trControl = train_ctrl
)
model_knn <- train(
  spam ~ .,
  data = training,
  method = "knn",
  trControl = train_ctrl
)

Do prediction

# Predict on the test rows with column 58 (the spam label) removed
test_features <- test[, -58]

pred_svm <- predict(model_svm, newdata = test_features)
pred_lda <- predict(model_lda, newdata = test_features)
pred_knn <- predict(model_knn, newdata = test_features)

Measure Accuracy on Test Data

Confusion Matrix

# Label for the SVM results
paste("SVM confusion matrix")

## [1] "SVM confusion matrix"

# Compare SVM predictions against the true test labels
confusionMatrix(data = pred_svm, reference = test$spam)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 788  60
##          1  48 483
##                                           
##                Accuracy : 0.9217          
##                  95% CI : (0.9062, 0.9353)
##     No Information Rate : 0.6062          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8353          
##                                           
##  Mcnemar's Test P-Value : 0.2898          
##                                           
##             Sensitivity : 0.9426          
##             Specificity : 0.8895          
##          Pos Pred Value : 0.9292          
##          Neg Pred Value : 0.9096          
##              Prevalence : 0.6062          
##          Detection Rate : 0.5714          
##    Detection Prevalence : 0.6149          
##       Balanced Accuracy : 0.9160          
##                                           
##        'Positive' Class : 0               
##

# Label for the LDA results
paste("LDA confusion matrix")

## [1] "LDA confusion matrix"

# Compare LDA predictions against the true test labels
confusionMatrix(data = pred_lda, reference = test$spam)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 781 111
##          1  55 432
##                                           
##                Accuracy : 0.8796          
##                  95% CI : (0.8613, 0.8963)
##     No Information Rate : 0.6062          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7432          
##                                           
##  Mcnemar's Test P-Value : 1.965e-05       
##                                           
##             Sensitivity : 0.9342          
##             Specificity : 0.7956          
##          Pos Pred Value : 0.8756          
##          Neg Pred Value : 0.8871          
##              Prevalence : 0.6062          
##          Detection Rate : 0.5664          
##    Detection Prevalence : 0.6468          
##       Balanced Accuracy : 0.8649          
##                                           
##        'Positive' Class : 0               
##

# Label for the KNN results
paste("KNN confusion matrix")

## [1] "KNN confusion matrix"

# Compare KNN predictions against the true test labels
confusionMatrix(data = pred_knn, reference = test$spam)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 718 148
##          1 118 395
##                                           
##                Accuracy : 0.8071          
##                  95% CI : (0.7853, 0.8276)
##     No Information Rate : 0.6062          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.592           
##                                           
##  Mcnemar's Test P-Value : 0.07539         
##                                           
##             Sensitivity : 0.8589          
##             Specificity : 0.7274          
##          Pos Pred Value : 0.8291          
##          Neg Pred Value : 0.7700          
##              Prevalence : 0.6062          
##          Detection Rate : 0.5207          
##    Detection Prevalence : 0.6280          
##       Balanced Accuracy : 0.7931          
##                                           
##        'Positive' Class : 0               
##

Out of the SVM, LDA and KNN models, SVM has the best test performance, with a test accuracy of 0.9217.

# ROC Curve

# Calculate ROC curve values.
# pROC::roc() expects the observed classes as `response` and the model
# output as `predictor`; both are passed by name so they cannot be swapped.
# `levels` and `direction` are fixed explicitly so that "0" is the control
# class, "1" is the case class, and a higher predictor value means "1".
# The predictors are hard class labels coded as numbers, so each curve has
# a single threshold and its AUC is (sensitivity + specificity) / 2.
roc_svm <- roc(
  response = test$spam,
  predictor = as.numeric(pred_svm),
  levels = c("0", "1"),
  direction = "<"
)
roc_lda <- roc(
  response = test$spam,
  predictor = as.numeric(pred_lda),
  levels = c("0", "1"),
  direction = "<"
)
roc_knn <- roc(
  response = test$spam,
  predictor = as.numeric(pred_knn),
  levels = c("0", "1"),
  direction = "<"
)

# Area under the curve
roc_svm$auc

## Area under the curve: 0.916

roc_lda$auc

## Area under the curve: 0.8649

roc_knn$auc

## Area under the curve: 0.7931

# Draw the SVM curve first, then overlay LDA and KNN on the same axes
plot(roc_svm, col = "red", lty = 1, main = "ROC curve for 3 model")
plot(roc_lda, col = "green", lty = 2, add = TRUE)
plot(roc_knn, col = "blue", lty = 3, add = TRUE)

# Legend entries use the same colours and line types as the curves
legend(
  "topright",
  legend = c("SVM", "LDA", "KNN"),
  col = c("red", "green", "blue"),
  lty = 1:3
)

From the ROC curves we can see that SVM has the highest prediction performance.

DATA 620 Assignment Document Classification

Yohannes Deboch