Step 1 - Creating a dataframe with spam emails
# Directory holding the spam messages, and the names of the files inside it.
# NOTE(review): `dir` and `file` shadow base R function names; kept as-is
# because later chunks (not visible here) may reference them.
dir <- "/Users/devanshu/Documents/R Datascience/SpamHam/spam"
file <- list.files(dir)
Step 2 - Creating a dataframe with ham emails
# Directory holding the ham messages, and the names of the files inside it.
# NOTE(review): `dir` and `file` shadow base R function names; kept as-is
# because later chunks (not visible here) may reference them.
dir <- "/Users/devanshu/Documents/R Datascience/SpamHam/ham"
file <- list.files(dir)
Step 3 - Combine the ham and spam data frames to create the complete dataframe we will be working with
Step 4 - Create a corpus
## author : character(0)
## datetimestamp: 2019-11-18 02:17:23
## description : character(0)
## heading : character(0)
## id : 1
## language : en
## origin : character(0)
Step 5 - Clean by removing numbers, whitespace, and punctuation. Apply a regex to extract only text
## Warning in tm_map.SimpleCorpus(completecorpus,
## content_transformer(removeNumbers)): transformation drops documents
## Warning in tm_map.SimpleCorpus(tmp, content_transformer(function(x)
## str_replace_all(x, : transformation drops documents
## Warning in tm_map.SimpleCorpus(tmp, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(tmp, content_transformer(stripWhitespace)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(tmp,
## content_transformer(removePunctuation)): transformation drops documents
Step 6 - Create a document matrix
Review top 50 terms in the combined dataset
## the from com for with received
## 30102 27536 26650 20095 19967 16834
## list localhost and net sep esmtp
## 16068 15464 15155 12779 12170 9825
## org example that you version aug
## 9270 9214 7808 7573 6399 6385
## http spam this oct postfix xent
## 5968 5906 5876 5530 5473 5287
## ist content admin date thu message
## 5136 5086 4975 4904 4720 4595
## fork mon wed jalapeno sourceforge text
## 4579 4559 4225 4157 4076 4041
## delivered not tue dogma slashnull have
## 4022 3701 3693 3662 3662 3580
## are subject www return type your
## 3573 3535 3370 3364 3341 3258
## path exmh
## 3199 3130
Based on the distribution above, remove words that should not impact classification
# Strip very frequent header/stop words that should not impact classification
clean1 <- tm_map(
  clean,
  removeWords,
  c(
    "received", "the", "from", "com", "for", "with",
    "localhost", "and", "esmtp", "that", "http"
  )
)
## Warning in tm_map.SimpleCorpus(clean, removeWords, c("received", "the", :
## transformation drops documents
Review the newly cleaned dataset
# Build the document-term matrix from the re-cleaned corpus
dtm <- DocumentTermMatrix(clean1)
# Drop terms whose sparsity exceeds 1 - 10/N, where N is the corpus length
dtm <- removeSparseTerms(dtm, 1 - (10 / length(clean1)))
# Densify into a data frame with one column per remaining term
dtm_df <- as.data.frame(as.matrix(dtm))
# Attach the label column copied from `complete`
dtm_df$classification <- complete$classification
## list net sep org example you
## 16068 12779 12170 9270 9214 7573
## version aug spam this oct postfix
## 6399 6385 5906 5876 5530 5473
## xent ist content admin date thu
## 5287 5136 5086 4975 4904 4720
## message fork mon wed jalapeno sourceforge
## 4595 4579 4559 4225 4157 4076
## text delivered not tue dogma slashnull
## 4041 4022 3701 3693 3662 3662
## have are subject www return type
## 3580 3573 3535 3370 3364 3341
## your path exmh fri single mail
## 3258 3199 3130 3035 3019 3007
## drop rpm fetchmail yyyy imap mailing
## 2922 2914 2886 2827 2783 2692
## but users
## 2666 2656
# Top 50 terms in ham emails
# The label column is dropped before summing so it is never counted as a term,
# and head() avoids NA padding when fewer than 50 term columns remain.
dtm_df %>%
  filter(classification == 0) %>%
  dplyr::select(-classification) %>%
  colSums() %>%
  sort(decreasing = TRUE) %>%
  head(50)
## list net sep example org spam
## 15287 11296 10026 8670 7601 5856
## version oct xent aug postfix admin
## 5819 5461 5287 5182 4867 4800
## fork ist date thu sourceforge this
## 4579 4389 4314 4224 3967 3939
## you mon content message wed jalapeno
## 3938 3870 3833 3793 3774 3652
## delivered tue text exmh dogma slashnull
## 3527 3223 3149 3130 3118 3118
## subject rpm yyyy not return have
## 2938 2914 2827 2797 2784 2731
## path are type single mailing users
## 2673 2613 2573 2549 2540 2532
## fri freshrpms drop www imap but
## 2515 2464 2454 2446 2440 2434
## fetchmail cvs
## 2426 2370
# Top 50 terms in spam emails
# The label column is dropped before summing: for class 1 its column sum equals
# the number of spam rows and could otherwise show up as a "term".
# head() avoids NA padding when fewer than 50 term columns remain.
dtm_df %>%
  filter(classification == 1) %>%
  dplyr::select(-classification) %>%
  colSums() %>%
  sort(decreasing = TRUE) %>%
  head(50)
## you sep your this org font nbsp
## 3635 2144 2010 1937 1669 1585 1531
## net content aug mail zzzz are www
## 1483 1253 1203 1140 1118 960 924
## not text will have sized message widthd
## 904 892 863 849 805 802 798
## list type our free ist mon smtp
## 781 768 768 748 747 689 659
## all email postfix img webnote subject date
## 634 621 606 606 603 597 590
## return version can money heightd dogma example
## 580 580 553 548 546 544 544
## slashnull path mime fri colord here out
## 544 526 523 520 518 512 509
## more
## 507
Step 7 - Shuffle the dataset and conduct a train/test split
Step 8 - Building a Decision Tree Classifier
# Fit a classification tree on the training set using every other column as a
# predictor, then predict class labels for the test set
dt_fit <- rpart(classification ~ ., data = train_set, method = "class")
ypreds <- predict(dt_fit, test_set, type = "class")
Step 9 - Evaluate the model
## ypreds
## 0 1
## 0 545 3
## 1 6 100
# Accuracy score: correctly classified emails (the diagonal of the confusion
# matrix) divided by all classified emails. Indexing the table as [1] + [2]
# would add an off-diagonal cell, because R stores tables column-major.
Accuracy <- sum(diag(table_mat)) / sum(table_mat)
Accuracy * 100
## [1] 98.62385