Spam Checker Project
Task: Create a model to intake e-mail messages and determine whether they are unwanted messages or not.
We created a model by searching for terms within a set of spam e-mails. Then we used R to input all of the emails within our “easy ham” and “spam” test sets. To augment our terms, we created a term document matrix with the TM package and searched for the most common terms in the spam set and in the ham set.
In each case, we compared the prevalence of each word in the spam set to its prevalence in the ham set. We created a scoring algorithm that gave greater weight to words that were more heavily weighted in a particular set. We chose to score spam e-mails positive and ham e-mails negative. In order to smooth the scoring appropriately, we chose a fairly complicated logarithmic scale. We did this because the spam set had terms that showed up in 0% of the ham, which need a large weighting. Most of our terms in the ham set were small. However, our ratios there ranged from single digit numbers to over a million. By creating a new scale, we were able to give all terms some weight, but gave terms that were more concentrated in a particular set the most weight. Finally, we weighted spam-centric terms higher so bad e-mails could be caught.
We chose not to remove common terms or stop words. We assumed that the normal e-mails might contain more usual speech patterns. The prevalence of words used commonly in sentences might be a marker for the regular e-mails.
Our model performed quite well with the data used to build it. It found 99.01% of the true e-mails and 84% of the spam. Sadly, it scored 0% with the “hard ham” test data. Our hand-picked terms found about 70% of the genuine “hard ham” e-mails, but the partially supervised model did exceedingly poorly. The reason is pretty clear. The compiler of the corpus put nearly all html-based e-mails in the “hard ham” set, but filled the “easy spam” with html-laden e-mails. The prevalent terms found by the TM package included many words from the html coding. A future version could seek to minimize the html codes in the list and seek to determine if html is a true marker for spam e-mails. This could even be done by rerunning the above process while including part of the “hard ham” set.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
In our program, we read in the list of words to search for. Originally, this list was chosen by the researcher. It was later augmented by lists created from word prevalence within the TM corpus. We then count and compare ratios.
# Count, for every candidate term, how many ham and how many spam e-mails
# contain it, then convert the counts into prevalence ratios.
#
# Reads  : spam_words_list.txt (one search term per line), easy_ham/1.txt ..
#          easy_ham/2551.txt and spam/0spam.txt .. spam/500spam.txt.
# Creates: spam_words, ham_count, spam_count, count_frame (numeric matrix, one
#          row per term; columns spam_count, ham_count, spam_words_percent,
#          ham_words_percent, word_comparison, word_id), low_frame,
#          high_frame and an empty spam_checker data frame.
spam_words_file <- "C:/Users/dawig/Desktop/CUNY/spam_classifier/spam_words_list.txt"
spam_words <- readLines(spam_words_file)
length(spam_words)

# Every per-term vector is sized from the list that was actually read rather
# than from a repeated literal, so the list can grow without editing the code.
n_words <- length(spam_words)

# Number of e-mails (out of `email_paths`) in which each term occurs at least
# once. Each file is read whole and lower-cased before matching.
# NOTE(review): terms are passed to str_detect() as regular expressions and are
# matched against lower-cased text, so a term containing capitals or regex
# metacharacters may not behave like a literal phrase - confirm intent.
count_term_hits <- function(email_paths, terms) {
  hits <- numeric(length(terms))
  for (path in email_paths) {
    email_text <- tolower(readChar(path, file.info(path)$size))
    # str_detect() is vectorised over the pattern: one logical per term.
    hits <- hits + str_detect(email_text, terms)
  }
  hits
}

ham_paths <- str_c(
  "C:/Users/dawig/Desktop/CUNY/spam_classifier/easy_ham/", 1:2551, ".txt"
)
spam_paths <- str_c(
  "C:/Users/dawig/Desktop/CUNY/spam_classifier/spam/", 0:500, "spam.txt"
)

ham_count <- count_term_hits(ham_paths, spam_words)
spam_count <- count_term_hits(spam_paths, spam_words)

# Share of e-mails in each set that contain the term (501 spam, 2551 ham).
spam_words_percent <- spam_count / length(spam_paths)
ham_words_percent <- ham_count / length(ham_paths)

# A term never seen in spam would make the ratio below divide by zero, so its
# spam share is replaced by a tiny positive value.
spam_words_percent[spam_count == 0] <- 0.00000001

# ham share / spam share: below 1 means spam-heavy, above 1 means ham-heavy.
word_comparison <- ham_words_percent / spam_words_percent
word_id <- seq_len(n_words)

count_frame <- cbind(
  spam_count, ham_count, spam_words_percent, ham_words_percent,
  word_comparison, word_id
)
str(count_frame)
#count_frame

# Terms at least ten times more prevalent in spam than in ham.
low_frame <- subset(count_frame, count_frame[, "word_comparison"] < .1)
low_frame

# Terms more prevalent in ham than in spam.
high_frame <- subset(count_frame, count_frame[, "word_comparison"] > 1)
high_frame

# Empty scoring table: one search term plus the score it adds when found.
spam_checker <- data.frame(
  "spam_word" = character(),
  "addition_factor" = integer(),
  stringsAsFactors = FALSE
)
We placed our words in 2 sets. One contained words that were more prevalent in the spam e-mail set and vice versa. We put them together to form a single set to constitute a list and set of weightings to check for spam e-mails.
# Build the scoring table from the two term subsets and save it as CSV.
#
# Column 5 of low_frame / high_frame is the ham-share / spam-share ratio and
# column 6 is the term's index into spam_words. The number of rows is taken
# from the frames themselves instead of being hard-coded, so the table stays
# correct when the term list or the corpus changes.

# Spam-heavy terms (ratio < 0.1) get a positive score: 33 when the ratio is
# (almost) zero, shrinking as the ratio approaches 0.1.
spam_side <- data.frame(
  spam_word = spam_words[low_frame[, 6]],
  addition_factor = 1 + (32 * (2 - exp(low_frame[, 5] * 6.8))),
  stringsAsFactors = FALSE
)

# Ham-heavy terms (ratio > 1) get a small negative score that approaches -1.8
# as the ratio grows.
ham_side <- data.frame(
  spam_word = spam_words[high_frame[, 6]],
  addition_factor = 0 - (2.8 - exp(1 / high_frame[, 5])),
  stringsAsFactors = FALSE
)

# Spam-heavy rows first, ham-heavy rows after them.
spam_checker <- rbind(spam_side, ham_side)
write.csv(spam_checker, file = "C:/Users/dawig/Desktop/CUNY/spam_classifier/spam_checker.csv")
The following code produced our computer-generated lists of most common terms in the spam and ham e-mail sets.
library(tm)
library(stringr)
library(SnowballC)
# Find the most common terms in the easy-ham set and save them to
# ham_words2.txt.
#
# rep(list(), 2551) evaluates to an empty list, so it never preallocated
# anything; the texts are now collected with lapply(), which yields the same
# list of 2551 lower-cased strings without growing it element by element.
ham_set <- lapply(seq_len(2551), function(i) {
  read_in_file <- str_c(
    "C:/Users/dawig/Desktop/CUNY/spam_classifier/easy_ham/", i, ".txt"
  )
  tolower(readChar(read_in_file, file.info(read_in_file)$size))
})

ham_corpus <- Corpus(VectorSource(ham_set))
ham_corpus[[1]]
meta(ham_corpus[[1]])

ham_tdm <- TermDocumentMatrix(ham_corpus)
ham_tdm

# NOTE(review): the stemmed corpus is not used by any statement visible here;
# the term-document matrix above was built from the unstemmed corpus.
ham_corpus_stems <- tm_map(ham_corpus, stemDocument)

# Drop the sparsest terms, then keep the terms whose total frequency is at
# least 300.
ham_tdm <- removeSparseTerms(ham_tdm, 1 - (10 / length(ham_corpus)))
ham_terms2 <- findFreqTerms(ham_tdm, lowfreq = 300, highfreq = Inf)
write(ham_terms2, "C:/Users/dawig/Desktop/CUNY/spam_classifier/ham_words2.txt")
# Find the most common terms in the spam set (spam/1spam.txt .. 500spam.txt).
#
# rep(list(), 500) evaluates to an empty list, so it never preallocated
# anything; the texts are now collected with lapply(), which yields the same
# list of 500 lower-cased strings without growing it element by element.
# NOTE(review): the term-counting pass elsewhere in this file reads
# 0spam.txt .. 500spam.txt (501 files); this pass starts at 1 - confirm.
spam_set <- lapply(seq_len(500), function(i) {
  read_in_file <- str_c(
    "C:/Users/dawig/Desktop/CUNY/spam_classifier/spam/", i, "spam.txt"
  )
  tolower(readChar(read_in_file, file.info(read_in_file)$size))
})

spam_corpus <- Corpus(VectorSource(spam_set))
spam_corpus[[1]]
meta(spam_corpus[[1]])

spam_tdm <- TermDocumentMatrix(spam_corpus)
spam_tdm

# NOTE(review): the stemmed corpus is not used by any statement visible here;
# the term-document matrix above was built from the unstemmed corpus.
spam_corpus_stems <- tm_map(spam_corpus, stemDocument)

# Drop the sparsest terms, then keep the terms whose total frequency is at
# least 300.
spam_tdm <- removeSparseTerms(spam_tdm, 1 - (10 / length(spam_corpus)))
spam_terms2 <- findFreqTerms(spam_tdm, lowfreq = 300, highfreq = Inf)
#write(spam_terms2,"C:/Users/dawig/Desktop/CUNY/spam_classifier/spam_words2.txt")
# Merge the three term lists into the master list of search terms: the
# frequent spam terms, the frequent ham terms, and the words in spam_words.txt.
# NOTE(review): the lists are concatenated as-is, so a term present in more
# than one file appears more than once in the result - confirm this is wanted.
spam.holder.a <- readLines(
  "C:/Users/dawig/Desktop/CUNY/spam_classifier/spam_words2.txt"
)
spam.holder.b <- readLines(
  "C:/Users/dawig/Desktop/CUNY/spam_classifier/ham_words2.txt"
)
spam.holder.c <- readLines(
  "C:/Users/dawig/Desktop/CUNY/spam_classifier/spam_words.txt"
)
spam.list.final <- c(spam.holder.a, spam.holder.b, spam.holder.c)
print(spam.list.final)
#write(spam.list.final,"C:/Users/dawig/Desktop/CUNY/spam_classifier/spam_words_list.txt")
# Score every spam e-mail: the score is the sum of addition_factor over all
# terms of the scoring table that occur in the lower-cased message.
#
# The table is addressed by column name and every row is used, instead of
# hard-coding column positions and the row count 394.
spam_checker <- read.csv(
  "C:/Users/dawig/Desktop/CUNY/spam_classifier/spam_checker.csv",
  stringsAsFactors = FALSE
)
spam_score.spam <- vector(mode = "numeric", length = 500)
# NOTE(review): the term-counting pass elsewhere in this file also reads
# 0spam.txt; scoring starts at 1spam.txt - confirm which range is intended.
for (i in 1:500) {
  read_in_file <- str_c(
    "C:/Users/dawig/Desktop/CUNY/spam_classifier/spam/", i, "spam.txt"
  )
  real_email <- tolower(readChar(read_in_file, file.info(read_in_file)$size))
  # str_detect() is vectorised over the pattern: one logical per term.
  term_found <- str_detect(real_email, spam_checker$spam_word)
  # sum() of an empty selection is 0, matching the initial score.
  spam_score.spam[i] <- sum(spam_checker$addition_factor[term_found])
}
Finally, these are the search terms utilized by our spam checker to reach the results mentioned above, followed by performance percentages.
# Show the full scoring table: row id (X), search term, and the score added
# when the term is found (positive = spam-leaning, negative = ham-leaning).
print(spam_checker)
## X spam_word addition_factor
## 1 1 webnote 15.24686091
## 2 2 000000 30.17349392
## 3 3 align 30.54965118
## 4 4 arial 31.04953422
## 5 5 bgcolor 31.67843180
## 6 6 border 27.84883920
## 7 7 cellpadding 32.37000665
## 8 8 cellspacing 32.36532870
## 9 9 center 2.81470378
## 10 10 color 25.36663064
## 11 11 ffffff 31.01283501
## 12 12 font 28.40460126
## 13 13 height 28.71002110
## 14 14 href 30.47965132
## 15 15 img 5.58093370
## 16 16 input 8.06992778
## 17 17 nbsp 29.11909361
## 18 18 src 6.79162307
## 19 19 tahoma 33.00000000
## 20 20 title 20.03049301
## 21 21 valign 31.50944351
## 22 22 verdana 31.54330959
## 23 23 width 20.75567164
## 24 24 colspan 32.00234471
## 25 25 ff0000 31.37741589
## 26 26 helvetica 33.00000000
## 27 27 sans 31.14906966
## 28 28 serif 33.00000000
## 29 29 webmaster 25.61139119
## 30 30 iiq 31.79061668
## 31 31 span 19.88818931
## 32 32 blockquote 20.31647016
## 33 33 font 28.40460126
## 34 34 take control 33.00000000
## 35 35 superior 4.39108760
## 36 36 base64 21.34526370
## 37 37 deposited 7.60215397
## 38 38 removed from this list 23.20272443
## 39 39 barrister 33.00000000
## 40 40 offshore 26.27372814
## 41 41 risk-free 33.00000000
## 42 42 now! 28.98018353
## 43 43 today! 31.49191312
## 44 44 milosevic 33.00000000
## 45 45 partija 33.00000000
## 46 46 socijalista 33.00000000
## 47 47 baby makers 33.00000000
## 48 48 chronicle my experiences 33.00000000
## 49 49 body of the message 33.00000000
## 50 50 thank you a lot! 33.00000000
## 51 51 guangdong 33.00000000
## 52 52 x-uid: 33.00000000
## 53 53 extra money 33.00000000
## 54 54 xxx 17.23090951
## 55 55 assistance 26.27372814
## 56 56 congo 28.42797766
## 57 57 kabila 33.00000000
## 58 58 close aides 33.00000000
## 59 59 commission 14.41710449
## 60 60 procene 33.00000000
## 61 61 <body> 31.21973240
## 62 62 <table> 33.00000000
## 63 63 lerami 33.00000000
## 64 64 buy recommendations 33.00000000
## 65 65 money-making 33.00000000
## 66 66 lerctr 30.02030833
## 67 67 every week 12.19850476
## 68 68 explosive potential 33.00000000
## 69 69 one of a kind 33.00000000
## 70 70 disclosed 25.48815189
## 71 71 upfront 33.00000000
## 72 72 for yourself 24.20539784
## 73 73 my name is 30.02030833
## 74 74 purchase 5.00895324
## 75 75 loan 17.23090951
## 76 76 lowest rate 33.00000000
## 77 77 opportunity 7.51804256
## 78 78 self motivated 33.00000000
## 79 79 refinancing 33.00000000
## 80 80 new home loans 33.00000000
## 81 81 debt consolidation 33.00000000
## 82 82 debt consultation 33.00000000
## 83 83 auto loans 33.00000000
## 84 84 credit cards 30.53552685
## 85 85 student loans 33.00000000
## 86 86 second mortgage 33.00000000
## 87 87 home equity 33.00000000
## 88 88 I know 33.00000000
## 89 89 0100 -0.09704057
## 90 90 2002 -0.08713847
## 91 91 com -0.15662664
## 92 92 date -0.09254797
## 93 93 delivered -0.10950703
## 94 94 dogma -0.70452058
## 95 95 drop -0.17111270
## 96 96 example -0.94712741
## 97 97 fetchmail -0.16478426
## 98 98 from -0.09254797
## 99 99 http -0.36007776
## 100 100 imap -0.74024489
## 101 101 jmason -0.67513841
## 102 102 linux -0.89378995
## 103 103 list -0.56821574
## 104 104 localhost -0.12871885
## 105 105 message -0.09254797
## 106 106 org -0.50733172
## 107 107 path -0.14605316
## 108 108 postfix -0.09592789
## 109 109 return -0.13543739
## 110 110 single -0.14994390
## 111 111 slashnull -0.70452058
## 112 112 subject -0.09254797
## 113 113 the -0.22096970
## 114 114 thu -0.50201090
## 115 115 version -0.21613644
## 116 116 www -0.21018782
## 117 117 and -0.14875679
## 118 118 can -0.12113825
## 119 119 here -0.17021149
## 120 120 how -0.45569179
## 121 121 just -0.66786051
## 122 122 new -0.09500201
## 123 123 now -0.40659348
## 124 124 that -0.72753852
## 125 125 people -0.26546010
## 126 126 was -0.84464057
## 127 127 100 -0.09073271
## 128 128 fri -0.33649195
## 129 129 report -1.22164330
## 130 130 tue -0.40075060
## 131 131 wed -0.67098803
## 132 132 jalapeno -0.78446114
## 133 133 0100 -0.09704057
## 134 134 0700 -1.22369034
## 135 135 172 -1.44059894
## 136 136 2002 -0.08713847
## 137 137 admin -1.51681981
## 138 138 and -0.14875679
## 139 139 archive -1.65962021
## 140 140 ascii -1.31378506
## 141 141 been -1.21177424
## 142 142 beenthere -1.57556721
## 143 143 bulk -1.47786506
## 144 144 can -0.12113825
## 145 145 chris -1.16117064
## 146 146 code -0.66267998
## 147 147 com -0.15662664
## 148 148 corp -0.95564259
## 149 149 cvs -1.78567542
## 150 150 date -0.09254797
## 151 151 deepeddy -1.79999960
## 152 152 delivered -0.10950703
## 153 153 discussion -1.65440841
## 154 154 dogma -0.70452058
## 155 155 drop -0.17111270
## 156 156 errors -1.51327149
## 157 157 example -0.94712741
## 158 158 exmh -1.79999986
## 159 159 fetchmail -0.16478426
## 160 160 from -0.09254797
## 161 161 help -1.21040766
## 162 162 hits -1.79759418
## 163 163 https -1.68709810
## 164 164 imap -0.74024489
## 165 165 like -0.70499304
## 166 166 list -0.56821574
## 167 167 listinfo -1.56757954
## 168 168 listman -1.79999986
## 169 169 localdomain -1.73140694
## 170 170 localhost -0.12871885
## 171 171 loop -1.53277602
## 172 172 mailing -1.25065642
## 173 173 mailman -1.57637241
## 174 174 mailto -1.43358593
## 175 175 message -0.09254797
## 176 176 mx1 -0.82136269
## 177 177 netnoteinc -1.62816760
## 178 178 new -0.09500201
## 179 179 org -0.50733172
## 180 180 path -0.14605316
## 181 181 plain -0.91513651
## 182 182 postfix -0.09592789
## 183 183 precedence -1.53077946
## 184 184 redhat -1.79999993
## 185 185 references -1.76446799
## 186 186 request -1.37878407
## 187 187 return -0.13543739
## 188 188 run -0.85888480
## 189 189 sender -1.33076518
## 190 190 single -0.14994390
## 191 191 slashnull -0.70452058
## 192 192 still -1.11481405
## 193 193 subject -0.09254797
## 194 194 subscribe -1.21354310
## 195 195 that -0.72753852
## 196 196 the -0.22096970
## 197 197 them -0.15968774
## 198 198 think -1.33462910
## 199 199 thu -0.50201090
## 200 200 unsubscribe -1.29030449
## 201 201 using -1.03921915
## 202 202 version -0.21613644
## 203 203 vircio -1.79999960
## 204 204 wed -0.67098803
## 205 205 where -0.20779029
## 206 206 workers -1.65661271
## 207 207 0000 -0.21069268
## 208 208 7bit -0.44986535
## 209 209 helo -1.21032297
## 210 210 http -0.36007776
## 211 211 invoked -1.39897182
## 212 212 network -1.34853712
## 213 213 now -0.40659348
## 214 214 qmail -1.35907902
## 215 215 scd -1.46004889
## 216 216 system -0.08542037
## 217 217 use -0.45262215
## 218 218 was -0.84464057
## 219 219 well -0.19975156
## 220 220 yahoogroups -1.79999973
## 221 221 zzzzteana -1.79999971
## 222 222 about -0.59685842
## 223 223 exim -1.57590463
## 224 224 got -0.60646506
## 225 225 had -0.47348472
## 226 226 into -0.76096979
## 227 227 news -1.05132581
## 228 228 other -0.82431049
## 229 229 reported -1.69660412
## 230 230 said -1.22984616
## 231 231 talk -1.37089539
## 232 232 then -0.56733632
## 233 233 were -0.74281066
## 234 234 its -1.58064436
## 235 235 messages -1.54546993
## 236 236 some -1.27145236
## 237 237 than -0.13340949
## 238 238 they -1.18570495
## 239 239 world -0.31654903
## 240 240 www -0.21018782
## 241 241 but -1.20333584
## 242 242 edu -0.80211016
## 243 243 files -0.93274760
## 244 244 here -0.17021149
## 245 245 linux -0.89378995
## 246 246 long -0.09398058
## 247 247 might -1.19160595
## 248 248 old -0.41030385
## 249 249 organization -1.53177524
## 250 250 quoted -0.29741394
## 251 251 see -0.40441394
## 252 252 signature -1.69457198
## 253 253 there -1.34680578
## 254 254 users -0.28755162
## 255 255 way -0.57232180
## 256 256 when -1.00322315
## 257 257 would -0.66881408
## 258 258 wrote -1.79999997
## 259 259 agent -1.64296659
## 260 260 just -0.66786051
## 261 261 mozilla -1.76977554
## 262 262 same -0.99604088
## 263 263 used -0.90330280
## 264 264 user -1.16235772
## 265 265 what -1.00048484
## 266 266 which -0.85784535
## 267 267 really -1.36597873
## 268 268 216 -0.53115364
## 269 269 better -0.67558284
## 270 270 could -0.82518810
## 271 271 end -0.38146252
## 272 272 first -0.30075639
## 273 273 should -0.28982672
## 274 274 found -1.31829944
## 275 275 people -0.26546010
## 276 276 after -0.20941794
## 277 277 does -0.52731229
## 278 278 doesn -1.60878177
## 279 279 good -0.70759657
## 280 280 how -0.45569179
## 281 281 known -1.37700136
## 282 282 many -0.47012021
## 283 283 original -0.72466353
## 284 284 something -1.42001487
## 285 285 those -0.15520347
## 286 286 times -1.04105955
## 287 287 two -0.89806990
## 288 288 why -0.28047504
## 289 289 check -0.37572313
## 290 290 clean -1.57410320
## 291 291 pgp -1.57994117
## 292 292 real -1.23614070
## 293 293 url -1.56489804
## 294 294 136 -0.77243996
## 295 295 171 -1.04329711
## 296 296 debian -1.68471458
## 297 297 list1 -1.64692945
## 298 298 list2 -1.64692945
## 299 299 lists -1.40275997
## 300 300 mm2 -1.56181989
## 301 301 osdn -1.70569370
## 302 302 sourceforge -1.65881819
## 303 303 spam -1.69636517
## 304 304 spamassassin -1.76463251
## 305 305 sponsored -1.68684744
## 306 306 usw -1.35595144
## 307 307 0200 -0.45906197
## 308 308 devel -1.45663487
## 309 309 fri -0.33649195
## 310 310 know -1.09558337
## 311 311 problem -1.20788947
## 312 312 too -0.18801768
## 313 313 few -0.29365879
## 314 314 did -0.93987390
## 315 315 kernel -1.79999973
## 316 316 before -0.22558316
## 317 317 file -0.13139741
## 318 318 own -0.69433958
## 319 319 work -0.35982395
## 320 320 161 -1.41461156
## 321 321 236 -1.53721736
## 322 322 another -0.96660651
## 323 323 fork -1.77081490
## 324 324 friends -1.48786391
## 325 325 khare -1.79999996
## 326 326 lair -1.78563501
## 327 327 level -1.72337736
## 328 328 pdt -1.68398567
## 329 329 pipermail -1.67455855
## 330 330 rohit -1.79999996
## 331 331 such -0.34134562
## 332 332 xent -1.79999996
## 333 333 build -0.72630552
## 334 334 100 -0.09073271
## 335 335 say -1.21888139
## 336 336 100000 -1.77122717
## 337 337 pine -1.69829916
## 338 338 secprog -1.79999918
## 339 339 securityfocus -1.79999920
## 340 340 status -1.68892122
## 341 341 tests -1.79279586
## 342 342 rpm -1.68684744
## 343 343 perl -1.26361801
## 344 344 slack -1.79999977
## 345 345 short -1.28595815
## 346 346 alsa -1.79999954
## 347 347 auth02 -1.79999990
## 348 348 egwn -1.78029958
## 349 349 freshrpms -1.79999990
## 350 350 matthias -1.79999985
## 351 351 python -1.79999982
## 352 352 tue -0.40075060
## 353 353 zzzlist -1.79999990
## 354 354 habeas -1.79999958
## 355 355 razor -1.79999977
## 356 356 petting -1.79999841
## 357 357 attribution -1.79999994
## 358 358 jmason -0.67513841
## 359 359 phrase -1.79999997
## 360 360 pyzor -1.79999993
## 361 361 required -1.63697025
## 362 362 yyyy -1.79999999
## 363 363 apt -1.06960543
## 364 364 xml -0.25219660
## 365 365 jalapeno -0.78446114
## 366 366 oct -1.30218015
## 367 367 rssfeeds -1.79999996
## 368 368 utf -1.72718524
## 369 369 newsisfree -1.79999992
## 370 370 datapower -1.79999804
## 371 371 awl -1.74525289
## 372 372 nonsense -1.79999994
## 373 373 rcvd -1.78096247
## 374 374 feb -0.89317127
## 375 375 commits -1.79999850
## 376 376 unsubscribe -1.29030449
## 377 377 bulk -1.47786506
## 378 378 dvds -1.64340486
## 379 379 yahoo! groups -1.79999971
## 380 380 linux -0.89378995
## 381 381 unix -1.79999948
## 382 382 sponsored by -1.68514756
## 383 383 files -0.93274760
## 384 384 headlines -0.56560387
## 385 385 slashnull.org -0.70452058
## 386 386 subject -0.09254797
## 387 387 doesn't -1.59739973
## 388 388 wrong -1.33622272
## 389 389 technologies -1.17593073
## 390 390 maybe -1.34340711
## 391 391 something -1.42001487
## 392 392 shouldn't -1.79999927
## 393 393 woman -0.82827489
## 394 394 sound -1.41273934
# Score every easy-ham e-mail: the score is the sum of addition_factor over
# all terms of the scoring table that occur in the lower-cased message.
#
# The table is addressed by column name and every row is used, instead of
# hard-coding column positions and the row count 394.
spam_score.ham <- vector(mode = "numeric", length = 2551)
for (i in 1:2551) {
  read_in_file <- str_c(
    "C:/Users/dawig/Desktop/CUNY/spam_classifier/easy_ham/", i, ".txt"
  )
  real_email <- tolower(readChar(read_in_file, file.info(read_in_file)$size))
  # str_detect() is vectorised over the pattern: one logical per term.
  term_found <- str_detect(real_email, spam_checker$spam_word)
  # sum() of an empty selection is 0, matching the initial score.
  spam_score.ham[i] <- sum(spam_checker$addition_factor[term_found])
}
# Decision threshold is -10: a ham e-mail counts as correctly recognised when
# its score is below -10, a spam e-mail when its score is above -10.
ham.subset <- spam_score.ham[which(spam_score.ham < (-10))]
spam.subset <- spam_score.spam[which(spam_score.spam > (-10))]

# Share of easy-ham e-mails recognised as ham.
length(ham.subset) / length(spam_score.ham)
## [1] 0.9901999
# Share of spam e-mails recognised as spam.
length(spam.subset) / length(spam_score.spam)
## [1] 0.84
The overall performance with our model data was:
# Correctly classified e-mails (ham and spam together) over all scored e-mails.
correct_total <- length(ham.subset) + length(spam.subset)
scored_total <- length(spam_score.ham) + length(spam_score.spam)
correct_total / scored_total
## [1] 0.9655851
# Score every hard-ham e-mail: the score is the sum of addition_factor over
# all terms of the scoring table that occur in the lower-cased message.
#
# This loop used to stop after the first 76 rows of the table. Those rows all
# carry positive (spam-leaning) weights, so no score could ever fall below the
# -10 ham threshold. Every row is now used, as in the other scoring loops, and
# the table is addressed by column name rather than position.
spam_score.hard.ham <- vector(mode = "numeric", length = 250)
for (i in 1:250) {
  read_in_file <- str_c(
    "C:/Users/dawig/Desktop/CUNY/spam_classifier/hard_ham/", i, ".txt"
  )
  real_email <- tolower(readChar(read_in_file, file.info(read_in_file)$size))
  # str_detect() is vectorised over the pattern: one logical per term.
  term_found <- str_detect(real_email, spam_checker$spam_word)
  # sum() of an empty selection is 0, matching the initial score.
  spam_score.hard.ham[i] <- sum(spam_checker$addition_factor[term_found])
}
The performance with our test hard ham set was:
# Hard-ham e-mails whose score falls below the -10 ham threshold, and their
# share of the whole hard-ham set.
hard.ham.subset <- spam_score.hard.ham[which(spam_score.hard.ham < (-10))]
length(hard.ham.subset) / length(spam_score.hard.ham)
## [1] 0