Dataset

spam

Which words do spam messages have in common?

# Word cloud built from the text of the spam-labelled rows only.
spam %>%
  filter(spam) %>%
  pull(text) %>%
  wordcloud(scale = c(2, 1), min.freq = 50, colors = rainbow(25))

Phone numbers

# Collect every run of 6-14 digits from the message text, where each digit may
# be preceded by spaces and one of - + ( ) . as a separator. The matches are
# flattened into a one-column data frame; piping into as.data.frame() names
# that column `.`.
phone_numbers <- spam$text %>%
  str_extract_all("(?: *[-+().]? *\\d){6,14}") %>%
  unlist() %>%
  as.data.frame()
phone_numbers

Plot them (for fun)

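We only have one variable, so we use base plot(), which draws each value against its index. Sadly, most of the points are near 0, likely because of area codes or because the values are being treated as a categorical variable. Note also that the matches are character strings: plot() coerces them to numbers, and any match that still contains separators such as inner spaces or parentheses cannot be parsed and becomes NA (see the warning below).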

# `phone_numbers$.` holds the matches as character strings. plot() coerces them
# to numeric and draws them against their index; strings that do not parse as
# a number become NA, which is what triggers the coercion warning.
plot(phone_numbers$.)
Warning in xy.coords(x, y, xlabel, ylabel, log) :
  NAs introduced by coercion

(Likely) Phone recordings as emails

# Messages containing "press" followed by a space and a digit (any letter
# case), shown as a one-column data frame.
spam$text %>%
  grep(pattern = "press [[:digit:]]", ignore.case = TRUE, value = TRUE) %>%
  as.data.frame()

Tiny spam filter

Based on the EDA, this should work:

# get all emails that aren't spam (supposedly)

# Difference between the number of spam-labelled rows and length(not_spam).
#
# spam:     data frame with a logical `spam` column and a `text` column.
# not_spam: vector whose length is subtracted from the spam-row count.
# Returns a single number.
get_slipped <- function(spam, not_spam) {
  # Inside filter() the bare name `spam` refers to the column of the data
  # frame, not to the data frame argument itself.
  spam_texts <- pull(filter(spam, spam), text)
  length(spam_texts) - length(not_spam)
}

# Evaluate a one-word filter ("call", case-insensitive) on the spam-labelled
# messages only, so false positives on legitimate mail are not measured here.
spam_text <- spam %>%
  filter(spam) %>%
  pull(text)
# Spam the filter would flag ...
call_text <- grep("call", spam_text, ignore.case = TRUE, value = TRUE)
# ... and spam it would let through as "not spam" (invert = TRUE keeps the
# NON-matching messages, so this must not be reported as the 'call' count).
non_spam <- grep("call", spam_text, ignore.case = TRUE, value = TRUE, invert = TRUE)
non_spam_pretty <- non_spam %>% as.data.frame()
print(paste("Amount of spam:", length(spam_text), sep = " "))
[1] "Amount of spam: 747"
print(paste("Amount of 'call' text:", length(call_text), sep = " "))
[1] "Amount of 'call' text: 347"
# get_slipped() subtracts the length of its second argument from the spam
# total, so it has to receive the flagged messages to yield the missed ones.
print(paste("Amount of mail that slipped through the cracks:", get_slipped(spam, call_text), sep = " "))
[1] "Amount of mail that slipped through the cracks: 400"
# The filter is only "good" when it misses fewer spam messages than it flags.
print(paste("Good spam filter:", get_slipped(spam, call_text) < length(call_text), sep = " "))
[1] "Good spam filter: FALSE"
LS0tDQp0aXRsZTogIkVtYWlsIEVEQSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KHdvcmRjbG91ZCkNCiMjIHRoZXNlIGFyZSByZXF1aXJlZCBidXQgbm90IHVzZWQNCmxpYnJhcnkodG0pDQpsaWJyYXJ5KHNsYW0pDQpzcGFtIDwtIHJlYWRfY3N2KCJTcGFtLmNzdiIpICU+JQ0KICBtdXRhdGUoc3BhbSA9IENhdGVnb3J5ID09ICJzcGFtIikgJT4lDQogIHNlbGVjdCgtQ2F0ZWdvcnkpICU+JQ0KICBtdXRhdGUoTWVzc2FnZXMgPSBnc3ViKCJcdUZGRkQiLCAiIiwgTWVzc2FnZXMsIGZpeGVkPVRSVUUpKSAlPiUNCiAgcmVuYW1lKHRleHQgPSBNZXNzYWdlcykNCmBgYA0KDQojIyBEYXRhc2V0DQoNCmBgYHtyfQ0Kc3BhbQ0KYGBgDQoNCiMjIFdoYXQgd29yZHMgc3BhbSBtYWlsIGhhdmUgaW4gY29tbW9uDQoNCmBgYHtyIHdhcm5pbmc9RkFMU0V9DQp3b3JkY2xvdWQoKHNwYW0gJT4lIGZpbHRlcihzcGFtKSkkdGV4dCwgc2NhbGU9YygyLDEpLCBtaW4uZnJlcT01MCwgY29sb3JzPXJhaW5ib3coMjUpKQ0KYGBgDQoNCiMjIFBob25lIG51bWJlcnMNCg0KYGBge3J9DQpwaG9uZV9udW1iZXJzIDwtIHN0cl9tYXRjaF9hbGwoc3BhbSR0ZXh0LCAiKD86ICpbLSsoKS5dPyAqXFxkKXs2LDE0fSIpICU+JQ0KICB1bmxpc3QoKSAlPiUNCiAgYXMuZGF0YS5mcmFtZSgpDQpwaG9uZV9udW1iZXJzDQpgYGANCg0KIyMjIFBsb3QgdGhlbSAoZm9yIGZ1bikNCg0KV2Ugb25seSBoYXZlIG9uZSBheGlzLCBzbyB3ZSBuZWVkIHRvIHVzZSBgcGxvdCgpYC4gU2FkbHksIG1vc3Qgb2YgaXQgaXMgbmVhciAwLCBsaWtlbHkgYmVjYXVzZSBvZiBhcmVhIGNvZGVzIG9yIHRoYXQgaXQgaXMgYmVpbmcgdHJlYXRlZCBhcyBhIGNhdGVnb3JpY2FsIHZhcmlhYmxlLg0KDQpgYGB7cn0NCnBsb3QocGhvbmVfbnVtYmVycyQuKQ0KYGBgDQoNCiMjIChMaWtlbHkpIFBob25lIHJlY29yZGluZ3MgYXMgZW1haWxzDQoNCmBgYHtyfQ0KZ3JlcCgicHJlc3MgW1s6ZGlnaXQ6XV0iLCBzcGFtJHRleHQsIGlnbm9yZS5jYXNlID0gVFJVRSwgdmFsdWUgPSBUUlVFKSAlPiUNCiAgYXMuZGF0YS5mcmFtZSgpDQpgYGANCg0KIyMgVGlueSBzcGFtIGZpbHRlcg0KDQpCYXNlZCBvbiB0aGUgRURBLCB0aGlzIHNob3VsZCB3b3JrOg0KDQpgYGB7cn0NCiMgZ2V0IGFsbCBlbWFpbHMgdGhhdCBhcmVuJ3Qgc3BhbSAoc3VwcG9zZWRseSkNCg0KZ2V0X3NsaXBwZWQgPC0gZnVuY3Rpb24oc3BhbSwgbm90X3NwYW0pIHsNCiAgbGVuZ3RoKHNwYW0gJT4lIGZpbHRlcihzcGFtKSAlPiUgcHVsbCh0ZXh0KSkgLSBsZW5ndGgobm90X3NwYW0pDQp9DQoNCm5vbl9zcGFtIDwtIGdyZXAoImNhbGwiLCBzcGFtICU+JSBmaWx0ZXIoc3BhbSkgJT4lIHB1bGwodGV4dCksIGlnbm9yZS5jYXNlID0gVFJVRSwgdmFsdWUgPSBUUlVFLCBpbnZlcnQgPSBUUlVFKQ0Kbm9uX3NwYW1fcHJldHR5IDwtIG5vbl9z
cGFtICU+JSBhcy5kYXRhLmZyYW1lKCkNCnByaW50KHBhc3RlKCJBbW91bnQgb2Ygc3BhbToiLCBsZW5ndGgoc3BhbSAlPiUgZmlsdGVyKHNwYW0pICU+JSBwdWxsKHRleHQpKSwgc2VwPSIgIikpDQpwcmludChwYXN0ZSgiQW1vdW50IG9mICdjYWxsJyB0ZXh0OiIsIGxlbmd0aChub25fc3BhbSksIHNlcD0iICIpKQ0KcHJpbnQocGFzdGUoIkFtb3VudCBvZiBtYWlsIHRoYXQgc2xpcHBlZCB0aHJvdWdoIHRoZSBjcmFja3M6IiwgZ2V0X3NsaXBwZWQoc3BhbSwgbm9uX3NwYW0pLCBzZXA9IiAiKSkNCnByaW50KHBhc3RlKCJHb29kIHNwYW0gZmlsdGVyOiIsIGdldF9zbGlwcGVkKHNwYW0sIG5vbl9zcGFtKSA8IGxlbmd0aChub25fc3BhbSksIHNlcD0iICIpKQ0KYGBg