# Install any of the required packages that are not yet installed, then
# prepare the session (clean workspace, C locale, numeric print options).
packages = c(
  "dplyr", "ggplot2", "caTools", "tm", "SnowballC", "ROCR", "rpart",
  "rpart.plot", "randomForest")
existing = as.character(installed.packages()[, 1])
for (pkg in setdiff(packages, existing)) install.packages(pkg)

# Clear the workspace; this also removes the helper variables created above.
rm(list = ls(all.names = TRUE))
Sys.setlocale("LC_ALL", "C")  ## [1] "C/C/C/C/C/zh_TW.UTF-8"
options(digits = 5, scipen = 10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)

# Read the e-mail data; the script uses its `text` and `spam` columns.
D = read.csv("data/emails.csv", stringsAsFactors = FALSE)
nrow(D)  ## [1] 5728

# Class balance of the target.
table(D$spam)  ##
## 0 1
## 4360 1368

# First 60 characters of the first five messages.
substr(D$text[1:5], 1, 60)  ## [1] "Subject: naturally irresistible your corporate identity lt "
## [2] "Subject: the stock trading gunslinger fanny is merrill but "
## [3] "Subject: unbelievable new homes made easy im wanting to sho"
## [4] "Subject: 4 color printing special request additional inform"
## [5] "Subject: do not have money , get software cds from here ! s"

# Length of the longest message and row index of the shortest one.
nchar(D$text) %>% max  ## [1] 43952
nchar(D$text) %>% which.min  ## [1] 1992
# Build a corpus from the message text and normalise it step by step:
# lower-case, strip punctuation, drop English stop words, stem.
corp = Corpus(VectorSource(D$text))
corp = tm_map(corp, content_transformer(tolower))  ## Warning in tm_map.SimpleCorpus(corp, content_transformer(tolower)):
## transformation drops documents
corp = tm_map(corp, removePunctuation)  ## Warning in tm_map.SimpleCorpus(corp, removePunctuation): transformation
## drops documents
corp = tm_map(corp, removeWords, stopwords("english"))  ## Warning in tm_map.SimpleCorpus(corp, removeWords, stopwords("english")):
## transformation drops documents
corp = tm_map(corp, stemDocument)  ## Warning in tm_map.SimpleCorpus(corp, stemDocument): transformation drops
## documents

# Document-term matrix of the cleaned corpus; print its summary.
dtm = DocumentTermMatrix(corp)
dtm  ## <<DocumentTermMatrix (documents: 5728, terms: 28687)>>
## Non-/sparse entries: 481719/163837417
## Sparsity : 100%
## Maximal term length: 24
## Weighting : term frequency (tf)
# Limit dtm to the terms that appear in at least 5% of the documents.
# Keep only terms whose sparsity does not exceed 0.95, then print the
# summary of the reduced matrix.
spdtm = removeSparseTerms(dtm, 0.95)
spdtm ## <<DocumentTermMatrix (documents: 5728, terms: 330)>>
## Non-/sparse entries: 213551/1676689
## Sparsity : 89%
## Maximal term length: 10
## Weighting : term frequency (tf)
# Build a data frame `ems` from `spdtm`.
# Densify the reduced document-term matrix into a data frame, one column
# per term.
ems = as.data.frame(as.matrix(spdtm))

# The six most frequent terms overall.
colSums(ems) %>% sort %>% tail  ## hou will vinc subject ect enron
## 5577 8252 8532 10202 11427 13388
# Incorporate the target variable `spam`.
# Attach the label to the term-frequency data frame.
ems$spam = D$spam

# Ten most frequent terms among the ham messages (spam == 0).
subset(ems, spam == 0) %>% colSums %>% sort %>% tail(10)  ## com pleas kaminski 2000 hou will vinc subject
## 4444 4494 4801 4935 5569 6802 8531 8625
## ect enron
## 11417 13388

# Terms occurring more than 1000 times among the spam messages. colSums()
# also sums the `spam` column itself, so the number of spam rows shows up
# in the result as well.
subset(ems, spam == 1) %>% colSums %>% {.[. > 1000]}  ## compani subject will spam
## 1065 1577 1450 1368
# Split the data and build GLM, CART and random forest models.
# The classifiers need a factor target and syntactically valid column names.
ems$spam = factor(ems$spam)
names(ems) = make.names(names(ems))

# 70/30 split via sample.split() on the target, with a fixed seed.
set.seed(123); spl = sample.split(ems$spam, 0.7)
train = subset(ems, spl == TRUE)
test = subset(ems, spl == FALSE)

# Class proportions in the test set.
table(test$spam) %>% prop.table # 0.76135##
## 0 1
## 0.76135 0.23865

# Logistic regression, classification tree and random forest on all terms.
m.glm = glm(spam ~ ., data = train, family = 'binomial') ## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
m.cart = rpart(spam ~ ., data = train, method = "class")
set.seed(123); m.rf = randomForest(spam ~ ., data = train)

# Fitted training probabilities of the GLM; the counts below show that
# almost all of them sit at 0 or 1, in line with the warnings above.
p.glm = predict(m.glm, type = 'response')
sum(p.glm < 0.00001)## [1] 3046
sum(p.glm > 0.99999)## [1] 954
sum(p.glm >= 0.00001 & p.glm <= 0.99999)## [1] 10
summary(m.glm)##
## Call:
## glm(formula = spam ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.01 0.00 0.00 0.00 1.35
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -30.8167 10548.7431 0.00 1.00
## busi -4.8029 10002.2892 0.00 1.00
## chang -27.1680 22152.8907 0.00 1.00
## compani 4.7813 9186.3255 0.00 1.00
## corpor -0.8286 28181.3478 0.00 1.00
## day -6.0998 5866.2876 0.00 1.00
## done 6.8284 18822.0501 0.00 1.00
## effect 19.4824 21002.4128 0.00 1.00
## effort 16.0582 56700.5792 0.00 1.00
## even -16.5389 22886.6380 0.00 1.00
## full 21.2510 21904.3401 0.00 1.00
## good 5.3994 16193.4281 0.00 1.00
## inform 20.7808 8549.0245 0.00 1.00
## interest 26.9804 11587.5922 0.00 1.00
## list -8.6921 2148.9795 0.00 1.00
## look -7.0307 15631.4459 0.00 1.00
## made 2.8205 27432.2618 0.00 1.00
## make 29.0054 15276.3527 0.00 1.00
## manag 6.0145 14452.5449 0.00 1.00
## market 7.8952 8012.2953 0.00 1.00
## much 0.3775 13921.5777 0.00 1.00
## order 6.5327 12424.0848 0.00 1.00
## origin 32.2628 38175.2472 0.00 1.00
## product 10.1584 13447.6403 0.00 1.00
## provid 0.2423 18589.0873 0.00 1.00
## realli -26.6685 46403.4563 0.00 1.00
## result -0.5002 31401.0516 0.00 1.00
## see -11.1990 12932.4680 0.00 1.00
## special 17.7707 27552.3644 0.00 1.00
## subject 30.4112 10548.7431 0.00 1.00
## system 3.7780 9148.6586 0.00 1.00
## use -13.8535 9381.7631 0.00 1.00
## websit -25.6266 18475.0280 0.00 1.00
## will -11.1938 5980.4800 0.00 1.00
## within 29.0029 21632.4965 0.00 1.00
## without 19.4198 17628.7430 0.00 1.00
## continu 14.8661 15351.2387 0.00 1.00
## group 0.5264 10371.4780 0.00 1.00
## like 5.6494 7659.8787 0.00 1.00
## trade -17.5502 14825.1007 0.00 1.00
## tri 0.9278 12819.6375 0.00 1.00
## approv -1.3015 15894.7792 0.00 1.00
## ask -7.7459 19763.1683 0.00 1.00
## complet -13.6288 20237.9036 0.00 1.00
## credit 26.1738 13138.0027 0.00 1.00
## form 8.4835 16736.5446 0.00 1.00
## hear 28.8653 22809.1143 0.00 1.00
## home 5.9729 8964.8271 0.00 1.00
## new 1.0033 10091.5253 0.00 1.00
## offer 11.7383 10837.2211 0.00 1.00
## opportun -4.1312 19183.2113 0.00 1.00
## rate -3.1121 13189.5698 0.00 1.00
## take 5.7314 17156.1217 0.00 1.00
## time -5.9210 8334.7095 0.00 1.00
## visit 25.8460 11697.8534 0.00 1.00
## want -2.5551 11057.5646 0.00 1.00
## way 13.3897 11375.3854 0.00 1.00
## addit 1.4635 27027.1167 0.00 1.00
## click 13.7612 7076.9896 0.00 1.00
## com 1.9363 4039.2049 0.00 1.00
## fax 3.5370 33855.8899 0.00 1.00
## mail 7.5837 10210.9569 0.00 1.00
## messag 17.1570 2561.5756 0.01 0.99
## now 37.8968 12190.2491 0.00 1.00
## phone -6.9566 11717.6917 0.00 1.00
## request -12.3189 11669.6611 0.00 1.00
## version -36.0636 29386.8036 0.00 1.00
## best -8.2005 1333.3866 -0.01 1.00
## end -13.1054 29380.6882 0.00 1.00
## get 5.1538 9737.0707 0.00 1.00
## great 12.2194 10901.0790 0.00 1.00
## money 32.6355 13212.0683 0.00 1.00
## softwar 25.7485 10593.0947 0.00 1.00
## custom 18.2882 10079.0874 0.00 1.00
## hello 21.6555 13606.7312 0.00 1.00
## one 12.4124 6652.0320 0.00 1.00
## onlin 35.8862 16649.7350 0.00 1.00
## pleas -7.9614 9484.4639 0.00 1.00
## access -14.7972 13353.4899 0.00 1.00
## account 24.8812 8164.7879 0.00 1.00
## allow 18.9918 6436.3710 0.00 1.00
## alreadi -24.0748 33188.2885 0.00 1.00
## also 29.8967 13781.7902 0.00 1.00
## applic -2.6487 16735.5771 0.00 1.00
## area 20.4064 22657.7744 0.00 1.00
## assist -11.2827 24895.2579 0.00 1.00
## base -13.5426 21218.1714 0.00 1.00
## believ 32.3259 21360.0825 0.00 1.00
## buy 41.7019 38923.9252 0.00 1.00
## can 3.7617 7673.8931 0.00 1.00
## cost -1.9376 18329.8873 0.00 1.00
## creat 13.3762 39460.0516 0.00 1.00
## current 3.6291 17066.2426 0.00 1.00
## design -7.9231 29388.9389 0.00 1.00
## develop 5.9764 9454.5606 0.00 1.00
## differ -2.2929 10749.5997 0.00 1.00
## director -17.6981 17932.0129 0.00 1.00
## discuss -10.5101 19154.3531 0.00 1.00
## due -4.1627 35316.3726 0.00 1.00
## email 3.8328 11856.5459 0.00 1.00
## event 16.9419 18505.8473 0.00 1.00
## expect -11.7869 19139.4171 0.00 1.00
## file -29.4324 21649.5737 0.00 1.00
## forward -3.4840 18642.9364 0.00 1.00
## futur 41.4595 14387.2419 0.00 1.00
## gas -3.9009 4160.2926 0.00 1.00
## give -25.1831 21296.8350 0.00 1.00
## given -21.8641 54264.0263 0.00 1.00
## high -1.9820 25536.2328 0.00 1.00
## import -1.8593 22364.3382 0.00 1.00
## includ -3.4544 17988.8913 0.00 1.00
## increas 6.4759 23286.6404 0.00 1.00
## industri -31.6007 23734.8108 0.00 1.00
## invest 32.0125 23934.4148 0.00 1.00
## involv 38.1486 33152.6085 0.00 1.00
## just -10.2116 11140.8256 0.00 1.00
## know 12.7708 15263.5677 0.00 1.00
## locat 20.7257 15965.7168 0.00 1.00
## mani 18.8505 14418.0274 0.00 1.00
## may -9.4339 13969.5651 0.00 1.00
## mean 0.6078 29518.7119 0.00 1.00
## mention -22.7859 27136.9157 0.00 1.00
## might 12.4416 17533.0051 0.00 1.00
## month -3.7267 11123.6690 0.00 1.00
## need 0.8437 12207.6172 0.00 1.00
## note 14.4603 22937.8917 0.00 1.00
## number -9.6218 15914.5979 0.00 1.00
## offic -13.4416 23114.7234 0.00 1.00
## oper -16.9570 27565.6010 0.00 1.00
## person 18.6976 9575.4766 0.00 1.00
## posit -15.4311 23155.9923 0.00 1.00
## possibl -13.6596 24918.1573 0.00 1.00
## present -6.1630 12775.0563 0.00 1.00
## price 3.4276 7849.8596 0.00 1.00
## problem 12.6202 9763.0319 0.00 1.00
## process -0.2957 11905.8484 0.00 1.00
## project 2.1733 14973.0516 0.00 1.00
## read -15.2745 21446.7493 0.00 1.00
## relat -51.1383 17926.4612 0.00 1.00
## report -14.8212 14769.9198 0.00 1.00
## requir 0.5004 29365.4548 0.00 1.00
## research -28.2590 15526.4663 0.00 1.00
## resourc -27.3489 35221.0605 0.00 1.00
## return 17.4510 18435.1876 0.00 1.00
## review -4.8245 10132.7968 0.00 1.00
## risk -4.0008 17177.9984 0.00 1.00
## secur -16.0368 2200.7143 -0.01 0.99
## servic -7.1643 12351.2211 0.00 1.00
## set -9.3532 26268.8952 0.00 1.00
## short -8.9735 17207.5148 0.00 1.00
## specif -23.3669 30834.2030 0.00 1.00
## state 12.2075 16772.1315 0.00 1.00
## term 20.1329 23031.5438 0.00 1.00
## thing 25.7860 13405.1719 0.00 1.00
## today -17.6156 19649.5746 0.00 1.00
## two -25.7267 18439.4399 0.00 1.00
## understand 9.3072 23416.6569 0.00 1.00
## unit -4.0205 30080.6466 0.00 1.00
## well -22.2193 9713.4012 0.00 1.00
## work -10.9874 11596.3171 0.00 1.00
## hour 2.4780 13334.9004 0.00 1.00
## lot -19.6368 13211.3752 0.00 1.00
## real 20.4591 23580.8524 0.00 1.00
## right 23.1185 15904.4579 0.00 1.00
## start 14.3748 18972.2695 0.00 1.00
## X000 14.7384 10583.7959 0.00 1.00
## X2001 -32.1477 13177.6879 0.00 1.00
## follow 17.6578 3079.6809 0.01 1.00
## name 16.7214 13218.4481 0.00 1.00
## sent -14.8820 21953.7964 0.00 1.00
## last 1.0464 13724.4471 0.00 1.00
## avail 8.6511 17094.5716 0.00 1.00
## first -0.4666 20429.8045 0.00 1.00
## http 25.2794 21071.1240 0.00 1.00
## join -38.2408 23338.6228 0.00 1.00
## line 8.7432 12361.5396 0.00 1.00
## next. 14.9230 17244.6865 0.00 1.00
## remov 23.2545 24837.8658 0.00 1.00
## repli 15.3798 29155.6188 0.00 1.00
## wish 11.7309 31747.3794 0.00 1.00
## www -7.8672 22237.5989 0.00 1.00
## year -10.1029 10394.6904 0.00 1.00
## back -13.2347 22723.0238 0.00 1.00
## internet 8.7490 10999.9271 0.00 1.00
## member 13.8130 23429.9086 0.00 1.00
## receiv 0.5765 15848.4961 0.00 1.00
## site 8.6886 14955.3526 0.00 1.00
## anoth -8.7440 20316.9364 0.00 1.00
## associ 9.0494 19093.5414 0.00 1.00
## comment -3.2514 33870.0142 0.00 1.00
## corp 16.0550 27083.0385 0.00 1.00
## date -2.7862 16985.3061 0.00 1.00
## find -2.6228 9727.0946 0.00 1.00
## free 6.1132 8121.0418 0.00 1.00
## issu -37.0837 33960.7079 0.00 1.00
## long -14.8913 19336.4494 0.00 1.00
## move -38.3362 30112.4663 0.00 1.00
## particip -11.5427 17383.3058 0.00 1.00
## recent -2.0667 17795.1699 0.00 1.00
## respons -19.5960 36666.0058 0.00 1.00
## say 7.3662 22174.2442 0.00 1.00
## week -6.7950 10458.9864 0.00 1.00
## dear -2.3132 23063.8923 0.00 1.00
## regard -3.6681 15110.0149 0.00 1.00
## thank -38.9047 10586.9613 0.00 1.00
## address -4.6129 11134.3868 0.00 1.00
## contact 1.5300 12616.5255 0.00 1.00
## engin 26.6429 23936.0768 0.00 1.00
## etc 0.9470 15694.7652 0.00 1.00
## immedi 62.8533 33464.6929 0.00 1.00
## net 12.5616 21972.8129 0.00 1.00
## per 13.6749 12732.8339 0.00 1.00
## place 9.0053 36608.9651 0.00 1.00
## respond 29.7419 38879.3035 0.00 1.00
## sincer -20.7317 35145.2647 0.00 1.00
## type -14.4737 27548.2578 0.00 1.00
## come -1.1662 15107.7386 0.00 1.00
## confirm -12.9969 15139.7258 0.00 1.00
## analysi -24.0500 38603.0306 0.00 1.00
## bring 16.0664 67670.9680 0.00 1.00
## call -1.1450 11111.0678 0.00 1.00
## data -26.0909 22714.2774 0.00 1.00
## detail 11.9692 23008.8487 0.00 1.00
## happi 0.0194 12018.6881 0.00 1.00
## idea -18.4486 38918.5070 0.00 1.00
## info -1.2547 4857.1202 0.00 1.00
## send -24.2677 12224.2134 0.00 1.00
## success 4.3436 27830.4737 0.00 1.00
## sure -5.5027 20777.0982 0.00 1.00
## team 7.9405 25703.8499 0.00 1.00
## web 2.7907 16859.8166 0.00 1.00
## don 21.2866 14561.0671 0.00 1.00
## copi -42.7383 30699.5682 0.00 1.00
## help 17.3096 2790.8998 0.01 1.00
## part 4.5943 34830.4298 0.00 1.00
## life 58.1246 38643.0827 0.00 1.00
## meet -1.0626 12633.5575 0.00 1.00
## sever 20.4120 30927.2811 0.00 1.00
## question -34.6747 18588.4409 0.00 1.00
## write 44.0618 28249.1186 0.00 1.00
## think -12.1812 20772.9999 0.00 1.00
## point 5.4984 34025.6562 0.00 1.00
## let -27.6334 14620.6750 0.00 1.00
## link -6.9285 13446.9461 0.00 1.00
## communic 15.7955 8958.0878 0.00 1.00
## contract -12.9540 14984.7437 0.00 1.00
## either -27.4425 39997.0170 0.00 1.00
## final 8.0749 50075.4525 0.00 1.00
## howev -34.4927 35618.8571 0.00 1.00
## peopl -18.6379 14389.7479 0.00 1.00
## power -5.6431 11727.1593 0.00 1.00
## put -10.5189 26812.4322 0.00 1.00
## run -51.6220 44337.5157 0.00 1.00
## shall 19.2987 30748.7762 0.00 1.00
## soon 23.4975 37313.2839 0.00 1.00
## support -15.3927 19761.5524 0.00 1.00
## attach -10.3659 15343.2999 0.00 1.00
## abl -2.0485 20883.2671 0.00 1.00
## program 1.4441 11831.1619 0.00 1.00
## sorri 6.0356 22992.8231 0.00 1.00
## valu 0.9024 13599.5916 0.00 1.00
## check 1.4252 19631.4419 0.00 1.00
## feel 2.5959 23476.2770 0.00 1.00
## better 42.6315 23599.8880 0.00 1.00
## plan -18.3036 6320.4988 0.00 1.00
## experi 2.4597 22404.6552 0.00 1.00
## hope -14.3545 21794.8858 0.00 1.00
## begin 22.2801 29731.4126 0.00 1.00
## X2000 -36.3065 15559.7816 0.00 1.00
## case -33.7240 28804.2328 0.00 1.00
## depart -40.6847 25092.9541 0.00 1.00
## financi -9.7467 17271.8378 0.00 1.00
## houston -18.5450 7305.0368 0.00 1.00
## intern -7.9907 33512.7815 0.00 1.00
## john -0.5326 28562.0674 0.00 1.00
## juli -13.5778 30093.2708 0.00 1.00
## mark -33.5007 32080.8705 0.00 1.00
## open 21.1417 29613.7993 0.00 1.00
## public -52.4985 23410.5823 0.00 1.00
## sinc -3.4385 35455.9821 0.00 1.00
## still 3.8779 26222.2125 0.00 1.00
## thought 12.4329 30228.1125 0.00 1.00
## univers 12.2758 21969.4115 0.00 1.00
## appreci -21.4464 27616.2809 0.00 1.00
## keep 18.6660 27816.0700 0.00 1.00
## cours 16.6526 18338.3815 0.00 1.00
## direct -20.5061 31942.8882 0.00 1.00
## togeth -23.5481 18689.9723 0.00 1.00
## energi -16.1971 16457.8766 0.00 1.00
## london 6.7453 16419.7348 0.00 1.00
## updat -15.0978 14480.7186 0.00 1.00
## suggest -38.4217 44745.1860 0.00 1.00
## option -1.0852 9325.3243 0.00 1.00
## monday -1.0340 32330.8096 0.00 1.00
## kevin -37.7904 47379.7471 0.00 1.00
## book 4.3007 20235.7919 0.00 1.00
## deal -11.2937 14476.4873 0.00 1.00
## invit 4.3037 22150.2429 0.00 1.00
## tuesday -28.0830 39588.8687 0.00 1.00
## interview -16.4048 18733.9704 0.00 1.00
## schedul 1.9191 35796.8427 0.00 1.00
## school -3.8701 28823.4689 0.00 1.00
## model -22.9233 10487.3469 0.00 1.00
## financ -9.1224 7523.9504 0.00 1.00
## morn -26.4476 34027.8914 0.00 1.00
## attend -34.5055 32573.3227 0.00 1.00
## robert -20.9550 29071.4318 0.00 1.00
## student -18.1473 21856.4156 0.00 1.00
## april -26.2027 22080.5315 0.00 1.00
## talk -10.1057 20206.4181 0.00 1.00
## arrang 10.6947 21352.2139 0.00 1.00
## deriv -49.7106 35873.6725 0.00 1.00
## thursday -14.9135 32617.9203 0.00 1.00
## resum -9.2191 20996.1407 0.00 1.00
## doc -25.9712 26031.8371 0.00 1.00
## confer -0.7503 8557.3634 0.00 1.00
## wednesday -15.2636 26422.7645 0.00 1.00
## edu -0.2122 691.7410 0.00 1.00
## friday -11.4616 19964.7326 0.00 1.00
## ect 0.8685 5341.5129 0.00 1.00
## hou 6.8515 6436.8947 0.00 1.00
## vinc -37.3476 8647.1553 0.00 1.00
## X853 -1.2123 59416.8273 0.00 1.00
## shirley -71.3287 63289.3774 0.00 1.00
## enron -8.7888 5718.8782 0.00 1.00
## kaminski -18.1196 6029.0713 0.00 1.00
## X713 -24.2730 29138.3799 0.00 1.00
## crenshaw 99.9441 67692.0276 0.00 1.00
## vkamin -66.4898 57028.7697 0.00 1.00
## gibner 29.0119 24595.4818 0.00 1.00
## stinson -43.4535 26967.0175 0.00 1.00
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4409.49 on 4009 degrees of freedom
## Residual deviance: 13.46 on 3679 degrees of freedom
## AIC: 675.5
##
## Number of Fisher Scoring iterations: 25
# Number of GLM coefficients with a p-value (4th column of the coefficient
# table) below 0.05.
sum(coef(summary(m.glm))[, 4] < 0.05)## [1] 0

# Plot the classification tree.
prp(m.cart)
# Recall that we suspect these word stems are specific to Vincent Kaminski and
# might affect the generalizability of a spam filter built with his ham data.
# Training-set accuracy (0.5 cut-off) and AUC for the three models.

# Share of cases in which `prob > cutoff` agrees with `actual`.
# The thresholded prediction is turned into a two-level factor so that the
# confusion matrix stays 2 x 2 even when every case falls on the same side
# of the cut-off; with a one-column table diag() would read a single cell.
accuracy = function(actual, prob, cutoff = 0.5) {
  cm = table(actual, factor(prob > cutoff, levels = c(FALSE, TRUE)))
  sum(diag(cm)) / sum(cm)
}

# Logistic regression.
accuracy(train$spam, p.glm)## [1] 0.999
colAUC(p.glm, train$spam)## [,1]
## 0 vs. 1 1

# CART: second column of the class-probability matrix.
p.cart = predict(m.cart)[, 2]
accuracy(train$spam, p.cart)## [1] 0.94239
colAUC(p.cart, train$spam)## [,1]
## 0 vs. 1 0.9696

# Random forest: second column of the probability matrix. Without new data
# predict() returns out-of-bag predictions (see the randomForest docs).
p.rf = predict(m.rf, type = 'prob')[, 2]
accuracy(train$spam, p.rf)## [1] 0.9808
colAUC(p.rf, train$spam)## [,1]
## 0 vs. 1 0.9979

# Side-by-side summary: one row per model, columns ACC and AUC.
pred = data.frame(glm = p.glm, cart = p.cart, rf = p.rf)
rbind(
  ACC = vapply(pred, function(x) accuracy(train$spam, x), numeric(1)),
  colAUC(pred, train$spam)
) %>% t ## ACC 0 vs. 1
## glm 0.99900 1.0000
## cart 0.94239 0.9696
## rf 0.98080 0.9979
# Obtain predicted probabilities on the testing set for each of the models,
# then compute their accuracy (0.5 cut-off) and AUC.
# Predicted spam probabilities on the test set, one column per model.
pred2 = data.frame(
  glm = predict(m.glm, newdata = test, type = 'response'),
  cart = predict(m.cart, newdata = test)[, 2],
  rf = predict(m.rf, newdata = test, type = 'prob')[, 2])

# Accuracy at the 0.5 cut-off and AUC, one row per model. The thresholded
# prediction is made a two-level factor so the confusion matrix stays 2 x 2
# even if a model puts every case on the same side of the cut-off.
rbind(
  ACC = vapply(pred2, function(x) {
    cm = table(test$spam, factor(x > 0.5, levels = c(FALSE, TRUE)))
    sum(diag(cm)) / sum(cm)
  }, numeric(1)),
  AUC = colAUC(pred2, test$spam)) %>% t## ACC 0 vs. 1
## glm 0.95052 0.96275
## cart 0.93946 0.96318
## rf 0.97555 0.99777
# See the table above.