# Packages required by this analysis. Any that are missing from the local
# library are installed one by one before the script continues.
packages <- c(
  "dplyr", "ggplot2", "caTools", "tm", "SnowballC", "ROCR", "rpart",
  "rpart.plot", "randomForest"
)
existing <- as.character(installed.packages()[, "Package"])
for (pkg in setdiff(packages, existing)) {
  install.packages(pkg)
}
# Session setup: empty the workspace (hidden objects included), switch every
# locale category to "C", and fix the numeric print options used by the
# recorded output in this file.
rm(list = ls(all.names = TRUE))
Sys.setlocale(category = "LC_ALL", locale = "C")
## [1] "C"
options(digits = 5, scipen = 10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
# Load the e-mail data. `text` is kept as character (not factor) so it can be
# fed to the tm corpus later; `spam` is the 0/1 target.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
D <- read.csv("data/emails.csv", stringsAsFactors = FALSE)
# Number of e-mails.
nrow(D)
## [1] 5728
# Class balance of the target.
table(D$spam)
##
## 0 1
## 4360 1368
# Peek at the first 60 characters of the first five messages.
substr(D$text[1:5], 1, 60)
## [1] "Subject: naturally irresistible your corporate identity lt "
## [2] "Subject: the stock trading gunslinger fanny is merrill but "
## [3] "Subject: unbelievable new homes made easy im wanting to sho"
## [4] "Subject: 4 color printing special request additional inform"
## [5] "Subject: do not have money , get software cds from here ! s"
【P1.4】Could a spam classifier potentially benefit from including the frequency of the word that appears in every email?
# Character count of the longest e-mail.
max(nchar(D$text))
## [1] 43952
# Row index of the shortest e-mail.
which.min(nchar(D$text))
## [1] 1992
# Build the corpus and normalise the text in a fixed order: lower-case,
# strip punctuation, drop English stop words, then stem.
# In the recorded run, each of the four tm_map() steps emitted
# "Warning in tm_map.SimpleCorpus(...): transformation drops documents".
corp <- D$text %>%
  VectorSource() %>%
  Corpus() %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stemDocument)
# One row per e-mail, one column per stem, cells hold term frequencies.
dtm <- DocumentTermMatrix(corp)
【P2.1】How many terms are in dtm?
# Print the document-term matrix summary; the header line reports the number
# of documents and terms.
dtm
## <<DocumentTermMatrix (documents: 5728, terms: 28687)>>
## Non-/sparse entries: 481719/163837417
## Sparsity : 100%
## Maximal term length: 24
## Weighting : term frequency (tf)
Limit dtm to the terms that appear in at least 5% of the documents, and store the result in spdtm.
# Keep only the terms whose sparsity does not exceed the 0.95 threshold.
spdtm <- removeSparseTerms(dtm, sparse = 0.95)
【P2.2】How many terms are in spdtm?
# Print the reduced document-term matrix summary; the header line reports the
# number of terms that survived the sparsity filter.
spdtm
## <<DocumentTermMatrix (documents: 5728, terms: 330)>>
## Non-/sparse entries: 213551/1676689
## Sparsity : 89%
## Maximal term length: 10
## Weighting : term frequency (tf)
Build a data frame ems from spdtm.
# Densify the sparse matrix and turn it into a data frame with one column
# per stem.
ems <- spdtm %>%
  as.matrix() %>%
  as.data.frame()
【P2.3】What is the most frequent word in spdtm?
# Total occurrences per stem, sorted ascending; the last entries are the most
# frequent stems.
tail(sort(colSums(ems)))
## hou will vinc subject ect enron
## 5577 8252 8532 10202 11427 13388
Incorporate target variable spam
# Attach the 0/1 target from the raw data as the column `spam`.
ems[["spam"]] <- D$spam
【P2.4】How many word stems appear at least 5000 times in the ham emails in the dataset?
# Ten largest column totals across the ham (spam == 0) e-mails.
tail(sort(colSums(subset(ems, spam == 0))), 10)
## com pleas kaminski 2000 hou will vinc subject
## 4444 4494 4801 4935 5569 6802 8531 8625
## ect enron
## 11417 13388
【P2.5】How many word stems appear at least 1000 times in the spam emails in the dataset?
# Word stems that occur at least 1000 times across the spam (spam == 1)
# e-mails. The target column `spam` is dropped before summing: its column
# total is the number of spam messages (1368), not a word count, so it must
# not be reported as a stem. `>=` matches the "at least 1000" wording.
subset(ems, spam == 1, select = -spam) %>% colSums %>% {.[. >= 1000]}
## compani subject will
## 1065 1577 1450
【P2.6】The lists of most common words are significantly different between the spam and ham emails. What does this likely imply?
【P2.7】Several of the most common word stems from the ham documents, such as “enron”, “hou” (short for Houston), “vinc” (the word stem of “Vince”) and “kaminski”, are likely specific to Vincent Kaminski’s inbox. What does this mean about the applicability of the text analytics models we will train for the spam filtering problem?
Split the data and build GLM, CART and random forest models …
# Turn the target into a factor so that rpart and randomForest fit
# classifiers, and make every column name a syntactically valid R name
# (numeric stems get an "X" prefix, as seen in the coefficient table).
ems$spam <- factor(ems$spam)
names(ems) <- make.names(names(ems))
# 70/30 train/test split on the target, seeded for reproducibility.
set.seed(123)
spl <- sample.split(ems$spam, SplitRatio = 0.7)
train <- ems[spl, ]
test <- ems[!spl, ]
# Class shares in the test set.
prop.table(table(test$spam)) # 0.76135
##
## 0 1
## 0.76135 0.23865
# Logistic regression on all stems.
m.glm <- glm(spam ~ ., family = "binomial", data = train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Classification tree.
m.cart <- rpart(spam ~ ., data = train, method = "class")
# Random forest; the seed is reset immediately before fitting.
set.seed(123)
m.rf <- randomForest(spam ~ ., data = train)
# Fitted spam probabilities of the logistic model on the training rows.
p.glm <- predict(m.glm, type = "response")
【P3.1a】 How many of the training set predicted probabilities from the logistic regression model (m.glm) are less than 0.00001?
# Count training predictions that are numerically indistinguishable from 0.
(p.glm < 0.00001) %>% sum
## [1] 3046
【P3.1b】 How many of the training set predicted probabilities from the logistic regression model (m.glm) are more than 0.99999?
# Count training predictions that are numerically indistinguishable from 1.
(p.glm > 0.99999) %>% sum
## [1] 954
【P3.1c】 How many of the training set predicted probabilities from the logistic regression model (m.glm) are between 0.00001 and 0.99999?
# Count the remaining training predictions, i.e. those inside the closed
# interval [0.00001, 0.99999].
(p.glm >= 0.00001 & p.glm <= 0.99999) %>% sum
## [1] 10
【P3.2】How many variables are labeled as significant (at the p=0.05 level) in the logistic regression summary output?
# Print the coefficient table of the logistic model; the Pr(>|z|) column
# holds the per-coefficient p-values.
summary(m.glm)
##
## Call:
## glm(formula = spam ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.01 0.00 0.00 0.00 1.35
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -30.8167 10548.7431 0.00 1.00
## busi -4.8029 10002.2892 0.00 1.00
## chang -27.1680 22152.8907 0.00 1.00
## compani 4.7813 9186.3255 0.00 1.00
## corpor -0.8286 28181.3478 0.00 1.00
## day -6.0998 5866.2876 0.00 1.00
## done 6.8284 18822.0500 0.00 1.00
## effect 19.4824 21002.4128 0.00 1.00
## effort 16.0582 56700.5791 0.00 1.00
## even -16.5389 22886.6379 0.00 1.00
## full 21.2510 21904.3401 0.00 1.00
## good 5.3994 16193.4281 0.00 1.00
## inform 20.7808 8549.0245 0.00 1.00
## interest 26.9804 11587.5921 0.00 1.00
## list -8.6921 2148.9795 0.00 1.00
## look -7.0307 15631.4459 0.00 1.00
## made 2.8205 27432.2618 0.00 1.00
## make 29.0054 15276.3527 0.00 1.00
## manag 6.0145 14452.5449 0.00 1.00
## market 7.8952 8012.2953 0.00 1.00
## much 0.3775 13921.5777 0.00 1.00
## order 6.5327 12424.0848 0.00 1.00
## origin 32.2628 38175.2472 0.00 1.00
## product 10.1584 13447.6403 0.00 1.00
## provid 0.2423 18589.0873 0.00 1.00
## realli -26.6685 46403.4563 0.00 1.00
## result -0.5002 31401.0516 0.00 1.00
## see -11.1990 12932.4679 0.00 1.00
## special 17.7707 27552.3644 0.00 1.00
## subject 30.4112 10548.7431 0.00 1.00
## system 3.7780 9148.6586 0.00 1.00
## use -13.8535 9381.7631 0.00 1.00
## websit -25.6266 18475.0280 0.00 1.00
## will -11.1938 5980.4800 0.00 1.00
## within 29.0029 21632.4965 0.00 1.00
## without 19.4198 17628.7430 0.00 1.00
## continu 14.8661 15351.2387 0.00 1.00
## group 0.5264 10371.4780 0.00 1.00
## like 5.6494 7659.8787 0.00 1.00
## trade -17.5502 14825.1007 0.00 1.00
## tri 0.9278 12819.6375 0.00 1.00
## approv -1.3015 15894.7792 0.00 1.00
## ask -7.7459 19763.1683 0.00 1.00
## complet -13.6288 20237.9036 0.00 1.00
## credit 26.1738 13138.0027 0.00 1.00
## form 8.4835 16736.5446 0.00 1.00
## hear 28.8653 22809.1143 0.00 1.00
## home 5.9729 8964.8271 0.00 1.00
## new 1.0033 10091.5253 0.00 1.00
## offer 11.7383 10837.2211 0.00 1.00
## opportun -4.1312 19183.2113 0.00 1.00
## rate -3.1121 13189.5698 0.00 1.00
## take 5.7314 17156.1217 0.00 1.00
## time -5.9210 8334.7095 0.00 1.00
## visit 25.8460 11697.8534 0.00 1.00
## want -2.5551 11057.5646 0.00 1.00
## way 13.3897 11375.3854 0.00 1.00
## addit 1.4635 27027.1167 0.00 1.00
## click 13.7612 7076.9896 0.00 1.00
## com 1.9363 4039.2049 0.00 1.00
## fax 3.5370 33855.8899 0.00 1.00
## mail 7.5837 10210.9569 0.00 1.00
## messag 17.1570 2561.5756 0.01 0.99
## now 37.8968 12190.2490 0.00 1.00
## phone -6.9566 11717.6917 0.00 1.00
## request -12.3189 11669.6611 0.00 1.00
## version -36.0636 29386.8036 0.00 1.00
## best -8.2005 1333.3866 -0.01 1.00
## end -13.1054 29380.6882 0.00 1.00
## get 5.1538 9737.0707 0.00 1.00
## great 12.2194 10901.0790 0.00 1.00
## money 32.6355 13212.0683 0.00 1.00
## softwar 25.7485 10593.0947 0.00 1.00
## custom 18.2882 10079.0874 0.00 1.00
## hello 21.6555 13606.7312 0.00 1.00
## one 12.4124 6652.0320 0.00 1.00
## onlin 35.8862 16649.7350 0.00 1.00
## pleas -7.9614 9484.4639 0.00 1.00
## access -14.7972 13353.4899 0.00 1.00
## account 24.8812 8164.7879 0.00 1.00
## allow 18.9918 6436.3710 0.00 1.00
## alreadi -24.0748 33188.2885 0.00 1.00
## also 29.8967 13781.7901 0.00 1.00
## applic -2.6487 16735.5771 0.00 1.00
## area 20.4064 22657.7744 0.00 1.00
## assist -11.2827 24895.2579 0.00 1.00
## base -13.5426 21218.1714 0.00 1.00
## believ 32.3259 21360.0825 0.00 1.00
## buy 41.7019 38923.9252 0.00 1.00
## can 3.7617 7673.8931 0.00 1.00
## cost -1.9376 18329.8873 0.00 1.00
## creat 13.3762 39460.0516 0.00 1.00
## current 3.6291 17066.2426 0.00 1.00
## design -7.9231 29388.9389 0.00 1.00
## develop 5.9764 9454.5606 0.00 1.00
## differ -2.2929 10749.5997 0.00 1.00
## director -17.6981 17932.0129 0.00 1.00
## discuss -10.5101 19154.3531 0.00 1.00
## due -4.1627 35316.3726 0.00 1.00
## email 3.8328 11856.5459 0.00 1.00
## event 16.9419 18505.8473 0.00 1.00
## expect -11.7869 19139.4171 0.00 1.00
## file -29.4324 21649.5737 0.00 1.00
## forward -3.4840 18642.9364 0.00 1.00
## futur 41.4595 14387.2419 0.00 1.00
## gas -3.9009 4160.2926 0.00 1.00
## give -25.1831 21296.8349 0.00 1.00
## given -21.8641 54264.0263 0.00 1.00
## high -1.9820 25536.2327 0.00 1.00
## import -1.8593 22364.3382 0.00 1.00
## includ -3.4544 17988.8912 0.00 1.00
## increas 6.4759 23286.6404 0.00 1.00
## industri -31.6007 23734.8108 0.00 1.00
## invest 32.0125 23934.4148 0.00 1.00
## involv 38.1486 33152.6085 0.00 1.00
## just -10.2116 11140.8256 0.00 1.00
## know 12.7708 15263.5677 0.00 1.00
## locat 20.7257 15965.7168 0.00 1.00
## mani 18.8505 14418.0274 0.00 1.00
## may -9.4339 13969.5651 0.00 1.00
## mean 0.6078 29518.7119 0.00 1.00
## mention -22.7859 27136.9157 0.00 1.00
## might 12.4416 17533.0051 0.00 1.00
## month -3.7267 11123.6690 0.00 1.00
## need 0.8437 12207.6171 0.00 1.00
## note 14.4603 22937.8916 0.00 1.00
## number -9.6218 15914.5979 0.00 1.00
## offic -13.4416 23114.7234 0.00 1.00
## oper -16.9570 27565.6010 0.00 1.00
## person 18.6976 9575.4766 0.00 1.00
## posit -15.4311 23155.9923 0.00 1.00
## possibl -13.6596 24918.1573 0.00 1.00
## present -6.1630 12775.0563 0.00 1.00
## price 3.4276 7849.8596 0.00 1.00
## problem 12.6202 9763.0319 0.00 1.00
## process -0.2957 11905.8484 0.00 1.00
## project 2.1733 14973.0515 0.00 1.00
## read -15.2745 21446.7493 0.00 1.00
## relat -51.1383 17926.4612 0.00 1.00
## report -14.8212 14769.9198 0.00 1.00
## requir 0.5004 29365.4547 0.00 1.00
## research -28.2590 15526.4663 0.00 1.00
## resourc -27.3489 35221.0605 0.00 1.00
## return 17.4510 18435.1876 0.00 1.00
## review -4.8245 10132.7968 0.00 1.00
## risk -4.0008 17177.9984 0.00 1.00
## secur -16.0368 2200.7143 -0.01 0.99
## servic -7.1643 12351.2210 0.00 1.00
## set -9.3532 26268.8952 0.00 1.00
## short -8.9735 17207.5148 0.00 1.00
## specif -23.3669 30834.2029 0.00 1.00
## state 12.2075 16772.1315 0.00 1.00
## term 20.1329 23031.5438 0.00 1.00
## thing 25.7860 13405.1719 0.00 1.00
## today -17.6156 19649.5746 0.00 1.00
## two -25.7267 18439.4399 0.00 1.00
## understand 9.3072 23416.6569 0.00 1.00
## unit -4.0205 30080.6466 0.00 1.00
## well -22.2193 9713.4012 0.00 1.00
## work -10.9874 11596.3171 0.00 1.00
## hour 2.4780 13334.9003 0.00 1.00
## lot -19.6368 13211.3752 0.00 1.00
## real 20.4591 23580.8524 0.00 1.00
## right 23.1185 15904.4579 0.00 1.00
## start 14.3748 18972.2695 0.00 1.00
## X000 14.7384 10583.7959 0.00 1.00
## X2001 -32.1477 13177.6879 0.00 1.00
## follow 17.6578 3079.6809 0.01 1.00
## name 16.7214 13218.4481 0.00 1.00
## sent -14.8820 21953.7964 0.00 1.00
## last 1.0464 13724.4471 0.00 1.00
## avail 8.6511 17094.5716 0.00 1.00
## first -0.4666 20429.8045 0.00 1.00
## http 25.2794 21071.1240 0.00 1.00
## join -38.2408 23338.6228 0.00 1.00
## line 8.7432 12361.5396 0.00 1.00
## next. 14.9230 17244.6865 0.00 1.00
## remov 23.2545 24837.8658 0.00 1.00
## repli 15.3798 29155.6188 0.00 1.00
## wish 11.7309 31747.3794 0.00 1.00
## www -7.8672 22237.5989 0.00 1.00
## year -10.1029 10394.6904 0.00 1.00
## back -13.2347 22723.0238 0.00 1.00
## internet 8.7490 10999.9271 0.00 1.00
## member 13.8130 23429.9085 0.00 1.00
## receiv 0.5765 15848.4961 0.00 1.00
## site 8.6886 14955.3526 0.00 1.00
## anoth -8.7440 20316.9364 0.00 1.00
## associ 9.0494 19093.5413 0.00 1.00
## comment -3.2514 33870.0142 0.00 1.00
## corp 16.0550 27083.0385 0.00 1.00
## date -2.7862 16985.3060 0.00 1.00
## find -2.6228 9727.0946 0.00 1.00
## free 6.1132 8121.0418 0.00 1.00
## issu -37.0837 33960.7079 0.00 1.00
## long -14.8913 19336.4493 0.00 1.00
## move -38.3362 30112.4663 0.00 1.00
## particip -11.5427 17383.3058 0.00 1.00
## recent -2.0667 17795.1699 0.00 1.00
## respons -19.5960 36666.0058 0.00 1.00
## say 7.3662 22174.2442 0.00 1.00
## week -6.7950 10458.9864 0.00 1.00
## dear -2.3132 23063.8923 0.00 1.00
## regard -3.6681 15110.0149 0.00 1.00
## thank -38.9047 10586.9613 0.00 1.00
## address -4.6129 11134.3868 0.00 1.00
## contact 1.5300 12616.5255 0.00 1.00
## engin 26.6429 23936.0768 0.00 1.00
## etc 0.9470 15694.7652 0.00 1.00
## immedi 62.8533 33464.6929 0.00 1.00
## net 12.5616 21972.8129 0.00 1.00
## per 13.6749 12732.8339 0.00 1.00
## place 9.0053 36608.9650 0.00 1.00
## respond 29.7419 38879.3034 0.00 1.00
## sincer -20.7317 35145.2647 0.00 1.00
## type -14.4737 27548.2578 0.00 1.00
## come -1.1662 15107.7386 0.00 1.00
## confirm -12.9969 15139.7258 0.00 1.00
## analysi -24.0500 38603.0306 0.00 1.00
## bring 16.0664 67670.9680 0.00 1.00
## call -1.1450 11111.0678 0.00 1.00
## data -26.0909 22714.2774 0.00 1.00
## detail 11.9692 23008.8487 0.00 1.00
## happi 0.0194 12018.6881 0.00 1.00
## idea -18.4486 38918.5070 0.00 1.00
## info -1.2547 4857.1202 0.00 1.00
## send -24.2677 12224.2134 0.00 1.00
## success 4.3436 27830.4737 0.00 1.00
## sure -5.5027 20777.0982 0.00 1.00
## team 7.9405 25703.8499 0.00 1.00
## web 2.7907 16859.8165 0.00 1.00
## don 21.2866 14561.0671 0.00 1.00
## copi -42.7383 30699.5682 0.00 1.00
## help 17.3096 2790.8998 0.01 1.00
## part 4.5943 34830.4298 0.00 1.00
## life 58.1246 38643.0827 0.00 1.00
## meet -1.0626 12633.5575 0.00 1.00
## sever 20.4120 30927.2811 0.00 1.00
## question -34.6747 18588.4409 0.00 1.00
## write 44.0618 28249.1186 0.00 1.00
## think -12.1812 20772.9999 0.00 1.00
## point 5.4984 34025.6561 0.00 1.00
## let -27.6334 14620.6750 0.00 1.00
## link -6.9285 13446.9461 0.00 1.00
## communic 15.7955 8958.0878 0.00 1.00
## contract -12.9540 14984.7437 0.00 1.00
## either -27.4425 39997.0170 0.00 1.00
## final 8.0749 50075.4525 0.00 1.00
## howev -34.4927 35618.8571 0.00 1.00
## peopl -18.6379 14389.7479 0.00 1.00
## power -5.6431 11727.1593 0.00 1.00
## put -10.5189 26812.4322 0.00 1.00
## run -51.6220 44337.5156 0.00 1.00
## shall 19.2987 30748.7761 0.00 1.00
## soon 23.4975 37313.2839 0.00 1.00
## support -15.3927 19761.5524 0.00 1.00
## attach -10.3659 15343.2998 0.00 1.00
## abl -2.0485 20883.2671 0.00 1.00
## program 1.4441 11831.1619 0.00 1.00
## sorri 6.0356 22992.8231 0.00 1.00
## valu 0.9024 13599.5916 0.00 1.00
## check 1.4252 19631.4419 0.00 1.00
## feel 2.5959 23476.2770 0.00 1.00
## better 42.6315 23599.8879 0.00 1.00
## plan -18.3036 6320.4988 0.00 1.00
## experi 2.4597 22404.6552 0.00 1.00
## hope -14.3545 21794.8858 0.00 1.00
## begin 22.2801 29731.4126 0.00 1.00
## X2000 -36.3065 15559.7816 0.00 1.00
## case -33.7240 28804.2328 0.00 1.00
## depart -40.6847 25092.9541 0.00 1.00
## financi -9.7467 17271.8378 0.00 1.00
## houston -18.5450 7305.0368 0.00 1.00
## intern -7.9907 33512.7814 0.00 1.00
## john -0.5326 28562.0674 0.00 1.00
## juli -13.5778 30093.2708 0.00 1.00
## mark -33.5007 32080.8705 0.00 1.00
## open 21.1417 29613.7993 0.00 1.00
## public -52.4985 23410.5823 0.00 1.00
## sinc -3.4385 35455.9820 0.00 1.00
## still 3.8779 26222.2125 0.00 1.00
## thought 12.4329 30228.1125 0.00 1.00
## univers 12.2758 21969.4115 0.00 1.00
## appreci -21.4464 27616.2809 0.00 1.00
## keep 18.6660 27816.0700 0.00 1.00
## cours 16.6526 18338.3815 0.00 1.00
## direct -20.5061 31942.8882 0.00 1.00
## togeth -23.5481 18689.9723 0.00 1.00
## energi -16.1971 16457.8766 0.00 1.00
## london 6.7453 16419.7348 0.00 1.00
## updat -15.0978 14480.7185 0.00 1.00
## suggest -38.4217 44745.1860 0.00 1.00
## option -1.0852 9325.3243 0.00 1.00
## monday -1.0340 32330.8096 0.00 1.00
## kevin -37.7904 47379.7471 0.00 1.00
## book 4.3007 20235.7919 0.00 1.00
## deal -11.2937 14476.4873 0.00 1.00
## invit 4.3037 22150.2429 0.00 1.00
## tuesday -28.0830 39588.8687 0.00 1.00
## interview -16.4048 18733.9704 0.00 1.00
## schedul 1.9191 35796.8427 0.00 1.00
## school -3.8701 28823.4689 0.00 1.00
## model -22.9233 10487.3469 0.00 1.00
## financ -9.1224 7523.9504 0.00 1.00
## morn -26.4476 34027.8914 0.00 1.00
## attend -34.5055 32573.3227 0.00 1.00
## robert -20.9550 29071.4318 0.00 1.00
## student -18.1473 21856.4156 0.00 1.00
## april -26.2027 22080.5315 0.00 1.00
## talk -10.1057 20206.4181 0.00 1.00
## arrang 10.6947 21352.2139 0.00 1.00
## deriv -49.7106 35873.6724 0.00 1.00
## thursday -14.9135 32617.9203 0.00 1.00
## resum -9.2191 20996.1407 0.00 1.00
## doc -25.9712 26031.8370 0.00 1.00
## confer -0.7503 8557.3634 0.00 1.00
## wednesday -15.2636 26422.7645 0.00 1.00
## edu -0.2122 691.7410 0.00 1.00
## friday -11.4616 19964.7326 0.00 1.00
## ect 0.8685 5341.5129 0.00 1.00
## hou 6.8515 6436.8947 0.00 1.00
## vinc -37.3476 8647.1553 0.00 1.00
## X853 -1.2123 59416.8273 0.00 1.00
## shirley -71.3287 63289.3774 0.00 1.00
## enron -8.7888 5718.8782 0.00 1.00
## kaminski -18.1196 6029.0713 0.00 1.00
## X713 -24.2730 29138.3799 0.00 1.00
## crenshaw 99.9441 67692.0276 0.00 1.00
## vkamin -66.4898 57028.7697 0.00 1.00
## gibner 29.0119 24595.4818 0.00 1.00
## stinson -43.4535 26967.0175 0.00 1.00
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4409.49 on 4009 degrees of freedom
## Residual deviance: 13.46 on 3679 degrees of freedom
## AIC: 675.5
##
## Number of Fisher Scoring iterations: 25
# Number of predictors whose p-value is below 0.05.
# The coefficient matrix is read through coef() and the p-value column is
# selected by name, instead of relying on partial matching of `$coef` and on
# the column position. The intercept row is excluded because it is not a
# word-stem variable.
coef(summary(m.glm)) %>%
  {.[rownames(.) != "(Intercept)", "Pr(>|z|)"]} %>%
  {sum(. < 0.05)}
## [1] 0
【P3.3】How many of the word stems “enron”, “hou”, “vinc”, and “kaminski” appear in the CART tree?
# Plot the fitted classification tree so the stems used as split variables
# can be read off the diagram.
prp(m.cart)
Recall that we suspect these word stems are specific to Vincent Kaminski and might affect the generalizability of a spam filter built with his ham data.
# Training-set accuracy (0.5 cut-off) and AUC for each model.
# Accuracy is the share of rows whose thresholded prediction agrees with the
# label. Unlike sum(diag(table(...))), this stays correct when every
# prediction falls on the same side of the cut-off and the confusion table
# collapses to a single column.

# Logistic regression.
mean((p.glm > 0.5) == (train$spam == "1"))
## [1] 0.999
colAUC(p.glm, train$spam)
## [,1]
## 0 vs. 1 1
# Classification tree: column 2 of the class-probability matrix is P(spam).
p.cart <- predict(m.cart)[, 2]
mean((p.cart > 0.5) == (train$spam == "1"))
## [1] 0.94239
colAUC(p.cart, train$spam)
## [,1]
## 0 vs. 1 0.9696
# Random forest.
# NOTE(review): predict() is called without newdata; per the randomForest
# documentation this returns out-of-bag predictions, so these figures are
# OOB estimates rather than resubstitution metrics - confirm this is intended.
p.rf <- predict(m.rf, type = "prob")[, 2]
mean((p.rf > 0.5) == (train$spam == "1"))
## [1] 0.9808
colAUC(p.rf, train$spam)
## [,1]
## 0 vs. 1 0.9979
# Side-by-side training metrics: one row per model, columns are accuracy at
# the 0.5 cut-off and AUC (labelled "0 vs. 1" by colAUC).
pred <- data.frame(glm = p.glm, cart = p.cart, rf = p.rf)
rbind(
  # vapply() walks the data-frame columns directly (no matrix coercion as
  # with apply()) and guarantees one numeric value per model. Accuracy is
  # computed without table()/diag(), so it stays correct even when all
  # predictions fall on one side of the cut-off.
  ACC = vapply(
    pred,
    function(x) mean((x > 0.5) == (train$spam == "1")),
    numeric(1)
  ),
  colAUC(pred, train$spam)
) %>% t
## ACC 0 vs. 1
## glm 0.99900 1.0000
## cart 0.94239 0.9696
## rf 0.98080 0.9979
Obtain predicted probabilities for the testing set for each of the models, then compare their accuracy and AUC.
# Predicted spam probabilities on the held-out test set, one column per model.
pred2 <- data.frame(
  glm = predict(m.glm, test, type = "response"),
  cart = predict(m.cart, test)[, 2],
  rf = predict(m.rf, test, type = "prob")[, 2]
)
# Test-set accuracy (0.5 cut-off) and AUC, one row per model.
rbind(
  # vapply() iterates the data-frame columns without coercing to a matrix,
  # and the mean-of-matches form avoids the table()/diag() failure when all
  # predictions land on one side of the cut-off.
  ACC = vapply(
    pred2,
    function(x) mean((x > 0.5) == (test$spam == "1")),
    numeric(1)
  ),
  # The printed label stays "0 vs. 1": rbind() keeps the matrix's own row
  # name, so the `AUC =` tag does not appear in the output.
  AUC = colAUC(pred2, test$spam)
) %>% t
## ACC 0 vs. 1
## glm 0.95052 0.96275
## cart 0.93946 0.96318
## rf 0.97555 0.99777
See the table above.