# Data splitting ----
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(kernlab)
# Load the spam data set into the workspace.
data(spam)

# Seed the RNG before partitioning: createDataPartition() samples rows at
# random, so without a seed the train/test split changes on every run.
set.seed(32343)

# Row indices for a 75% training sample drawn on the outcome column `type`;
# list = FALSE returns the indices as a matrix instead of a list.
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]  # rows selected for training (75%)
testing <- spam[-inTrain, ]  # remaining rows held out for testing (25%)
dim(training)
## [1] 3451 58
# Fit a model ----
library(e1071)  # NOTE(review): presumably needed by caret at fit time; confirm

# Fix the RNG state immediately before the resampled fit.
set.seed(32343)

# GLM (method = "glm") of `type` on every other column of the training set.
modelFit <- train(type ~ ., data = training, method = "glm")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Show the resampling summary of the fitted caret model.
print(modelFit)
## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
##
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.921164 0.8339177 0.00601738 0.01288323
##
##
# Coefficient table and deviance statistics of the fitted model.
print(summary(modelFit))
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.9566 -0.1988 0.0000 0.1242 4.9092
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.579e+00 1.619e-01 -9.752 < 2e-16 ***
## make -3.598e-01 2.671e-01 -1.347 0.177926
## address -1.257e-01 7.184e-02 -1.750 0.080188 .
## all 9.836e-02 1.255e-01 0.784 0.433080
## num3d 2.280e+00 1.431e+00 1.594 0.111042
## our 6.673e-01 1.286e-01 5.188 2.13e-07 ***
## over 1.027e+00 3.189e-01 3.220 0.001281 **
## remove 1.756e+00 3.186e-01 5.512 3.56e-08 ***
## internet 4.674e-01 1.817e-01 2.573 0.010088 *
## order 5.998e-01 3.163e-01 1.897 0.057881 .
## mail 1.460e-01 8.301e-02 1.758 0.078667 .
## receive -5.669e-01 3.307e-01 -1.714 0.086461 .
## will -8.716e-02 8.366e-02 -1.042 0.297504
## people -1.024e-01 2.559e-01 -0.400 0.689139
## report 1.806e-01 1.489e-01 1.213 0.225296
## addresses 9.648e-01 7.649e-01 1.261 0.207177
## free 1.026e+00 1.623e-01 6.324 2.55e-10 ***
## business 8.192e-01 2.470e-01 3.317 0.000910 ***
## email 1.014e-01 1.317e-01 0.770 0.441338
## you 8.846e-02 3.956e-02 2.236 0.025332 *
## credit 7.439e-01 4.795e-01 1.552 0.120762
## your 3.293e-01 6.558e-02 5.022 5.11e-07 ***
## font 2.349e-01 1.754e-01 1.339 0.180593
## num000 2.402e+00 5.530e-01 4.344 1.40e-05 ***
## money 3.278e-01 1.560e-01 2.101 0.035684 *
## hp -1.754e+00 3.173e-01 -5.528 3.23e-08 ***
## hpl -1.236e+00 5.110e-01 -2.419 0.015580 *
## george -9.891e+00 2.033e+00 -4.865 1.14e-06 ***
## num650 4.305e-01 1.882e-01 2.287 0.022192 *
## lab -4.176e+00 2.612e+00 -1.599 0.109904
## labs -7.129e-02 3.754e-01 -0.190 0.849378
## telnet -5.234e+00 2.560e+00 -2.044 0.040913 *
## num857 4.419e+00 3.487e+00 1.267 0.205131
## data -9.214e-01 3.886e-01 -2.371 0.017724 *
## num415 6.881e-01 1.565e+00 0.440 0.660091
## num85 -2.235e+00 8.335e-01 -2.681 0.007336 **
## technology 9.247e-01 3.659e-01 2.527 0.011506 *
## num1999 3.520e-02 1.966e-01 0.179 0.857875
## parts -5.715e-01 4.067e-01 -1.405 0.160012
## pm -8.840e-01 4.172e-01 -2.119 0.034105 *
## direct 1.324e+00 1.220e+00 1.085 0.277884
## cs -4.129e+01 3.304e+01 -1.250 0.211441
## meeting -2.360e+00 8.576e-01 -2.752 0.005923 **
## original -1.018e+00 8.440e-01 -1.206 0.227827
## project -1.208e+00 5.303e-01 -2.278 0.022741 *
## re -7.891e-01 1.635e-01 -4.825 1.40e-06 ***
## edu -1.634e+00 3.342e-01 -4.890 1.01e-06 ***
## table -1.571e+00 2.264e+00 -0.694 0.487763
## conference -4.427e+00 2.002e+00 -2.212 0.026995 *
## charSemicolon -1.387e+00 5.058e-01 -2.741 0.006117 **
## charRoundbracket -5.144e-01 3.932e-01 -1.308 0.190750
## charSquarebracket -1.533e+00 1.797e+00 -0.853 0.393829
## charExclamation 2.301e-01 6.131e-02 3.753 0.000175 ***
## charDollar 5.087e+00 7.571e-01 6.718 1.84e-11 ***
## charHash 2.649e+00 8.989e-01 2.947 0.003208 **
## capitalAve 6.771e-03 1.950e-02 0.347 0.728385
## capitalLong 7.335e-03 2.771e-03 2.647 0.008129 **
## capitalTotal 1.174e-03 2.568e-04 4.571 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4628.1 on 3450 degrees of freedom
## Residual deviance: 1366.4 on 3393 degrees of freedom
## AIC: 1482.4
##
## Number of Fisher Scoring iterations: 13
# Final model ----
# Re-seed before refitting: train() draws bootstrap resamples at random, so an
# unseeded call reports resampling metrics that differ from run to run.
set.seed(32343)
modelFit <- train(type ~ ., data = training, method = "glm")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Inspect the underlying fitted model object stored inside the train result.
modelFit[["finalModel"]]
##
## Call: NULL
##
## Coefficients:
## (Intercept) make address
## -1.579104 -0.359826 -0.125683
## all num3d our
## 0.098357 2.279740 0.667319
## over remove internet
## 1.026864 1.756243 0.467408
## order mail receive
## 0.599819 0.145971 -0.566864
## will people report
## -0.087158 -0.102352 0.180577
## addresses free business
## 0.964799 1.026374 0.819168
## email you credit
## 0.101436 0.088458 0.743928
## your font num000
## 0.329343 0.234893 2.402310
## money hp hpl
## 0.327769 -1.753944 -1.235949
## george num650 lab
## -9.891090 0.430453 -4.176326
## labs telnet num857
## -0.071294 -5.233978 4.418629
## data num415 num85
## -0.921409 0.688109 -2.234776
## technology num1999 parts
## 0.924736 0.035201 -0.571491
## pm direct cs
## -0.883972 1.323550 -41.289004
## meeting original project
## -2.360067 -1.017857 -1.207804
## re edu table
## -0.789071 -1.634148 -1.570612
## conference charSemicolon charRoundbracket
## -4.426681 -1.386503 -0.514393
## charSquarebracket charExclamation charDollar
## -1.532533 0.230127 5.086500
## charHash capitalAve capitalLong
## 2.648971 0.006771 0.007335
## capitalTotal
## 0.001174
##
## Degrees of Freedom: 3450 Total (i.e. Null); 3393 Residual
## Null Deviance: 4628
## Residual Deviance: 1366 AIC: 1482
# Prediction ----
# Classify the held-out rows with the fitted model, then print every label.
predictions <- predict(modelFit, newdata = testing)
print(predictions)
## [1] spam spam spam spam spam spam nonspam nonspam
## [9] nonspam spam nonspam spam spam spam spam nonspam
## [17] spam spam spam nonspam spam spam spam nonspam
## [25] nonspam spam spam spam spam spam spam spam
## [33] nonspam spam spam spam spam spam spam nonspam
## [41] spam spam spam spam spam spam spam spam
## [49] spam nonspam spam spam spam spam spam spam
## [57] spam spam spam spam spam spam spam spam
## [65] nonspam nonspam spam spam spam nonspam spam spam
## [73] nonspam nonspam nonspam spam spam spam spam spam
## [81] spam spam spam spam spam spam nonspam spam
## [89] spam spam spam nonspam spam spam spam nonspam
## [97] spam spam spam spam spam spam spam spam
## [105] spam spam spam spam spam spam nonspam spam
## [113] spam nonspam spam spam spam spam nonspam spam
## [121] spam spam nonspam spam spam spam spam spam
## [129] spam spam spam nonspam spam spam spam spam
## [137] spam spam spam spam spam spam spam spam
## [145] spam spam spam spam spam spam spam spam
## [153] spam spam nonspam nonspam spam spam spam spam
## [161] spam spam spam spam spam spam spam spam
## [169] spam spam nonspam spam spam spam spam spam
## [177] spam spam spam spam spam spam nonspam spam
## [185] spam spam spam spam spam spam spam spam
## [193] spam spam spam spam spam spam nonspam spam
## [201] spam spam spam spam spam spam spam spam
## [209] spam spam spam nonspam spam spam spam nonspam
## [217] nonspam spam spam spam spam spam spam spam
## [225] spam spam spam spam spam spam spam nonspam
## [233] spam spam spam spam spam nonspam spam spam
## [241] spam spam spam spam spam spam spam spam
## [249] spam spam spam spam spam spam spam spam
## [257] spam spam spam spam spam spam nonspam spam
## [265] spam spam spam spam spam spam spam spam
## [273] spam spam spam spam spam spam spam spam
## [281] spam spam spam spam spam spam spam spam
## [289] spam spam spam spam spam spam spam spam
## [297] spam spam spam spam nonspam spam spam spam
## [305] spam spam spam spam spam spam spam nonspam
## [313] spam spam nonspam spam spam spam spam spam
## [321] spam spam spam spam spam spam spam spam
## [329] nonspam spam spam spam spam spam nonspam spam
## [337] spam spam nonspam spam spam spam spam spam
## [345] spam spam spam spam spam nonspam spam spam
## [353] spam spam spam spam spam spam spam spam
## [361] spam spam spam spam spam spam nonspam spam
## [369] spam spam spam spam spam nonspam spam spam
## [377] spam spam spam spam spam spam spam spam
## [385] spam spam spam spam spam spam spam nonspam
## [393] spam spam spam spam spam spam spam spam
## [401] spam spam spam spam spam spam spam spam
## [409] spam spam spam spam nonspam nonspam spam spam
## [417] nonspam spam spam nonspam spam spam nonspam spam
## [425] nonspam spam spam nonspam spam spam spam nonspam
## [433] spam spam spam spam spam spam spam spam
## [441] spam spam nonspam spam spam spam spam spam
## [449] spam spam spam spam spam spam nonspam nonspam
## [457] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [465] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [473] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [481] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [489] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [497] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [505] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [513] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam
## [521] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [529] nonspam nonspam nonspam spam nonspam nonspam nonspam nonspam
## [537] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [545] nonspam nonspam nonspam nonspam spam nonspam nonspam nonspam
## [553] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [561] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [569] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam
## [577] nonspam nonspam spam nonspam nonspam nonspam nonspam nonspam
## [585] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [593] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [601] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [609] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [617] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [625] nonspam nonspam nonspam spam nonspam spam spam nonspam
## [633] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [641] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [649] nonspam nonspam spam nonspam nonspam nonspam nonspam spam
## [657] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam
## [665] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [673] nonspam nonspam nonspam nonspam nonspam spam nonspam nonspam
## [681] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [689] spam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [697] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [705] nonspam nonspam nonspam nonspam nonspam spam spam nonspam
## [713] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [721] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [729] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [737] nonspam nonspam nonspam nonspam nonspam nonspam spam nonspam
## [745] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [753] nonspam nonspam nonspam spam nonspam spam nonspam nonspam
## [761] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [769] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [777] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [785] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [793] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [801] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [809] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [817] nonspam spam spam nonspam nonspam nonspam spam spam
## [825] nonspam spam nonspam nonspam nonspam nonspam nonspam nonspam
## [833] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [841] nonspam nonspam nonspam spam nonspam nonspam nonspam nonspam
## [849] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [857] nonspam nonspam nonspam nonspam nonspam nonspam spam nonspam
## [865] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [873] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [881] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [889] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [897] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [905] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [913] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [921] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [929] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam
## [937] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [945] nonspam nonspam spam nonspam nonspam nonspam nonspam nonspam
## [953] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [961] nonspam nonspam spam nonspam nonspam nonspam nonspam nonspam
## [969] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [977] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [985] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [993] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1001] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1009] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1017] nonspam nonspam nonspam nonspam nonspam spam nonspam nonspam
## [1025] nonspam spam nonspam nonspam nonspam nonspam nonspam nonspam
## [1033] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1041] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1049] nonspam nonspam nonspam nonspam spam nonspam nonspam nonspam
## [1057] nonspam nonspam nonspam nonspam spam nonspam nonspam nonspam
## [1065] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1073] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1081] nonspam nonspam spam nonspam nonspam nonspam nonspam nonspam
## [1089] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1097] spam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1105] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1113] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1121] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1129] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1137] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1145] nonspam nonspam nonspam nonspam nonspam nonspam
## Levels: nonspam spam
# Confusion matrix: evaluate the final model ----
# Cross-tabulate predicted labels against the true labels of the test set.
# Note: the reported 'Positive' class is "nonspam", so sensitivity here
# measures how well non-spam is recognised, not spam.
confusionMatrix(data = predictions, reference = testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 662 55
## spam 35 398
##
## Accuracy : 0.9217
## 95% CI : (0.9047, 0.9366)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8348
## Mcnemar's Test P-Value : 0.0452
##
## Sensitivity : 0.9498
## Specificity : 0.8786
## Pos Pred Value : 0.9233
## Neg Pred Value : 0.9192
## Prevalence : 0.6061
## Detection Rate : 0.5757
## Detection Prevalence : 0.6235
## Balanced Accuracy : 0.9142
##
## 'Positive' Class : nonspam
##