Problem Defination
Determine whether a given email is spam or not, if its spam then status =1 or else 0

Approach We will use different versions of random forest model to check for the accuracy and will perform the prediction using the method with minimum error

  1. Random Forest Default
  2. Random Forest with 10 fold cross validation
  3. Random Forest (GBM)
  4. Random Forest (OOB)
  5. Random Forest Modified(w.r.t mtry or ntrees)

Load Libs

library(plyr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(gbm)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(corrgram)
## 
## Attaching package: 'corrgram'
## The following object is masked from 'package:plyr':
## 
##     baseball

Functions for detecting NA and to check for correlation

detectNA <- function(inp) {
  sum(is.na(inp))
}

detectCor <- function(x) {
  cor(as.numeric(dfrSpamMailData[, x]), 
    as.numeric(dfrSpamMailData$status), 
    method="spearman")
}

Load Dataset

dfrSpamMailData <- read.csv("./data/spambase.csv", header=T, stringsAsFactors=T)
head(dfrSpamMailData)
##   word_freq_make word_freq_address word_freq_all word_freq_3d
## 1           0.00              0.64          0.64            0
## 2           0.21              0.28          0.50            0
## 3           0.06              0.00          0.71            0
## 4           0.00              0.00          0.00            0
## 5           0.00              0.00          0.00            0
## 6           0.00              0.00          0.00            0
##   word_freq_our word_freq_over word_freq_remove word_freq_internet
## 1          0.32           0.00             0.00               0.00
## 2          0.14           0.28             0.21               0.07
## 3          1.23           0.19             0.19               0.12
## 4          0.63           0.00             0.31               0.63
## 5          0.63           0.00             0.31               0.63
## 6          1.85           0.00             0.00               1.85
##   word_freq_order word_freq_mail word_freq_receive word_freq_will
## 1            0.00           0.00              0.00           0.64
## 2            0.00           0.94              0.21           0.79
## 3            0.64           0.25              0.38           0.45
## 4            0.31           0.63              0.31           0.31
## 5            0.31           0.63              0.31           0.31
## 6            0.00           0.00              0.00           0.00
##   word_freq_people word_freq_report word_freq_addresses word_freq_free
## 1             0.00             0.00                0.00           0.32
## 2             0.65             0.21                0.14           0.14
## 3             0.12             0.00                1.75           0.06
## 4             0.31             0.00                0.00           0.31
## 5             0.31             0.00                0.00           0.31
## 6             0.00             0.00                0.00           0.00
##   word_freq_business word_freq_email word_freq_you word_freq_credit
## 1               0.00            1.29          1.93             0.00
## 2               0.07            0.28          3.47             0.00
## 3               0.06            1.03          1.36             0.32
## 4               0.00            0.00          3.18             0.00
## 5               0.00            0.00          3.18             0.00
## 6               0.00            0.00          0.00             0.00
##   word_freq_your word_freq_font word_freq_000 word_freq_money word_freq_hp
## 1           0.96              0          0.00            0.00            0
## 2           1.59              0          0.43            0.43            0
## 3           0.51              0          1.16            0.06            0
## 4           0.31              0          0.00            0.00            0
## 5           0.31              0          0.00            0.00            0
## 6           0.00              0          0.00            0.00            0
##   word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 1             0                0             0             0
## 2             0                0             0             0
## 3             0                0             0             0
## 4             0                0             0             0
## 5             0                0             0             0
## 6             0                0             0             0
##   word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 1              0                0             0              0
## 2              0                0             0              0
## 3              0                0             0              0
## 4              0                0             0              0
## 5              0                0             0              0
## 6              0                0             0              0
##   word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 1             0            0                    0           0.00
## 2             0            0                    0           0.07
## 3             0            0                    0           0.00
## 4             0            0                    0           0.00
## 5             0            0                    0           0.00
## 6             0            0                    0           0.00
##   word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 1               0            0             0.00            0
## 2               0            0             0.00            0
## 3               0            0             0.06            0
## 4               0            0             0.00            0
## 5               0            0             0.00            0
## 6               0            0             0.00            0
##   word_freq_meeting word_freq_original word_freq_project word_freq_re
## 1                 0               0.00                 0         0.00
## 2                 0               0.00                 0         0.00
## 3                 0               0.12                 0         0.06
## 4                 0               0.00                 0         0.00
## 5                 0               0.00                 0         0.00
## 6                 0               0.00                 0         0.00
##   word_freq_edu word_freq_table word_freq_conference char_freq_.
## 1          0.00               0                    0        0.00
## 2          0.00               0                    0        0.00
## 3          0.06               0                    0        0.01
## 4          0.00               0                    0        0.00
## 5          0.00               0                    0        0.00
## 6          0.00               0                    0        0.00
##   char_freq_..1 char_freq_..2 char_freq_..3 char_freq_..4 char_freq_..5
## 1         0.000             0         0.778         0.000         0.000
## 2         0.132             0         0.372         0.180         0.048
## 3         0.143             0         0.276         0.184         0.010
## 4         0.137             0         0.137         0.000         0.000
## 5         0.135             0         0.135         0.000         0.000
## 6         0.223             0         0.000         0.000         0.000
##   capital_run_length_average capital_run_length_longest
## 1                      3.756                         61
## 2                      5.114                        101
## 3                      9.821                        485
## 4                      3.537                         40
## 5                      3.537                         40
## 6                      3.000                         15
##   capital_run_length_total status
## 1                      278      1
## 2                     1028      1
## 3                     2259      1
## 4                      191      1
## 5                      191      1
## 6                       54      1

Dataframe Stucture

str(dfrSpamMailData)
## 'data.frame':    4601 obs. of  58 variables:
##  $ word_freq_make            : num  0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
##  $ word_freq_address         : num  0.64 0.28 0 0 0 0 0 0 0 0.12 ...
##  $ word_freq_all             : num  0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
##  $ word_freq_3d              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_our             : num  0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
##  $ word_freq_over            : num  0 0.28 0.19 0 0 0 0 0 0 0.32 ...
##  $ word_freq_remove          : num  0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
##  $ word_freq_internet        : num  0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
##  $ word_freq_order           : num  0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
##  $ word_freq_mail            : num  0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
##  $ word_freq_receive         : num  0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
##  $ word_freq_will            : num  0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
##  $ word_freq_people          : num  0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
##  $ word_freq_report          : num  0 0.21 0 0 0 0 0 0 0 0 ...
##  $ word_freq_addresses       : num  0 0.14 1.75 0 0 0 0 0 0 0.12 ...
##  $ word_freq_free            : num  0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
##  $ word_freq_business        : num  0 0.07 0.06 0 0 0 0 0 0 0 ...
##  $ word_freq_email           : num  1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
##  $ word_freq_you             : num  1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
##  $ word_freq_credit          : num  0 0 0.32 0 0 0 0 0 3.53 0.06 ...
##  $ word_freq_your            : num  0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
##  $ word_freq_font            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_000             : num  0 0.43 1.16 0 0 0 0 0 0 0.19 ...
##  $ word_freq_money           : num  0 0.43 0.06 0 0 0 0 0 0.15 0 ...
##  $ word_freq_hp              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_hpl             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_george          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_650             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_lab             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_labs            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_telnet          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_857             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_data            : num  0 0 0 0 0 0 0 0 0.15 0 ...
##  $ word_freq_415             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_85              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_technology      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_1999            : num  0 0.07 0 0 0 0 0 0 0 0 ...
##  $ word_freq_parts           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_pm              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_direct          : num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ word_freq_cs              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_meeting         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_original        : num  0 0 0.12 0 0 0 0 0 0.3 0 ...
##  $ word_freq_project         : num  0 0 0 0 0 0 0 0 0 0.06 ...
##  $ word_freq_re              : num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ word_freq_edu             : num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ word_freq_table           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ word_freq_conference      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ char_freq_.               : num  0 0 0.01 0 0 0 0 0 0 0.04 ...
##  $ char_freq_..1             : num  0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
##  $ char_freq_..2             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ char_freq_..3             : num  0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
##  $ char_freq_..4             : num  0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
##  $ char_freq_..5             : num  0 0.048 0.01 0 0 0 0 0 0.022 0 ...
##  $ capital_run_length_average: num  3.76 5.11 9.82 3.54 3.54 ...
##  $ capital_run_length_longest: int  61 101 485 40 40 15 4 11 445 43 ...
##  $ capital_run_length_total  : int  278 1028 2259 191 191 54 112 49 1257 749 ...
##  $ status                    : int  1 1 1 1 1 1 1 1 1 1 ...

Dataframe Summary

lapply(dfrSpamMailData, FUN=summary)
## $word_freq_make
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1046  0.0000  4.5400 
## 
## $word_freq_address
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.213   0.000  14.280 
## 
## $word_freq_all
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2807  0.4200  5.1000 
## 
## $word_freq_3d
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.06542  0.00000 42.81000 
## 
## $word_freq_our
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3122  0.3800 10.0000 
## 
## $word_freq_over
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0959  0.0000  5.8800 
## 
## $word_freq_remove
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1142  0.0000  7.2700 
## 
## $word_freq_internet
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1053  0.0000 11.1100 
## 
## $word_freq_order
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09007 0.00000 5.26000 
## 
## $word_freq_mail
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2394  0.1600 18.1800 
## 
## $word_freq_receive
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05982 0.00000 2.61000 
## 
## $word_freq_will
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.1000  0.5417  0.8000  9.6700 
## 
## $word_freq_people
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09393 0.00000 5.55000 
## 
## $word_freq_report
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.05863  0.00000 10.00000 
## 
## $word_freq_addresses
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0492  0.0000  4.4100 
## 
## $word_freq_free
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2488  0.1000 20.0000 
## 
## $word_freq_business
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1426  0.0000  7.1400 
## 
## $word_freq_email
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1847  0.0000  9.0900 
## 
## $word_freq_you
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.310   1.662   2.640  18.750 
## 
## $word_freq_credit
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.08558  0.00000 18.18000 
## 
## $word_freq_your
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.2200  0.8098  1.2700 11.1100 
## 
## $word_freq_font
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1212  0.0000 17.1000 
## 
## $word_freq_000
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1016  0.0000  5.4500 
## 
## $word_freq_money
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.09427  0.00000 12.50000 
## 
## $word_freq_hp
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5495  0.0000 20.8300 
## 
## $word_freq_hpl
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2654  0.0000 16.6600 
## 
## $word_freq_george
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.7673  0.0000 33.3300 
## 
## $word_freq_650
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1248  0.0000  9.0900 
## 
## $word_freq_lab
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.09892  0.00000 14.28000 
## 
## $word_freq_labs
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1029  0.0000  5.8800 
## 
## $word_freq_telnet
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.06475  0.00000 12.50000 
## 
## $word_freq_857
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.04705 0.00000 4.76000 
## 
## $word_freq_data
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.09723  0.00000 18.18000 
## 
## $word_freq_415
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.04784 0.00000 4.76000 
## 
## $word_freq_85
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1054  0.0000 20.0000 
## 
## $word_freq_technology
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09748 0.00000 7.69000 
## 
## $word_freq_1999
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.137   0.000   6.890 
## 
## $word_freq_parts
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0132  0.0000  8.3300 
## 
## $word_freq_pm
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.07863  0.00000 11.11000 
## 
## $word_freq_direct
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06483 0.00000 4.76000 
## 
## $word_freq_cs
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.04367 0.00000 7.14000 
## 
## $word_freq_meeting
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1323  0.0000 14.2800 
## 
## $word_freq_original
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0461  0.0000  3.5700 
## 
## $word_freq_project
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0792  0.0000 20.0000 
## 
## $word_freq_re
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3012  0.1100 21.4200 
## 
## $word_freq_edu
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1798  0.0000 22.0500 
## 
## $word_freq_table
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.005444 0.000000 2.170000 
## 
## $word_freq_conference
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.03187  0.00000 10.00000 
## 
## $char_freq_.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03857 0.00000 4.38500 
## 
## $char_freq_..1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.065   0.139   0.188   9.752 
## 
## $char_freq_..2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.01698 0.00000 4.08100 
## 
## $char_freq_..3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2691  0.3150 32.4800 
## 
## $char_freq_..4
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.07581 0.05200 6.00300 
## 
## $char_freq_..5
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.04424  0.00000 19.83000 
## 
## $capital_run_length_average
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    1.000    1.588    2.276    5.192    3.706 1102.000 
## 
## $capital_run_length_longest
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    6.00   15.00   52.17   43.00 9989.00 
## 
## $capital_run_length_total
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    35.0    95.0   283.3   266.0 15840.0 
## 
## $status
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.394   1.000   1.000

Missing Data

lapply(dfrSpamMailData, FUN=detectNA)
## $word_freq_make
## [1] 0
## 
## $word_freq_address
## [1] 0
## 
## $word_freq_all
## [1] 0
## 
## $word_freq_3d
## [1] 0
## 
## $word_freq_our
## [1] 0
## 
## $word_freq_over
## [1] 0
## 
## $word_freq_remove
## [1] 0
## 
## $word_freq_internet
## [1] 0
## 
## $word_freq_order
## [1] 0
## 
## $word_freq_mail
## [1] 0
## 
## $word_freq_receive
## [1] 0
## 
## $word_freq_will
## [1] 0
## 
## $word_freq_people
## [1] 0
## 
## $word_freq_report
## [1] 0
## 
## $word_freq_addresses
## [1] 0
## 
## $word_freq_free
## [1] 0
## 
## $word_freq_business
## [1] 0
## 
## $word_freq_email
## [1] 0
## 
## $word_freq_you
## [1] 0
## 
## $word_freq_credit
## [1] 0
## 
## $word_freq_your
## [1] 0
## 
## $word_freq_font
## [1] 0
## 
## $word_freq_000
## [1] 0
## 
## $word_freq_money
## [1] 0
## 
## $word_freq_hp
## [1] 0
## 
## $word_freq_hpl
## [1] 0
## 
## $word_freq_george
## [1] 0
## 
## $word_freq_650
## [1] 0
## 
## $word_freq_lab
## [1] 0
## 
## $word_freq_labs
## [1] 0
## 
## $word_freq_telnet
## [1] 0
## 
## $word_freq_857
## [1] 0
## 
## $word_freq_data
## [1] 0
## 
## $word_freq_415
## [1] 0
## 
## $word_freq_85
## [1] 0
## 
## $word_freq_technology
## [1] 0
## 
## $word_freq_1999
## [1] 0
## 
## $word_freq_parts
## [1] 0
## 
## $word_freq_pm
## [1] 0
## 
## $word_freq_direct
## [1] 0
## 
## $word_freq_cs
## [1] 0
## 
## $word_freq_meeting
## [1] 0
## 
## $word_freq_original
## [1] 0
## 
## $word_freq_project
## [1] 0
## 
## $word_freq_re
## [1] 0
## 
## $word_freq_edu
## [1] 0
## 
## $word_freq_table
## [1] 0
## 
## $word_freq_conference
## [1] 0
## 
## $char_freq_.
## [1] 0
## 
## $char_freq_..1
## [1] 0
## 
## $char_freq_..2
## [1] 0
## 
## $char_freq_..3
## [1] 0
## 
## $char_freq_..4
## [1] 0
## 
## $char_freq_..5
## [1] 0
## 
## $capital_run_length_average
## [1] 0
## 
## $capital_run_length_longest
## [1] 0
## 
## $capital_run_length_total
## [1] 0
## 
## $status
## [1] 0

Observation
There are no missing values

Check output

dfrSpamMailFreq <- summarise(group_by(dfrSpamMailData, status), count=n())
# boxplot of mpg by car cylinders
ggplot(dfrSpamMailFreq, aes(x=status, y=count)) +
    geom_bar(stat="identity", aes(fill=count)) +
    labs(title="Status Frequency Distribution") +
    labs(x="Status") +
    labs(y="Counts")

Find Corelations

## find correlations
vcnCorsData <- abs(sapply(colnames(dfrSpamMailData), detectCor))
summary(vcnCorsData)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.002525 0.148800 0.253300 0.276900 0.354700 1.000000

Observation
No Multi Collinearity exists.

Show Corelations

vcnCorsData
##             word_freq_make          word_freq_address 
##                 0.24069974                 0.29750940 
##              word_freq_all               word_freq_3d 
##                 0.33283147                 0.09077776 
##              word_freq_our             word_freq_over 
##                 0.40913946                 0.31864550 
##           word_freq_remove         word_freq_internet 
##                 0.51877779                 0.34379623 
##            word_freq_order             word_freq_mail 
##                 0.30073703                 0.29682394 
##          word_freq_receive             word_freq_will 
##                 0.35496682                 0.14847653 
##           word_freq_people           word_freq_report 
##                 0.21287588                 0.14977533 
##        word_freq_addresses             word_freq_free 
##                 0.26515743                 0.50416922 
##         word_freq_business            word_freq_email 
##                 0.35290749                 0.29909391 
##              word_freq_you           word_freq_credit 
##                 0.36110406                 0.32418657 
##             word_freq_your             word_freq_font 
##                 0.50159062                 0.13797471 
##              word_freq_000            word_freq_money 
##                 0.42580256                 0.47215455 
##               word_freq_hp              word_freq_hpl 
##                 0.39981558                 0.34188069 
##           word_freq_george              word_freq_650 
##                 0.35393063                 0.22619064 
##              word_freq_lab             word_freq_labs 
##                 0.22068802                 0.24580530 
##           word_freq_telnet              word_freq_857 
##                 0.20467400                 0.16983798 
##             word_freq_data              word_freq_415 
##                 0.15756347                 0.15802818 
##               word_freq_85       word_freq_technology 
##                 0.21413087                 0.16680254 
##             word_freq_1999            word_freq_parts 
##                 0.26070752                 0.00252536 
##               word_freq_pm           word_freq_direct 
##                 0.14721389                 0.02813193 
##               word_freq_cs          word_freq_meeting 
##                 0.14453750                 0.19574176 
##         word_freq_original          word_freq_project 
##                 0.10781412                 0.14453744 
##               word_freq_re              word_freq_edu 
##                 0.07176763                 0.19702549 
##            word_freq_table       word_freq_conference 
##                 0.02266674                 0.13903044 
##                char_freq_.              char_freq_..1 
##                 0.05683530                 0.03263555 
##              char_freq_..2              char_freq_..3 
##                 0.11122690                 0.59785363 
##              char_freq_..4              char_freq_..5 
##                 0.56563314                 0.26668614 
## capital_run_length_average capital_run_length_longest 
##                 0.48794983                 0.51515693 
##   capital_run_length_total                     status 
##                 0.44397367                 1.00000000

Plot Corelations

corrgram(dfrSpamMailData)

High Corelations

vcnCorsData[vcnCorsData>0.8]
## status 
##      1

Creating the dependent variable as a facor

class(dfrSpamMailData$status)
## [1] "integer"
dfrSpamMailData$status <- as.factor(dfrSpamMailData$status)
class(dfrSpamMailData$status)
## [1] "factor"
table(dfrSpamMailData$status)
## 
##    0    1 
## 2788 1813

Dataset Split

set.seed(707)
vctTrnRecs <- createDataPartition(y=dfrSpamMailData$status, p=0.8, list=FALSE)
dfrTrnData <- dfrSpamMailData[vctTrnRecs,]
dfrTstData <- dfrSpamMailData[-vctTrnRecs,]

Training Dataset RowCount & ColCount

dim(dfrTrnData)
## [1] 3682   58

Testing Dataset Head

head(dfrTstData)
##    word_freq_make word_freq_address word_freq_all word_freq_3d
## 3            0.06              0.00          0.71            0
## 4            0.00              0.00          0.00            0
## 5            0.00              0.00          0.00            0
## 13           0.00              0.69          0.34            0
## 14           0.00              0.00          0.00            0
## 23           0.00              0.00          0.00            0
##    word_freq_our word_freq_over word_freq_remove word_freq_internet
## 3           1.23           0.19             0.19               0.12
## 4           0.63           0.00             0.31               0.63
## 5           0.63           0.00             0.31               0.63
## 13          0.34           0.00             0.00               0.00
## 14          0.90           0.00             0.90               0.00
## 23          2.94           0.00             0.00               0.00
##    word_freq_order word_freq_mail word_freq_receive word_freq_will
## 3             0.64           0.25              0.38           0.45
## 4             0.31           0.63              0.31           0.31
## 5             0.31           0.63              0.31           0.31
## 13            0.00           0.00              0.00           0.69
## 14            0.00           0.90              0.90           0.00
## 23            0.00           0.00              0.00           0.00
##    word_freq_people word_freq_report word_freq_addresses word_freq_free
## 3              0.12                0                1.75           0.06
## 4              0.31                0                0.00           0.31
## 5              0.31                0                0.00           0.31
## 13             0.00                0                0.00           0.34
## 14             0.90                0                0.00           0.00
## 23             0.00                0                0.00           2.94
##    word_freq_business word_freq_email word_freq_you word_freq_credit
## 3                0.06            1.03          1.36             0.32
## 4                0.00            0.00          3.18             0.00
## 5                0.00            0.00          3.18             0.00
## 13               0.00            1.39          2.09             0.00
## 14               0.00            0.00          2.72             0.00
## 23               0.00            0.00          0.00             0.00
##    word_freq_your word_freq_font word_freq_000 word_freq_money
## 3            0.51              0          1.16            0.06
## 4            0.31              0          0.00            0.00
## 5            0.31              0          0.00            0.00
## 13           1.04              0          0.00            0.00
## 14           0.90              0          0.00            0.00
## 23           0.00              0          0.00            0.00
##    word_freq_hp word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 3             0             0                0             0             0
## 4             0             0                0             0             0
## 5             0             0                0             0             0
## 13            0             0                0             0             0
## 14            0             0                0             0             0
## 23            0             0                0             0             0
##    word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 3               0                0             0              0
## 4               0                0             0              0
## 5               0                0             0              0
## 13              0                0             0              0
## 14              0                0             0              0
## 23              0                0             0              0
##    word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 3              0            0                    0              0
## 4              0            0                    0              0
## 5              0            0                    0              0
## 13             0            0                    0              0
## 14             0            0                    0              0
## 23             0            0                    0              0
##    word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 3                0            0             0.06            0
## 4                0            0             0.00            0
## 5                0            0             0.00            0
## 13               0            0             0.00            0
## 14               0            0             0.00            0
## 23               0            0             0.00            0
##    word_freq_meeting word_freq_original word_freq_project word_freq_re
## 3                  0               0.12                 0         0.06
## 4                  0               0.00                 0         0.00
## 5                  0               0.00                 0         0.00
## 13                 0               0.00                 0         0.00
## 14                 0               0.00                 0         0.00
## 23                 0               0.00                 0         0.00
##    word_freq_edu word_freq_table word_freq_conference char_freq_.
## 3           0.06               0                    0       0.010
## 4           0.00               0                    0       0.000
## 5           0.00               0                    0       0.000
## 13          0.00               0                    0       0.000
## 14          0.00               0                    0       0.000
## 23          0.00               0                    0       0.404
##    char_freq_..1 char_freq_..2 char_freq_..3 char_freq_..4 char_freq_..5
## 3          0.143             0         0.276         0.184          0.01
## 4          0.137             0         0.137         0.000          0.00
## 5          0.135             0         0.135         0.000          0.00
## 13         0.056             0         0.786         0.000          0.00
## 14         0.000             0         0.000         0.000          0.00
## 23         0.404             0         0.809         0.000          0.00
##    capital_run_length_average capital_run_length_longest
## 3                       9.821                        485
## 4                       3.537                         40
## 5                       3.537                         40
## 13                      3.728                         61
## 14                      2.083                          7
## 23                      4.857                         12
##    capital_run_length_total status
## 3                      2259      1
## 4                       191      1
## 5                       191      1
## 13                      261      1
## 14                       25      1
## 23                       34      1

Training Dataset Summary

lapply(dfrTrnData, FUN=summary)
## $word_freq_make
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1068  0.0000  4.3400 
## 
## $word_freq_address
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2149  0.0000 14.2800 
## 
## $word_freq_all
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2828  0.4200  5.1000 
## 
## $word_freq_3d
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.06731  0.00000 42.81000 
## 
## $word_freq_our
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3125  0.3800 10.0000 
## 
## $word_freq_over
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09717 0.00000 5.88000 
## 
## $word_freq_remove
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1178  0.0000  7.2700 
## 
## $word_freq_internet
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1048  0.0000  6.0600 
## 
## $word_freq_order
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09014 0.00000 3.33000 
## 
## $word_freq_mail
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.229   0.140  11.110 
## 
## $word_freq_receive
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06156 0.00000 2.61000 
## 
## $word_freq_will
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.1200  0.5519  0.8000  9.6700 
## 
## $word_freq_people
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09498 0.00000 5.55000 
## 
## $word_freq_report
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.06067  0.00000 10.00000 
## 
## $word_freq_addresses
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.04662 0.00000 4.41000 
## 
## $word_freq_free
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2533  0.1000 20.0000 
## 
## $word_freq_business
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1436  0.0000  7.1400 
## 
## $word_freq_email
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1788  0.0000  6.6600 
## 
## $word_freq_you
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.300   1.671   2.650  18.750 
## 
## $word_freq_credit
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.08922  0.00000 18.18000 
## 
## $word_freq_your
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.1900  0.8071  1.2800 10.7100 
## 
## $word_freq_font
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1231  0.0000 17.1000 
## 
## $word_freq_000
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1018  0.0000  5.4500 
## 
## $word_freq_money
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.09628  0.00000 12.50000 
## 
## $word_freq_hp
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5557  0.0000 20.8300 
## 
## $word_freq_hpl
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2683  0.0000 16.6600 
## 
## $word_freq_george
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.7617  0.0000 33.3300 
## 
## $word_freq_650
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1215  0.0000  5.8800 
## 
## $word_freq_lab
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1063  0.0000 14.2800 
## 
## $word_freq_labs
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1066  0.0000  5.8800 
## 
## $word_freq_telnet
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06621 0.00000 4.76000 
## 
## $word_freq_857
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0508  0.0000  4.7600 
## 
## $word_freq_data
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.09997  0.00000 18.18000 
## 
## $word_freq_415
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05179 0.00000 4.76000 
## 
## $word_freq_85
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1079  0.0000 20.0000 
## 
## $word_freq_technology
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1011  0.0000  7.6900 
## 
## $word_freq_1999
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.136   0.000   6.890 
## 
## $word_freq_parts
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.01449 0.00000 8.33000 
## 
## $word_freq_pm
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.07446 0.00000 9.75000 
## 
## $word_freq_direct
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06766 0.00000 4.76000 
## 
## $word_freq_cs
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.04687 0.00000 7.14000 
## 
## $word_freq_meeting
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.144   0.000  14.280 
## 
## $word_freq_original
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.04646 0.00000 3.57000 
## 
## $word_freq_project
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0761  0.0000 20.0000 
## 
## $word_freq_re
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2919  0.1000 20.0000 
## 
## $word_freq_edu
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1773  0.0000 16.7000 
## 
## $word_freq_table
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.005185 0.000000 2.120000 
## 
## $word_freq_conference
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0332  0.0000  8.3300 
## 
## $char_freq_.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.04055 0.00000 4.38500 
## 
## $char_freq_..1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0640  0.1375  0.1887  5.2770 
## 
## $char_freq_..2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.01619 0.00000 2.77700 
## 
## $char_freq_..3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2719  0.3290 32.4800 
## 
## $char_freq_..4
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.07571 0.05300 6.00300 
## 
## $char_freq_..5
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.04609  0.00000 19.83000 
## 
## $capital_run_length_average
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    1.000    1.600    2.272    5.138    3.707 1102.000 
## 
## $capital_run_length_longest
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    6.00   14.50   50.16   43.00 2204.00 
## 
## $capital_run_length_total
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00    34.25    95.00   285.10   269.50 15840.00 
## 
## $status
##    0    1 
## 2231 1451

Testing Dataset Summary

lapply(dfrTstData, FUN=summary)
## $word_freq_make
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09565 0.00000 4.54000 
## 
## $word_freq_address
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2056  0.0000 14.2800 
## 
## $word_freq_all
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2722  0.4050  4.5400 
## 
## $word_freq_3d
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.05789  0.00000 42.73000 
## 
## $word_freq_our
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3111  0.4100  7.6900 
## 
## $word_freq_over
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09081 0.00000 1.86000 
## 
## $word_freq_remove
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09997 0.00000 7.27000 
## 
## $word_freq_internet
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1075  0.0000 11.1100 
## 
## $word_freq_order
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.08978 0.00000 5.26000 
## 
## $word_freq_mail
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.281   0.220  18.180 
## 
## $word_freq_receive
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05285 0.00000 2.00000 
## 
## $word_freq_will
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.501   0.780   5.000 
## 
## $word_freq_people
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.08971 0.00000 2.63000 
## 
## $word_freq_report
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05045 0.00000 2.32000 
## 
## $word_freq_addresses
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05955 0.00000 2.38000 
## 
## $word_freq_free
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2309  0.1250  6.4500 
## 
## $word_freq_business
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1386  0.0000  4.8700 
## 
## $word_freq_email
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2084  0.0000  9.0900 
## 
## $word_freq_you
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.310   1.628   2.590  10.630 
## 
## $word_freq_credit
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.07098 0.00000 5.19000 
## 
## $word_freq_your
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.3100  0.8203  1.2500 11.1100 
## 
## $word_freq_font
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1135  0.0000 15.4300 
## 
## $word_freq_000
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1012  0.0000  3.3800 
## 
## $word_freq_money
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.08619 0.00000 9.09000 
## 
## $word_freq_hp
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.5247  0.0000 16.6600 
## 
## $word_freq_hpl
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2536  0.0000  8.0000 
## 
## $word_freq_george
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.7897  0.0000 33.3300 
## 
## $word_freq_650
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1383  0.0000  9.0900 
## 
## $word_freq_lab
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.06946 0.00000 9.09000 
## 
## $word_freq_labs
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.08775 0.00000 4.34000 
## 
## $word_freq_telnet
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.05892  0.00000 12.50000 
## 
## $word_freq_857
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.032   0.000   4.700 
## 
## $word_freq_data
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.08626 0.00000 4.76000 
## 
## $word_freq_415
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.032   0.000   4.700 
## 
## $word_freq_85
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09528 0.00000 4.65000 
## 
## $word_freq_technology
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.08283 0.00000 4.16000 
## 
## $word_freq_1999
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1408  0.0000  4.5400 
## 
## $word_freq_parts
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.008041 0.000000 4.000000 
## 
## $word_freq_pm
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.09532  0.00000 11.11000 
## 
## $word_freq_direct
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05349 0.00000 4.16000 
## 
## $word_freq_cs
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03082 0.00000 7.14000 
## 
## $word_freq_meeting
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.08565 0.00000 9.09000 
## 
## $word_freq_original
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.04465 0.00000 1.75000 
## 
## $word_freq_project
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.09159  0.00000 16.66000 
## 
## $word_freq_re
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3385  0.1350 21.4200 
## 
## $word_freq_edu
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1898  0.0000 22.0500 
## 
## $word_freq_table
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.006485 0.000000 2.170000 
## 
## $word_freq_conference
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.02653  0.00000 10.00000 
## 
## $char_freq_.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03068 0.00000 3.83800 
## 
## $char_freq_..1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0670  0.1450  0.1845  9.7520 
## 
## $char_freq_..2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.02011 0.00000 4.08100 
## 
## $char_freq_..3
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2579  0.2660 19.1300 
## 
## $char_freq_..4
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.07621 0.04650 3.30500 
## 
## $char_freq_..5
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0368  0.0000  7.4070 
## 
## $capital_run_length_average
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    1.000    1.558    2.309    5.406    3.704 1022.000 
## 
## $capital_run_length_longest
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    6.50   15.00   60.22   41.50 9989.00 
## 
## $capital_run_length_total
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    37.0    94.0   275.9   260.0 10060.0 
## 
## $status
##   0   1 
## 557 362

Create Model - Random Forest (Default)

## set seed
set.seed(707)
# mtry
myMtry=sqrt(ncol(dfrTrnData)-1)
myNtrees=500
# start time
vctProcStrt <- proc.time()
# random forest (default)
#mdlRndForDef <- randomForest(taste~.-quality, data=dfrTrnData)
mdlRndForDef <- randomForest(status~., data=dfrTrnData, 
                             mtry=myMtry, ntree=myNtrees)
# end time
vctProcEnds <- proc.time()
# It gives vectors of three times
# print
print(paste("Model Created ...",vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 12.83"

View Model - Default Random Forest

mdlRndForDef
## 
## Call:
##  randomForest(formula = status ~ ., data = dfrTrnData, mtry = myMtry,      ntree = myNtrees) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 4.89%
## Confusion matrix:
##      0    1 class.error
## 0 2161   70  0.03137606
## 1  110 1341  0.07580979

Observation
The default model has 4.89% error

View Model Summary - Default Random Forest

summary(mdlRndForDef)
##                 Length Class  Mode     
## call               5   -none- call     
## type               1   -none- character
## predicted       3682   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           7364   matrix numeric  
## oob.times       3682   -none- numeric  
## classes            2   -none- character
## importance        57   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y               3682   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

Create Model - Random Forest (RFM)

## set seed
set.seed(707)
# start time
vctProcStrt <- proc.time()
# random forest (default)
myControl <- trainControl(method="cv", number=10, repeats=3)
myMetric <- "Accuracy"
myMtry <- sqrt(ncol(dfrTrnData)-1)
myNtrees <- 500
myTuneGrid <-  expand.grid(.mtry=myMtry)
mdlRndForRfm <- train(status~., data=dfrTrnData, method="rf",
                        verbose=F, metric=myMetric, trControl=myControl,
                        tuneGrid=myTuneGrid, ntree=myNtrees)
# end time
vctProcEnds <- proc.time()
# print
print(paste("Model Created ...",vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 120.08"

View Model - Random Forest (RFM)

mdlRndForRfm
## Random Forest 
## 
## 3682 samples
##   57 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 3313, 3314, 3314, 3314, 3314, 3314, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9513837  0.8976903
## 
## Tuning parameter 'mtry' was held constant at a value of 7.549834

Observation
The random forest with 10 fold cross validation model has 4.89% error

Create Model - Random Forest (GBM)

## set seed
set.seed(707)
# start time
vctProcStrt <- proc.time()
# random forest (default)
myControl <- trainControl(method="repeatedcv", number=10, repeats=3)
myMetric <- "Accuracy"
myMtry <- sqrt(ncol(dfrTrnData)-1)
#myNtrees <- 500
myTuneGrid <- expand.grid(.mtry=myMtry)
mdlRndForGbm <- train(status~., data=dfrTrnData, method="gbm",
                 verbose=F, metric=myMetric, trControl=myControl)
#                        tuneGrid=myTuneGrid, ntree=myNtrees)
# end time
vctProcEnds <- proc.time()
# print
print(paste("Model Created ...",vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 127.28"

View Model - Random Forest (GBM)

mdlRndForGbm
## Stochastic Gradient Boosting 
## 
## 3682 samples
##   57 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 3313, 3314, 3314, 3314, 3314, 3314, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.9166200  0.8217745
##   1                  100      0.9317404  0.8553362
##   1                  150      0.9391638  0.8713401
##   2                   50      0.9316474  0.8552624
##   2                  100      0.9409724  0.8753850
##   2                  150      0.9464058  0.8869276
##   3                   50      0.9370819  0.8670327
##   3                  100      0.9442331  0.8823408
##   3                  150      0.9472207  0.8888336
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.

Observation
The random forest(GBM) with 10 fold repeated cross validation model has 5.3% error

Create Model - Random Forest (OOB)

## set seed
set.seed(707)
# start time
vctProcStrt <- proc.time()
# random forest (default)
myControl <- trainControl(method="oob", number=10, repeats=3)
myMetric <- "Accuracy"
myMtry <- sqrt(ncol(dfrTrnData)-1)
myNtrees <- 500
myTuneGrid <- expand.grid(.mtry=myMtry)
mdlRndForOob <- train(status~., data=dfrTrnData, 
                    verbose=F, metric=myMetric, trControl=myControl, 
                    tuneGrid=myTuneGrid, ntree=myNtrees)
# end time
vctProcEnds <- proc.time()
# print
print(paste("Model Created ...",vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 24.56"

View Model - Random Formbest (OOB)

mdlRndForOob
## Random Forest 
## 
## 3682 samples
##   57 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9508419  0.8964783
## 
## Tuning parameter 'mtry' was held constant at a value of 7.549834

Observation
The random forest(OOB) model has 4.92% error

Create Model - Random Forest (Mod1)

## set seed
set.seed(707)
# mtry
myMtry=6.5
myNtrees=500
# start time
vctProcStrt <- proc.time()
# random forest (default)
#mdlRndForDef <- randomForest(taste~.-quality, data=dfrTrnData)
mdlRndForMod1 <- randomForest(status~., data=dfrTrnData, 
                             mtry=myMtry, ntree=myNtrees)
# end time
vctProcEnds <- proc.time()
# It gives vectors of three times
# print
print(paste("Model Created ...",vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 11.97"

View Model - Mod1 Random Forest

mdlRndForMod1
## 
## Call:
##  randomForest(formula = status ~ ., data = dfrTrnData, mtry = myMtry,      ntree = myNtrees) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 4.48%
## Confusion matrix:
##      0    1 class.error
## 0 2174   57  0.02554908
## 1  108 1343  0.07443143

Observation
The random forest(Mod1) model has 4.48% error hence we will use this model for classification and prediction.

Prediction - Test Data - Random Forest (Default)

vctRndForDef <- predict(mdlRndForMod1, newdata=dfrTstData)
cmxRndForDef  <- confusionMatrix(vctRndForDef, dfrTstData$status)
cmxRndForDef
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 539  29
##          1  18 333
##                                           
##                Accuracy : 0.9489          
##                  95% CI : (0.9326, 0.9622)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8923          
##  Mcnemar's Test P-Value : 0.1447          
##                                           
##             Sensitivity : 0.9677          
##             Specificity : 0.9199          
##          Pos Pred Value : 0.9489          
##          Neg Pred Value : 0.9487          
##              Prevalence : 0.6061          
##          Detection Rate : 0.5865          
##    Detection Prevalence : 0.6181          
##       Balanced Accuracy : 0.9438          
##                                           
##        'Positive' Class : 0               
## 

Observation
The prediction was done with 94.99% accuracy.

End Of Report