# Switch the working directory to the scripts folder. This is a
# machine-specific absolute path; adjust it when running on another machine.
setwd("D:/R-BA/R-Scripts")
Introduction Random Forest
The steps to predict using Random Forest:
* Step 1
* Step 2
Problem Definition
Predict whether an email will be classified as spam or not, using Random Forest.
Load Libs
library(plyr)
library(tidyr)
library(dplyr)
library(ggplot2)
#install.packages("caret")
library(caret)
#install.packages("randomForest")
library(randomForest)
#install.packages("gbm")
library(gbm)
#install.packages("corrgram")
library(corrgram)
Functions
# Count the missing (NA) entries in `inp`.
#
# Args:
#   inp: Any object accepted by is.na() (e.g. a vector or data-frame column).
#
# Returns:
#   The number of elements for which is.na() is TRUE.
detectNA <- function(inp) {
  naFlags <- is.na(inp)
  sum(naFlags)
}
# Spearman rank correlation between one column and a target column.
#
# Args:
#   x:      Name (or index) of the column to correlate with the target.
#   data:   Data frame holding both columns. Defaults to the global
#           `dfrDataset`, so existing one-argument calls behave as before.
#   target: Name of the target column. Defaults to "status".
#
# Returns:
#   A single numeric value: the Spearman correlation coefficient.
#
# Both columns go through as.numeric() first, so a factor column contributes
# its integer level codes.
detectCor <- function(x, data = dfrDataset, target = "status") {
  cor(
    as.numeric(data[[x]]),
    as.numeric(data[[target]]),
    method = "spearman"
  )
}
Load Dataset
# Read the spambase CSV into a data frame. TRUE is spelled out instead of the
# reassignable shorthand T; string columns are still converted to factors.
dfrDataset <- read.csv(
  "D:/R-BA/R-Scripts/data/spambase.csv",
  sep = ",",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Preview the first rows of the loaded data.
head(dfrDataset)
## word_freq_make word_freq_address word_freq_all word_freq_3d
## 1 0.00 0.64 0.64 0
## 2 0.21 0.28 0.50 0
## 3 0.06 0.00 0.71 0
## 4 0.00 0.00 0.00 0
## 5 0.00 0.00 0.00 0
## 6 0.00 0.00 0.00 0
## word_freq_our word_freq_over word_freq_remove word_freq_internet
## 1 0.32 0.00 0.00 0.00
## 2 0.14 0.28 0.21 0.07
## 3 1.23 0.19 0.19 0.12
## 4 0.63 0.00 0.31 0.63
## 5 0.63 0.00 0.31 0.63
## 6 1.85 0.00 0.00 1.85
## word_freq_order word_freq_mail word_freq_receive word_freq_will
## 1 0.00 0.00 0.00 0.64
## 2 0.00 0.94 0.21 0.79
## 3 0.64 0.25 0.38 0.45
## 4 0.31 0.63 0.31 0.31
## 5 0.31 0.63 0.31 0.31
## 6 0.00 0.00 0.00 0.00
## word_freq_people word_freq_report word_freq_addresses word_freq_free
## 1 0.00 0.00 0.00 0.32
## 2 0.65 0.21 0.14 0.14
## 3 0.12 0.00 1.75 0.06
## 4 0.31 0.00 0.00 0.31
## 5 0.31 0.00 0.00 0.31
## 6 0.00 0.00 0.00 0.00
## word_freq_business word_freq_email word_freq_you word_freq_credit
## 1 0.00 1.29 1.93 0.00
## 2 0.07 0.28 3.47 0.00
## 3 0.06 1.03 1.36 0.32
## 4 0.00 0.00 3.18 0.00
## 5 0.00 0.00 3.18 0.00
## 6 0.00 0.00 0.00 0.00
## word_freq_your word_freq_font word_freq_000 word_freq_money word_freq_hp
## 1 0.96 0 0.00 0.00 0
## 2 1.59 0 0.43 0.43 0
## 3 0.51 0 1.16 0.06 0
## 4 0.31 0 0.00 0.00 0
## 5 0.31 0 0.00 0.00 0
## 6 0.00 0 0.00 0.00 0
## word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 1 0 0 0 0.00
## 2 0 0 0 0.07
## 3 0 0 0 0.00
## 4 0 0 0 0.00
## 5 0 0 0 0.00
## 6 0 0 0 0.00
## word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 1 0 0 0.00 0
## 2 0 0 0.00 0
## 3 0 0 0.06 0
## 4 0 0 0.00 0
## 5 0 0 0.00 0
## 6 0 0 0.00 0
## word_freq_meeting word_freq_original word_freq_project word_freq_re
## 1 0 0.00 0 0.00
## 2 0 0.00 0 0.00
## 3 0 0.12 0 0.06
## 4 0 0.00 0 0.00
## 5 0 0.00 0 0.00
## 6 0 0.00 0 0.00
## word_freq_edu word_freq_table word_freq_conference char_freq_.
## 1 0.00 0 0 0.00
## 2 0.00 0 0 0.00
## 3 0.06 0 0 0.01
## 4 0.00 0 0 0.00
## 5 0.00 0 0 0.00
## 6 0.00 0 0 0.00
## char_freq_..1 char_freq_..2 char_freq_..3 char_freq_..4 char_freq_..5
## 1 0.000 0 0.778 0.000 0.000
## 2 0.132 0 0.372 0.180 0.048
## 3 0.143 0 0.276 0.184 0.010
## 4 0.137 0 0.137 0.000 0.000
## 5 0.135 0 0.135 0.000 0.000
## 6 0.223 0 0.000 0.000 0.000
## capital_run_length_average capital_run_length_longest
## 1 3.756 61
## 2 5.114 101
## 3 9.821 485
## 4 3.537 40
## 5 3.537 40
## 6 3.000 15
## capital_run_length_total status
## 1 278 1
## 2 1028 1
## 3 2259 1
## 4 191 1
## 5 191 1
## 6 54 1
Dataframe Structure
# Show the type and the first few values of every column.
str(dfrDataset)
## 'data.frame': 4601 obs. of 58 variables:
## $ word_freq_make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ word_freq_address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ word_freq_all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ word_freq_3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ word_freq_over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ word_freq_remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ word_freq_internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ word_freq_order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ word_freq_mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ word_freq_receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ word_freq_will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ word_freq_people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ word_freq_report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ word_freq_addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ word_freq_free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ word_freq_business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ word_freq_you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ word_freq_credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ word_freq_your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ word_freq_font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ word_freq_money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ word_freq_hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ word_freq_415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ word_freq_parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ word_freq_project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ word_freq_re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ char_freq_. : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ char_freq_..1 : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ char_freq_..2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ char_freq_..3 : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ char_freq_..4 : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ char_freq_..5 : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capital_run_length_average: num 3.76 5.11 9.82 3.54 3.54 ...
## $ capital_run_length_longest: int 61 101 485 40 40 15 4 11 445 43 ...
## $ capital_run_length_total : int 278 1028 2259 191 191 54 112 49 1257 749 ...
## $ status : int 1 1 1 1 1 1 1 1 1 1 ...
Dataframe Summary
# Per-column summary of the full dataset, returned as one list entry per column.
lapply(dfrDataset, FUN=summary)
## $word_freq_make
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1046 0.0000 4.5400
##
## $word_freq_address
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.213 0.000 14.280
##
## $word_freq_all
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2807 0.4200 5.1000
##
## $word_freq_3d
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06542 0.00000 42.81000
##
## $word_freq_our
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3122 0.3800 10.0000
##
## $word_freq_over
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0959 0.0000 5.8800
##
## $word_freq_remove
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1142 0.0000 7.2700
##
## $word_freq_internet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1053 0.0000 11.1100
##
## $word_freq_order
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09007 0.00000 5.26000
##
## $word_freq_mail
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2394 0.1600 18.1800
##
## $word_freq_receive
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05982 0.00000 2.61000
##
## $word_freq_will
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1000 0.5417 0.8000 9.6700
##
## $word_freq_people
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09393 0.00000 5.55000
##
## $word_freq_report
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05863 0.00000 10.00000
##
## $word_freq_addresses
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0492 0.0000 4.4100
##
## $word_freq_free
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2488 0.1000 20.0000
##
## $word_freq_business
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1426 0.0000 7.1400
##
## $word_freq_email
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1847 0.0000 9.0900
##
## $word_freq_you
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.310 1.662 2.640 18.750
##
## $word_freq_credit
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08558 0.00000 18.18000
##
## $word_freq_your
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.2200 0.8098 1.2700 11.1100
##
## $word_freq_font
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1212 0.0000 17.1000
##
## $word_freq_000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1016 0.0000 5.4500
##
## $word_freq_money
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09427 0.00000 12.50000
##
## $word_freq_hp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5495 0.0000 20.8300
##
## $word_freq_hpl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2654 0.0000 16.6600
##
## $word_freq_george
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7673 0.0000 33.3300
##
## $word_freq_650
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1248 0.0000 9.0900
##
## $word_freq_lab
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09892 0.00000 14.28000
##
## $word_freq_labs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1029 0.0000 5.8800
##
## $word_freq_telnet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06475 0.00000 12.50000
##
## $word_freq_857
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04705 0.00000 4.76000
##
## $word_freq_data
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09723 0.00000 18.18000
##
## $word_freq_415
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04784 0.00000 4.76000
##
## $word_freq_85
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1054 0.0000 20.0000
##
## $word_freq_technology
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09748 0.00000 7.69000
##
## $word_freq_1999
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.137 0.000 6.890
##
## $word_freq_parts
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0132 0.0000 8.3300
##
## $word_freq_pm
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07863 0.00000 11.11000
##
## $word_freq_direct
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06483 0.00000 4.76000
##
## $word_freq_cs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04367 0.00000 7.14000
##
## $word_freq_meeting
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1323 0.0000 14.2800
##
## $word_freq_original
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0461 0.0000 3.5700
##
## $word_freq_project
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0792 0.0000 20.0000
##
## $word_freq_re
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3012 0.1100 21.4200
##
## $word_freq_edu
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1798 0.0000 22.0500
##
## $word_freq_table
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.005444 0.000000 2.170000
##
## $word_freq_conference
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03187 0.00000 10.00000
##
## $char_freq_.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03857 0.00000 4.38500
##
## $char_freq_..1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.065 0.139 0.188 9.752
##
## $char_freq_..2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01698 0.00000 4.08100
##
## $char_freq_..3
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2691 0.3150 32.4800
##
## $char_freq_..4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07581 0.05200 6.00300
##
## $char_freq_..5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04424 0.00000 19.83000
##
## $capital_run_length_average
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.588 2.276 5.192 3.706 1102.000
##
## $capital_run_length_longest
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 15.00 52.17 43.00 9989.00
##
## $capital_run_length_total
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 35.0 95.0 283.3 266.0 15840.0
##
## $status
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.394 1.000 1.000
Missing Data
# Number of missing (NA) values in each column, using detectNA() defined above.
lapply(dfrDataset, FUN=detectNA)
## $word_freq_make
## [1] 0
##
## $word_freq_address
## [1] 0
##
## $word_freq_all
## [1] 0
##
## $word_freq_3d
## [1] 0
##
## $word_freq_our
## [1] 0
##
## $word_freq_over
## [1] 0
##
## $word_freq_remove
## [1] 0
##
## $word_freq_internet
## [1] 0
##
## $word_freq_order
## [1] 0
##
## $word_freq_mail
## [1] 0
##
## $word_freq_receive
## [1] 0
##
## $word_freq_will
## [1] 0
##
## $word_freq_people
## [1] 0
##
## $word_freq_report
## [1] 0
##
## $word_freq_addresses
## [1] 0
##
## $word_freq_free
## [1] 0
##
## $word_freq_business
## [1] 0
##
## $word_freq_email
## [1] 0
##
## $word_freq_you
## [1] 0
##
## $word_freq_credit
## [1] 0
##
## $word_freq_your
## [1] 0
##
## $word_freq_font
## [1] 0
##
## $word_freq_000
## [1] 0
##
## $word_freq_money
## [1] 0
##
## $word_freq_hp
## [1] 0
##
## $word_freq_hpl
## [1] 0
##
## $word_freq_george
## [1] 0
##
## $word_freq_650
## [1] 0
##
## $word_freq_lab
## [1] 0
##
## $word_freq_labs
## [1] 0
##
## $word_freq_telnet
## [1] 0
##
## $word_freq_857
## [1] 0
##
## $word_freq_data
## [1] 0
##
## $word_freq_415
## [1] 0
##
## $word_freq_85
## [1] 0
##
## $word_freq_technology
## [1] 0
##
## $word_freq_1999
## [1] 0
##
## $word_freq_parts
## [1] 0
##
## $word_freq_pm
## [1] 0
##
## $word_freq_direct
## [1] 0
##
## $word_freq_cs
## [1] 0
##
## $word_freq_meeting
## [1] 0
##
## $word_freq_original
## [1] 0
##
## $word_freq_project
## [1] 0
##
## $word_freq_re
## [1] 0
##
## $word_freq_edu
## [1] 0
##
## $word_freq_table
## [1] 0
##
## $word_freq_conference
## [1] 0
##
## $char_freq_.
## [1] 0
##
## $char_freq_..1
## [1] 0
##
## $char_freq_..2
## [1] 0
##
## $char_freq_..3
## [1] 0
##
## $char_freq_..4
## [1] 0
##
## $char_freq_..5
## [1] 0
##
## $capital_run_length_average
## [1] 0
##
## $capital_run_length_longest
## [1] 0
##
## $capital_run_length_total
## [1] 0
##
## $status
## [1] 0
Check output
# Tally the rows per status value, then draw the tallies as a bar chart
# whose bars are filled according to their count.
dfrStatus <- dfrDataset %>%
  group_by(status) %>%
  summarise(count = n())
ggplot(dfrStatus, aes(x = status, y = count)) +
  geom_bar(stat = "identity", aes(fill = count)) +
  labs(title = "Status Frequency Distribution", x = "Status", y = "Counts")
Find Correlations
## find correlations
# Absolute Spearman correlation of every column with status (via detectCor).
# vapply() pins the result to exactly one numeric per column, so an unexpected
# return shape fails loudly instead of silently changing the result type the
# way sapply() can. The result is still named by column.
vcnCorsData <- abs(vapply(colnames(dfrDataset), detectCor, numeric(1)))
# Distribution of the absolute correlations.
summary(vcnCorsData)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002525 0.148800 0.253300 0.276900 0.354700 1.000000
Show Correlations
# Print the absolute correlation of each column with status.
vcnCorsData
## word_freq_make word_freq_address
## 0.24069974 0.29750940
## word_freq_all word_freq_3d
## 0.33283147 0.09077776
## word_freq_our word_freq_over
## 0.40913946 0.31864550
## word_freq_remove word_freq_internet
## 0.51877779 0.34379623
## word_freq_order word_freq_mail
## 0.30073703 0.29682394
## word_freq_receive word_freq_will
## 0.35496682 0.14847653
## word_freq_people word_freq_report
## 0.21287588 0.14977533
## word_freq_addresses word_freq_free
## 0.26515743 0.50416922
## word_freq_business word_freq_email
## 0.35290749 0.29909391
## word_freq_you word_freq_credit
## 0.36110406 0.32418657
## word_freq_your word_freq_font
## 0.50159062 0.13797471
## word_freq_000 word_freq_money
## 0.42580256 0.47215455
## word_freq_hp word_freq_hpl
## 0.39981558 0.34188069
## word_freq_george word_freq_650
## 0.35393063 0.22619064
## word_freq_lab word_freq_labs
## 0.22068802 0.24580530
## word_freq_telnet word_freq_857
## 0.20467400 0.16983798
## word_freq_data word_freq_415
## 0.15756347 0.15802818
## word_freq_85 word_freq_technology
## 0.21413087 0.16680254
## word_freq_1999 word_freq_parts
## 0.26070752 0.00252536
## word_freq_pm word_freq_direct
## 0.14721389 0.02813193
## word_freq_cs word_freq_meeting
## 0.14453750 0.19574176
## word_freq_original word_freq_project
## 0.10781412 0.14453744
## word_freq_re word_freq_edu
## 0.07176763 0.19702549
## word_freq_table word_freq_conference
## 0.02266674 0.13903044
## char_freq_. char_freq_..1
## 0.05683530 0.03263555
## char_freq_..2 char_freq_..3
## 0.11122690 0.59785363
## char_freq_..4 char_freq_..5
## 0.56563314 0.26668614
## capital_run_length_average capital_run_length_longest
## 0.48794983 0.51515693
## capital_run_length_total status
## 0.44397367 1.00000000
Plot Correlations
# Plot the correlation diagram for the columns of dfrDataset (corrgram package).
corrgram(dfrDataset)
High Correlations
# Keep only the entries whose absolute correlation exceeds 0.6. status itself
# is among them because its correlation with itself is 1.
vcnCorsData[vcnCorsData>0.6]
## status
## 1
Count of spam & non-spam values
# Convert the status column to a factor before the dataset is partitioned.
dfrDataset$status <- as.factor(dfrDataset$status)
# Number of rows at each status level.
table(dfrDataset$status)
##
## 0 1
## 2788 1813
Dataset Split
# Fix the RNG seed so the partition below is reproducible.
set.seed(707)
# Draw the row indices of the training partition (p = 0.8 of the rows),
# sampled on the status column.
vctTrnRecs <- createDataPartition(
  y = dfrDataset[["status"]],
  p = 0.8,
  list = FALSE
)
# The selected rows form the training set; all remaining rows form the test set.
dfrTrnData <- dfrDataset[vctTrnRecs, ]
dfrTstData <- dfrDataset[-vctTrnRecs, ]
Training Dataset RowCount & ColCount
# Row and column counts of the training set.
dim(dfrTrnData)
## [1] 3682 58
Testing Dataset RowCount & ColCount
# Row and column counts of the test set.
dim(dfrTstData)
## [1] 919 58
Training Dataset Head
# Preview the first rows of the training set.
head(dfrTrnData)
## word_freq_make word_freq_address word_freq_all word_freq_3d
## 1 0.00 0.64 0.64 0
## 2 0.21 0.28 0.50 0
## 6 0.00 0.00 0.00 0
## 7 0.00 0.00 0.00 0
## 8 0.00 0.00 0.00 0
## 9 0.15 0.00 0.46 0
## word_freq_our word_freq_over word_freq_remove word_freq_internet
## 1 0.32 0.00 0.00 0.00
## 2 0.14 0.28 0.21 0.07
## 6 1.85 0.00 0.00 1.85
## 7 1.92 0.00 0.00 0.00
## 8 1.88 0.00 0.00 1.88
## 9 0.61 0.00 0.30 0.00
## word_freq_order word_freq_mail word_freq_receive word_freq_will
## 1 0.00 0.00 0.00 0.64
## 2 0.00 0.94 0.21 0.79
## 6 0.00 0.00 0.00 0.00
## 7 0.00 0.64 0.96 1.28
## 8 0.00 0.00 0.00 0.00
## 9 0.92 0.76 0.76 0.92
## word_freq_people word_freq_report word_freq_addresses word_freq_free
## 1 0.00 0.00 0.00 0.32
## 2 0.65 0.21 0.14 0.14
## 6 0.00 0.00 0.00 0.00
## 7 0.00 0.00 0.00 0.96
## 8 0.00 0.00 0.00 0.00
## 9 0.00 0.00 0.00 0.00
## word_freq_business word_freq_email word_freq_you word_freq_credit
## 1 0.00 1.29 1.93 0.00
## 2 0.07 0.28 3.47 0.00
## 6 0.00 0.00 0.00 0.00
## 7 0.00 0.32 3.85 0.00
## 8 0.00 0.00 0.00 0.00
## 9 0.00 0.15 1.23 3.53
## word_freq_your word_freq_font word_freq_000 word_freq_money word_freq_hp
## 1 0.96 0 0.00 0.00 0
## 2 1.59 0 0.43 0.43 0
## 6 0.00 0 0.00 0.00 0
## 7 0.64 0 0.00 0.00 0
## 8 0.00 0 0.00 0.00 0
## 9 2.00 0 0.00 0.15 0
## word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 1 0 0 0 0
## 2 0 0 0 0
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 1 0 0 0 0.00
## 2 0 0 0 0.00
## 6 0 0 0 0.00
## 7 0 0 0 0.00
## 8 0 0 0 0.00
## 9 0 0 0 0.15
## word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 1 0 0 0 0.00
## 2 0 0 0 0.07
## 6 0 0 0 0.00
## 7 0 0 0 0.00
## 8 0 0 0 0.00
## 9 0 0 0 0.00
## word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 1 0 0 0 0
## 2 0 0 0 0
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## word_freq_meeting word_freq_original word_freq_project word_freq_re
## 1 0 0.0 0 0
## 2 0 0.0 0 0
## 6 0 0.0 0 0
## 7 0 0.0 0 0
## 8 0 0.0 0 0
## 9 0 0.3 0 0
## word_freq_edu word_freq_table word_freq_conference char_freq_.
## 1 0 0 0 0
## 2 0 0 0 0
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## char_freq_..1 char_freq_..2 char_freq_..3 char_freq_..4 char_freq_..5
## 1 0.000 0 0.778 0.000 0.000
## 2 0.132 0 0.372 0.180 0.048
## 6 0.223 0 0.000 0.000 0.000
## 7 0.054 0 0.164 0.054 0.000
## 8 0.206 0 0.000 0.000 0.000
## 9 0.271 0 0.181 0.203 0.022
## capital_run_length_average capital_run_length_longest
## 1 3.756 61
## 2 5.114 101
## 6 3.000 15
## 7 1.671 4
## 8 2.450 11
## 9 9.744 445
## capital_run_length_total status
## 1 278 1
## 2 1028 1
## 6 54 1
## 7 112 1
## 8 49 1
## 9 1257 1
Testing Dataset Head
# Preview the first rows of the test set.
head(dfrTstData)
## word_freq_make word_freq_address word_freq_all word_freq_3d
## 3 0.06 0.00 0.71 0
## 4 0.00 0.00 0.00 0
## 5 0.00 0.00 0.00 0
## 13 0.00 0.69 0.34 0
## 14 0.00 0.00 0.00 0
## 23 0.00 0.00 0.00 0
## word_freq_our word_freq_over word_freq_remove word_freq_internet
## 3 1.23 0.19 0.19 0.12
## 4 0.63 0.00 0.31 0.63
## 5 0.63 0.00 0.31 0.63
## 13 0.34 0.00 0.00 0.00
## 14 0.90 0.00 0.90 0.00
## 23 2.94 0.00 0.00 0.00
## word_freq_order word_freq_mail word_freq_receive word_freq_will
## 3 0.64 0.25 0.38 0.45
## 4 0.31 0.63 0.31 0.31
## 5 0.31 0.63 0.31 0.31
## 13 0.00 0.00 0.00 0.69
## 14 0.00 0.90 0.90 0.00
## 23 0.00 0.00 0.00 0.00
## word_freq_people word_freq_report word_freq_addresses word_freq_free
## 3 0.12 0 1.75 0.06
## 4 0.31 0 0.00 0.31
## 5 0.31 0 0.00 0.31
## 13 0.00 0 0.00 0.34
## 14 0.90 0 0.00 0.00
## 23 0.00 0 0.00 2.94
## word_freq_business word_freq_email word_freq_you word_freq_credit
## 3 0.06 1.03 1.36 0.32
## 4 0.00 0.00 3.18 0.00
## 5 0.00 0.00 3.18 0.00
## 13 0.00 1.39 2.09 0.00
## 14 0.00 0.00 2.72 0.00
## 23 0.00 0.00 0.00 0.00
## word_freq_your word_freq_font word_freq_000 word_freq_money
## 3 0.51 0 1.16 0.06
## 4 0.31 0 0.00 0.00
## 5 0.31 0 0.00 0.00
## 13 1.04 0 0.00 0.00
## 14 0.90 0 0.00 0.00
## 23 0.00 0 0.00 0.00
## word_freq_hp word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 13 0 0 0 0 0
## 14 0 0 0 0 0
## 23 0 0 0 0 0
## word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 13 0 0 0 0
## 14 0 0 0 0
## 23 0 0 0 0
## word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 13 0 0 0 0
## 14 0 0 0 0
## 23 0 0 0 0
## word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 3 0 0 0.06 0
## 4 0 0 0.00 0
## 5 0 0 0.00 0
## 13 0 0 0.00 0
## 14 0 0 0.00 0
## 23 0 0 0.00 0
## word_freq_meeting word_freq_original word_freq_project word_freq_re
## 3 0 0.12 0 0.06
## 4 0 0.00 0 0.00
## 5 0 0.00 0 0.00
## 13 0 0.00 0 0.00
## 14 0 0.00 0 0.00
## 23 0 0.00 0 0.00
## word_freq_edu word_freq_table word_freq_conference char_freq_.
## 3 0.06 0 0 0.010
## 4 0.00 0 0 0.000
## 5 0.00 0 0 0.000
## 13 0.00 0 0 0.000
## 14 0.00 0 0 0.000
## 23 0.00 0 0 0.404
## char_freq_..1 char_freq_..2 char_freq_..3 char_freq_..4 char_freq_..5
## 3 0.143 0 0.276 0.184 0.01
## 4 0.137 0 0.137 0.000 0.00
## 5 0.135 0 0.135 0.000 0.00
## 13 0.056 0 0.786 0.000 0.00
## 14 0.000 0 0.000 0.000 0.00
## 23 0.404 0 0.809 0.000 0.00
## capital_run_length_average capital_run_length_longest
## 3 9.821 485
## 4 3.537 40
## 5 3.537 40
## 13 3.728 61
## 14 2.083 7
## 23 4.857 12
## capital_run_length_total status
## 3 2259 1
## 4 191 1
## 5 191 1
## 13 261 1
## 14 25 1
## 23 34 1
Training Dataset Summary
# Per-column summary of the training set; status is now a factor, so its
# summary is a count per level.
lapply(dfrTrnData, FUN=summary)
## $word_freq_make
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1068 0.0000 4.3400
##
## $word_freq_address
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2149 0.0000 14.2800
##
## $word_freq_all
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2828 0.4200 5.1000
##
## $word_freq_3d
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06731 0.00000 42.81000
##
## $word_freq_our
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3125 0.3800 10.0000
##
## $word_freq_over
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09717 0.00000 5.88000
##
## $word_freq_remove
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1178 0.0000 7.2700
##
## $word_freq_internet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1048 0.0000 6.0600
##
## $word_freq_order
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09014 0.00000 3.33000
##
## $word_freq_mail
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.229 0.140 11.110
##
## $word_freq_receive
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06156 0.00000 2.61000
##
## $word_freq_will
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1200 0.5519 0.8000 9.6700
##
## $word_freq_people
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09498 0.00000 5.55000
##
## $word_freq_report
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06067 0.00000 10.00000
##
## $word_freq_addresses
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04662 0.00000 4.41000
##
## $word_freq_free
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2533 0.1000 20.0000
##
## $word_freq_business
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1436 0.0000 7.1400
##
## $word_freq_email
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1788 0.0000 6.6600
##
## $word_freq_you
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.300 1.671 2.650 18.750
##
## $word_freq_credit
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08922 0.00000 18.18000
##
## $word_freq_your
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1900 0.8071 1.2800 10.7100
##
## $word_freq_font
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1231 0.0000 17.1000
##
## $word_freq_000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1018 0.0000 5.4500
##
## $word_freq_money
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09628 0.00000 12.50000
##
## $word_freq_hp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5557 0.0000 20.8300
##
## $word_freq_hpl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2683 0.0000 16.6600
##
## $word_freq_george
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7617 0.0000 33.3300
##
## $word_freq_650
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1215 0.0000 5.8800
##
## $word_freq_lab
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1063 0.0000 14.2800
##
## $word_freq_labs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1066 0.0000 5.8800
##
## $word_freq_telnet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06621 0.00000 4.76000
##
## $word_freq_857
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0508 0.0000 4.7600
##
## $word_freq_data
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09997 0.00000 18.18000
##
## $word_freq_415
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05179 0.00000 4.76000
##
## $word_freq_85
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1079 0.0000 20.0000
##
## $word_freq_technology
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1011 0.0000 7.6900
##
## $word_freq_1999
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.136 0.000 6.890
##
## $word_freq_parts
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01449 0.00000 8.33000
##
## $word_freq_pm
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07446 0.00000 9.75000
##
## $word_freq_direct
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06766 0.00000 4.76000
##
## $word_freq_cs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04687 0.00000 7.14000
##
## $word_freq_meeting
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.144 0.000 14.280
##
## $word_freq_original
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04646 0.00000 3.57000
##
## $word_freq_project
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0761 0.0000 20.0000
##
## $word_freq_re
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2919 0.1000 20.0000
##
## $word_freq_edu
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1773 0.0000 16.7000
##
## $word_freq_table
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.005185 0.000000 2.120000
##
## $word_freq_conference
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0332 0.0000 8.3300
##
## $char_freq_.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04055 0.00000 4.38500
##
## $char_freq_..1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0640 0.1375 0.1887 5.2770
##
## $char_freq_..2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01619 0.00000 2.77700
##
## $char_freq_..3
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2719 0.3290 32.4800
##
## $char_freq_..4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07571 0.05300 6.00300
##
## $char_freq_..5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04609 0.00000 19.83000
##
## $capital_run_length_average
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.600 2.272 5.138 3.707 1102.000
##
## $capital_run_length_longest
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 14.50 50.16 43.00 2204.00
##
## $capital_run_length_total
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 34.25 95.00 285.10 269.50 15840.00
##
## $status
## 0 1
## 2231 1451
Testing Dataset Summary
# Per-column summary of the test set.
lapply(dfrTstData, FUN=summary)
## $word_freq_make
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09565 0.00000 4.54000
##
## $word_freq_address
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2056 0.0000 14.2800
##
## $word_freq_all
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2722 0.4050 4.5400
##
## $word_freq_3d
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05789 0.00000 42.73000
##
## $word_freq_our
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3111 0.4100 7.6900
##
## $word_freq_over
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09081 0.00000 1.86000
##
## $word_freq_remove
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09997 0.00000 7.27000
##
## $word_freq_internet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1075 0.0000 11.1100
##
## $word_freq_order
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08978 0.00000 5.26000
##
## $word_freq_mail
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.281 0.220 18.180
##
## $word_freq_receive
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05285 0.00000 2.00000
##
## $word_freq_will
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.501 0.780 5.000
##
## $word_freq_people
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08971 0.00000 2.63000
##
## $word_freq_report
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05045 0.00000 2.32000
##
## $word_freq_addresses
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05955 0.00000 2.38000
##
## $word_freq_free
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2309 0.1250 6.4500
##
## $word_freq_business
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1386 0.0000 4.8700
##
## $word_freq_email
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2084 0.0000 9.0900
##
## $word_freq_you
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.310 1.628 2.590 10.630
##
## $word_freq_credit
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07098 0.00000 5.19000
##
## $word_freq_your
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.3100 0.8203 1.2500 11.1100
##
## $word_freq_font
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1135 0.0000 15.4300
##
## $word_freq_000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1012 0.0000 3.3800
##
## $word_freq_money
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08619 0.00000 9.09000
##
## $word_freq_hp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5247 0.0000 16.6600
##
## $word_freq_hpl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2536 0.0000 8.0000
##
## $word_freq_george
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7897 0.0000 33.3300
##
## $word_freq_650
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1383 0.0000 9.0900
##
## $word_freq_lab
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06946 0.00000 9.09000
##
## $word_freq_labs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08775 0.00000 4.34000
##
## $word_freq_telnet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05892 0.00000 12.50000
##
## $word_freq_857
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.032 0.000 4.700
##
## $word_freq_data
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08626 0.00000 4.76000
##
## $word_freq_415
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.032 0.000 4.700
##
## $word_freq_85
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09528 0.00000 4.65000
##
## $word_freq_technology
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08283 0.00000 4.16000
##
## $word_freq_1999
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1408 0.0000 4.5400
##
## $word_freq_parts
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.008041 0.000000 4.000000
##
## $word_freq_pm
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09532 0.00000 11.11000
##
## $word_freq_direct
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05349 0.00000 4.16000
##
## $word_freq_cs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03082 0.00000 7.14000
##
## $word_freq_meeting
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08565 0.00000 9.09000
##
## $word_freq_original
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04465 0.00000 1.75000
##
## $word_freq_project
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09159 0.00000 16.66000
##
## $word_freq_re
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3385 0.1350 21.4200
##
## $word_freq_edu
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1898 0.0000 22.0500
##
## $word_freq_table
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.006485 0.000000 2.170000
##
## $word_freq_conference
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02653 0.00000 10.00000
##
## $char_freq_.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03068 0.00000 3.83800
##
## $char_freq_..1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0670 0.1450 0.1845 9.7520
##
## $char_freq_..2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02011 0.00000 4.08100
##
## $char_freq_..3
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2579 0.2660 19.1300
##
## $char_freq_..4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07621 0.04650 3.30500
##
## $char_freq_..5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0368 0.0000 7.4070
##
## $capital_run_length_average
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.558 2.309 5.406 3.704 1022.000
##
## $capital_run_length_longest
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.50 15.00 60.22 41.50 9989.00
##
## $capital_run_length_total
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 37.0 94.0 275.9 260.0 10060.0
##
## $status
## 0 1
## 557 362
Random Forest Approach
myCtrl <- trainControl(method="cv", number=10, repeats=3) m1 <- train(predictor~., data=dataFrame, method="rf",
verbose=F, trControl=myCtrl) ## Boost Method cvCtrl <- trainControl(method="repeatedcv", number=10, repeats=3) m2 <- train(predictor~., data=dataFrame, method="gbm", verbose=F, trControl=cvCtrl) ## Custom Algorithm … notice method is not mentioned here myCtrl <- trainControl(method="oob", number=10, repeats=3) m3 <- train(predictor~., data=dataFrame, tuneGrid=data.frame(mtry=10), trControl=myCtrl)
We could also try one of these if required ## Support Vector Machines Model myCtrl <- trainControl(method="cv", number=10, repeats=3) o1 <- train(predictor~., data=dataFrame, method="svm", verbose=F, trControl=cvCtrl) ## KNN Model myCtrl <- trainControl(method="cv", number=10, repeats=3) o2 <- train(predictor~., data=dataFrame, method="knn", verbose=F, trControl=cvCtrl) ## Bagged Model myCtrl <- trainControl(method="cv", number=10, repeats=3) o3 <- train(predictor~., data=dataFrame, method="bag", verbose=F, trControl=cvCtrl)
Create Model - Random Forest (Default)
# Baseline model: fit a forest directly with randomForest(), without any
# caret resampling, using every column except `status` as a predictor.
## set seed so the fit is repeatable
set.seed(707)
# mtry: square root of the predictor count (all columns minus status).
# The raw square root is fractional and randomForest() only accepts it by
# rounding internally, so round here to make the value actually used explicit.
myMtry <- round(sqrt(ncol(dfrTrnData) - 1))
myNtrees <- 500
# start time
vctProcStrt <- proc.time()
# random forest on all predictors
mdlRndForDef <- randomForest(status ~ ., data = dfrTrnData,
                             mtry = myMtry, ntree = myNtrees)
# end time
vctProcEnds <- proc.time()
# report the difference in the first proc.time() component (user CPU seconds)
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 10.96"
View Model - Default Random Forest
# show the fitted forest: call, OOB error estimate and OOB confusion matrix
print(mdlRndForDef)
##
## Call:
## randomForest(formula = status ~ ., data = dfrTrnData, mtry = myMtry, ntree = myNtrees)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 4.89%
## Confusion matrix:
## 0 1 class.error
## 0 2161 70 0.03137606
## 1 110 1341 0.07580979
View Model Summary - Default Random Forest
# list the components stored in the fitted model object
print(summary(mdlRndForDef))
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 3682 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 7364 matrix numeric
## oob.times 3682 -none- numeric
## classes 2 -none- character
## importance 57 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 3682 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
Prediction - Test Data - Random Forest (Default)
# score the test split, then cross-tabulate predictions against true labels
vctRndForDef <- predict(mdlRndForDef, newdata = dfrTstData)
cmxRndForDef <- confusionMatrix(data = vctRndForDef,
                                reference = dfrTstData$status)
print(cmxRndForDef)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 537 31
## 1 20 331
##
## Accuracy : 0.9445
## 95% CI : (0.9277, 0.9584)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8832
## Mcnemar's Test P-Value : 0.1614
##
## Sensitivity : 0.9641
## Specificity : 0.9144
## Pos Pred Value : 0.9454
## Neg Pred Value : 0.9430
## Prevalence : 0.6061
## Detection Rate : 0.5843
## Detection Prevalence : 0.6181
## Balanced Accuracy : 0.9392
##
## 'Positive' Class : 0
##
Create Model - Random Forest (RFM)
# Fit a random forest through caret::train() and estimate its accuracy with
# 10-fold cross-validation at a single, fixed mtry.
## set seed so the fold assignment and the forest are repeatable
set.seed(707)
# start time
vctProcStrt <- proc.time()
# plain k-fold CV; `repeats` is only used by method="repeatedcv", so it is
# left out here instead of being passed and ignored
myControl <- trainControl(method = "cv", number = 10)
myMetric <- "Accuracy"
# one candidate mtry, rounded to a whole number of variables so the value
# reported by caret matches what the forest really uses
myMtry <- round(sqrt(ncol(dfrTrnData) - 1))
#myNtrees <- 500
myTuneGrid <- expand.grid(.mtry = myMtry)
mdlRndForRfm <- train(status ~ ., data = dfrTrnData, method = "rf",
                      verbose = FALSE, metric = myMetric,
                      trControl = myControl, tuneGrid = myTuneGrid)
# end time
vctProcEnds <- proc.time()
# report the difference in the first proc.time() component (user CPU seconds)
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 102.56"
View Model - Random Forest (RFM)
# show the caret fit: resampling scheme and cross-validated Accuracy / Kappa
print(mdlRndForRfm)
## Random Forest
##
## 3682 samples
## 57 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3313, 3314, 3314, 3314, 3314, 3314, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9513837 0.8976903
##
## Tuning parameter 'mtry' was held constant at a value of 7.549834
View Model Summary - Random Forest (RFM)
# list the components stored in the fitted model object
print(summary(mdlRndForRfm))
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 3682 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 7364 matrix numeric
## oob.times 3682 -none- numeric
## classes 2 -none- character
## importance 57 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 3682 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 57 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## param 1 -none- list
Prediction - Test Data - Random Forest (RFM)
# score the test split, then cross-tabulate predictions against true labels
vctRndForRfm <- predict(mdlRndForRfm, newdata = dfrTstData)
cmxRndForRfm <- confusionMatrix(data = vctRndForRfm,
                                reference = dfrTstData$status)
print(cmxRndForRfm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 537 30
## 1 20 332
##
## Accuracy : 0.9456
## 95% CI : (0.9289, 0.9594)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8855
## Mcnemar's Test P-Value : 0.2031
##
## Sensitivity : 0.9641
## Specificity : 0.9171
## Pos Pred Value : 0.9471
## Neg Pred Value : 0.9432
## Prevalence : 0.6061
## Detection Rate : 0.5843
## Detection Prevalence : 0.6170
## Balanced Accuracy : 0.9406
##
## 'Positive' Class : 0
##
Create Model - Gradient Boosting (GBM)
# Fit a gradient boosting model (caret method "gbm") on all predictors,
# evaluated with 10-fold cross-validation repeated 3 times.
## set seed so the fold assignment and the boosting run are repeatable
set.seed(707)
# start time
vctProcStrt <- proc.time()
# repeated k-fold CV
myControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
myMetric <- "Accuracy"
# No tuneGrid is supplied, so caret's own grid is searched. The mtry grid
# used for the forests is not built here because mtry is not a gbm tuning
# parameter (gbm tunes n.trees, interaction.depth, shrinkage, n.minobsinnode).
mdlRndForGbm <- train(status ~ ., data = dfrTrnData, method = "gbm",
                      verbose = FALSE, metric = myMetric,
                      trControl = myControl)
# end time
vctProcEnds <- proc.time()
# report the difference in the first proc.time() component (user CPU seconds)
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 107.22"
View Model - Gradient Boosting (GBM)
# show the caret fit: resampled Accuracy / Kappa for each tuning combination
print(mdlRndForGbm)
## Stochastic Gradient Boosting
##
## 3682 samples
## 57 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 3313, 3314, 3314, 3314, 3314, 3314, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.9166200 0.8217745
## 1 100 0.9317404 0.8553362
## 1 150 0.9391638 0.8713401
## 2 50 0.9316474 0.8552624
## 2 100 0.9409724 0.8753850
## 2 150 0.9464058 0.8869276
## 3 50 0.9370819 0.8670327
## 3 100 0.9442331 0.8823408
## 3 150 0.9472207 0.8888336
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
View Model Summary - Gradient Boosting (GBM)
# relative influence of each predictor in the final boosted model
print(summary(mdlRndForGbm))
## var rel.inf
## char_freq_..3 char_freq_..3 25.521437639
## char_freq_..4 char_freq_..4 17.578240290
## word_freq_remove word_freq_remove 11.766301810
## word_freq_free word_freq_free 9.679062803
## capital_run_length_average capital_run_length_average 6.055904109
## word_freq_hp word_freq_hp 5.270472464
## word_freq_your word_freq_your 4.505565972
## capital_run_length_longest capital_run_length_longest 3.193041450
## capital_run_length_total capital_run_length_total 2.851656742
## word_freq_money word_freq_money 2.843308894
## word_freq_george word_freq_george 1.935274601
## word_freq_edu word_freq_edu 1.922367776
## word_freq_our word_freq_our 1.152916067
## word_freq_internet word_freq_internet 0.637329734
## word_freq_you word_freq_you 0.624814549
## word_freq_1999 word_freq_1999 0.486145458
## word_freq_re word_freq_re 0.465441449
## word_freq_650 word_freq_650 0.465321156
## word_freq_meeting word_freq_meeting 0.453443503
## word_freq_will word_freq_will 0.311550314
## word_freq_000 word_freq_000 0.228422002
## word_freq_over word_freq_over 0.220407506
## word_freq_business word_freq_business 0.219532854
## char_freq_. char_freq_. 0.198633300
## char_freq_..1 char_freq_..1 0.198119641
## word_freq_hpl word_freq_hpl 0.193733490
## word_freq_receive word_freq_receive 0.168121158
## word_freq_email word_freq_email 0.163393833
## word_freq_technology word_freq_technology 0.120982669
## word_freq_make word_freq_make 0.080538250
## word_freq_3d word_freq_3d 0.074419321
## word_freq_credit word_freq_credit 0.072294158
## word_freq_project word_freq_project 0.061061928
## word_freq_order word_freq_order 0.056973556
## word_freq_mail word_freq_mail 0.044588232
## word_freq_font word_freq_font 0.038618287
## char_freq_..5 char_freq_..5 0.032986403
## word_freq_report word_freq_report 0.021522766
## word_freq_all word_freq_all 0.020236487
## word_freq_parts word_freq_parts 0.019842847
## word_freq_data word_freq_data 0.012981264
## word_freq_address word_freq_address 0.012545248
## word_freq_pm word_freq_pm 0.010697917
## word_freq_people word_freq_people 0.009750106
## word_freq_addresses word_freq_addresses 0.000000000
## word_freq_lab word_freq_lab 0.000000000
## word_freq_labs word_freq_labs 0.000000000
## word_freq_telnet word_freq_telnet 0.000000000
## word_freq_857 word_freq_857 0.000000000
## word_freq_415 word_freq_415 0.000000000
## word_freq_85 word_freq_85 0.000000000
## word_freq_direct word_freq_direct 0.000000000
## word_freq_cs word_freq_cs 0.000000000
## word_freq_original word_freq_original 0.000000000
## word_freq_table word_freq_table 0.000000000
## word_freq_conference word_freq_conference 0.000000000
## char_freq_..2 char_freq_..2 0.000000000
Prediction - Test Data - Gradient Boosting (GBM)
# score the test split, then cross-tabulate predictions against true labels
vctRndForGbm <- predict(mdlRndForGbm, newdata = dfrTstData)
cmxRndForGbm <- confusionMatrix(data = vctRndForGbm,
                                reference = dfrTstData$status)
print(cmxRndForGbm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 536 33
## 1 21 329
##
## Accuracy : 0.9412
## 95% CI : (0.924, 0.9556)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8762
## Mcnemar's Test P-Value : 0.1344
##
## Sensitivity : 0.9623
## Specificity : 0.9088
## Pos Pred Value : 0.9420
## Neg Pred Value : 0.9400
## Prevalence : 0.6061
## Detection Rate : 0.5832
## Detection Prevalence : 0.6192
## Balanced Accuracy : 0.9356
##
## 'Positive' Class : 0
##
Create Model - Random Forest (OOB)
# Fit a random forest through caret::train() and score it with the forest's
# own out-of-bag estimate instead of cross-validation. `method` is left
# unset in train() on purpose; the printed model shows a Random Forest is fit.
## set seed so the forest is repeatable
set.seed(707)
# start time
vctProcStrt <- proc.time()
# out-of-bag evaluation; `repeats` is a repeated-CV setting and is dropped.
# NOTE(review): `number` looks unused for OOB as well - confirm before removing
myControl <- trainControl(method = "oob", number = 10)
myMetric <- "Accuracy"
# one candidate mtry, rounded to a whole number of variables so the value
# reported by caret matches what the forest really uses
myMtry <- round(sqrt(ncol(dfrTrnData) - 1))
myNtrees <- 500
myTuneGrid <- expand.grid(.mtry = myMtry)
mdlRndForOob <- train(status ~ ., data = dfrTrnData,
                      verbose = FALSE, metric = myMetric,
                      trControl = myControl,
                      tuneGrid = myTuneGrid, ntree = myNtrees)
# end time
vctProcEnds <- proc.time()
# report the difference in the first proc.time() component (user CPU seconds)
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 21.31"
View Model - Random Forest (OOB)
# show the caret fit: out-of-bag Accuracy / Kappa at the fixed mtry
print(mdlRndForOob)
## Random Forest
##
## 3682 samples
## 57 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling results:
##
## Accuracy Kappa
## 0.9508419 0.8964783
##
## Tuning parameter 'mtry' was held constant at a value of 7.549834
View Model Summary - Random Forest (OOB)
# list the components stored in the fitted model object
print(summary(mdlRndForOob))
## Length Class Mode
## call 6 -none- call
## type 1 -none- character
## predicted 3682 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 7364 matrix numeric
## oob.times 3682 -none- numeric
## classes 2 -none- character
## importance 57 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 3682 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 57 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## param 2 -none- list
Prediction - Test Data - Random Forest (OOB)
# score the test split, then cross-tabulate predictions against true labels
vctRndForOob <- predict(mdlRndForOob, newdata = dfrTstData)
cmxRndForOob <- confusionMatrix(data = vctRndForOob,
                                reference = dfrTstData$status)
print(cmxRndForOob)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 536 31
## 1 21 331
##
## Accuracy : 0.9434
## 95% CI : (0.9265, 0.9575)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8809
## Mcnemar's Test P-Value : 0.212
##
## Sensitivity : 0.9623
## Specificity : 0.9144
## Pos Pred Value : 0.9453
## Neg Pred Value : 0.9403
## Prevalence : 0.6061
## Detection Rate : 0.5832
## Detection Prevalence : 0.6170
## Balanced Accuracy : 0.9383
##
## 'Positive' Class : 0
##