Problem Definition
Determine whether a given email is spam: status = 1 if it is spam, 0 otherwise.
Approach: We will fit several variants of the random forest model, compare their accuracy, and make the final predictions with the variant that has the lowest error.
Load Libs
library(plyr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(corrgram)
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:plyr':
##
## baseball
Functions for counting NA values and for checking correlation with the target
# Count the missing values in a vector.
#
# Args:
#   inp: Vector (for example one data frame column) to inspect.
#
# Returns:
#   The number of elements of inp for which is.na() is TRUE.
detectNA <- function(inp) {
  na_flags <- is.na(inp)
  sum(na_flags)
}
# Spearman rank correlation between one column and the target column.
#
# Args:
#   x:      Name (or index) of the column to correlate with the target.
#   data:   Data frame holding both columns. Defaults to the global
#           dfrSpamMailData, which is looked up when the function is
#           called, so existing calls of the form detectCor(x) keep working.
#   target: Name of the target column; defaults to "status".
#
# Returns:
#   A single numeric value: the Spearman correlation coefficient.
detectCor <- function(x, data = dfrSpamMailData, target = "status") {
  # [[ ]] extracts exactly one column as a vector; as.numeric() lets factor
  # columns take part through their integer codes.
  cor(as.numeric(data[[x]]),
      as.numeric(data[[target]]),
      method = "spearman")
}
Load Dataset
# Read the spambase data. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
dfrSpamMailData <- read.csv("./data/spambase.csv", header = TRUE, stringsAsFactors = TRUE)
# Preview the first rows.
head(dfrSpamMailData)
## word_freq_make word_freq_address word_freq_all word_freq_3d
## 1 0.00 0.64 0.64 0
## 2 0.21 0.28 0.50 0
## 3 0.06 0.00 0.71 0
## 4 0.00 0.00 0.00 0
## 5 0.00 0.00 0.00 0
## 6 0.00 0.00 0.00 0
## word_freq_our word_freq_over word_freq_remove word_freq_internet
## 1 0.32 0.00 0.00 0.00
## 2 0.14 0.28 0.21 0.07
## 3 1.23 0.19 0.19 0.12
## 4 0.63 0.00 0.31 0.63
## 5 0.63 0.00 0.31 0.63
## 6 1.85 0.00 0.00 1.85
## word_freq_order word_freq_mail word_freq_receive word_freq_will
## 1 0.00 0.00 0.00 0.64
## 2 0.00 0.94 0.21 0.79
## 3 0.64 0.25 0.38 0.45
## 4 0.31 0.63 0.31 0.31
## 5 0.31 0.63 0.31 0.31
## 6 0.00 0.00 0.00 0.00
## word_freq_people word_freq_report word_freq_addresses word_freq_free
## 1 0.00 0.00 0.00 0.32
## 2 0.65 0.21 0.14 0.14
## 3 0.12 0.00 1.75 0.06
## 4 0.31 0.00 0.00 0.31
## 5 0.31 0.00 0.00 0.31
## 6 0.00 0.00 0.00 0.00
## word_freq_business word_freq_email word_freq_you word_freq_credit
## 1 0.00 1.29 1.93 0.00
## 2 0.07 0.28 3.47 0.00
## 3 0.06 1.03 1.36 0.32
## 4 0.00 0.00 3.18 0.00
## 5 0.00 0.00 3.18 0.00
## 6 0.00 0.00 0.00 0.00
## word_freq_your word_freq_font word_freq_000 word_freq_money word_freq_hp
## 1 0.96 0 0.00 0.00 0
## 2 1.59 0 0.43 0.43 0
## 3 0.51 0 1.16 0.06 0
## 4 0.31 0 0.00 0.00 0
## 5 0.31 0 0.00 0.00 0
## 6 0.00 0 0.00 0.00 0
## word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 1 0 0 0 0.00
## 2 0 0 0 0.07
## 3 0 0 0 0.00
## 4 0 0 0 0.00
## 5 0 0 0 0.00
## 6 0 0 0 0.00
## word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 1 0 0 0.00 0
## 2 0 0 0.00 0
## 3 0 0 0.06 0
## 4 0 0 0.00 0
## 5 0 0 0.00 0
## 6 0 0 0.00 0
## word_freq_meeting word_freq_original word_freq_project word_freq_re
## 1 0 0.00 0 0.00
## 2 0 0.00 0 0.00
## 3 0 0.12 0 0.06
## 4 0 0.00 0 0.00
## 5 0 0.00 0 0.00
## 6 0 0.00 0 0.00
## word_freq_edu word_freq_table word_freq_conference char_freq_.
## 1 0.00 0 0 0.00
## 2 0.00 0 0 0.00
## 3 0.06 0 0 0.01
## 4 0.00 0 0 0.00
## 5 0.00 0 0 0.00
## 6 0.00 0 0 0.00
## char_freq_..1 char_freq_..2 char_freq_..3 char_freq_..4 char_freq_..5
## 1 0.000 0 0.778 0.000 0.000
## 2 0.132 0 0.372 0.180 0.048
## 3 0.143 0 0.276 0.184 0.010
## 4 0.137 0 0.137 0.000 0.000
## 5 0.135 0 0.135 0.000 0.000
## 6 0.223 0 0.000 0.000 0.000
## capital_run_length_average capital_run_length_longest
## 1 3.756 61
## 2 5.114 101
## 3 9.821 485
## 4 3.537 40
## 5 3.537 40
## 6 3.000 15
## capital_run_length_total status
## 1 278 1
## 2 1028 1
## 3 2259 1
## 4 191 1
## 5 191 1
## 6 54 1
Dataframe Structure
# Show the dimensions of the data frame and the type of every column.
str(dfrSpamMailData)
## 'data.frame': 4601 obs. of 58 variables:
## $ word_freq_make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ word_freq_address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ word_freq_all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ word_freq_3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ word_freq_over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ word_freq_remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ word_freq_internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ word_freq_order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ word_freq_mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ word_freq_receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ word_freq_will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ word_freq_people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ word_freq_report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ word_freq_addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ word_freq_free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ word_freq_business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ word_freq_you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ word_freq_credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ word_freq_your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ word_freq_font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ word_freq_money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ word_freq_hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ word_freq_415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ word_freq_parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ word_freq_project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ word_freq_re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ char_freq_. : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ char_freq_..1 : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ char_freq_..2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ char_freq_..3 : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ char_freq_..4 : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ char_freq_..5 : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capital_run_length_average: num 3.76 5.11 9.82 3.54 3.54 ...
## $ capital_run_length_longest: int 61 101 485 40 40 15 4 11 445 43 ...
## $ capital_run_length_total : int 278 1028 2259 191 191 54 112 49 1257 749 ...
## $ status : int 1 1 1 1 1 1 1 1 1 1 ...
Dataframe Summary
# Apply summary() to each column separately; the result is a list with one entry per column.
lapply(dfrSpamMailData, FUN=summary)
## $word_freq_make
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1046 0.0000 4.5400
##
## $word_freq_address
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.213 0.000 14.280
##
## $word_freq_all
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2807 0.4200 5.1000
##
## $word_freq_3d
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06542 0.00000 42.81000
##
## $word_freq_our
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3122 0.3800 10.0000
##
## $word_freq_over
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0959 0.0000 5.8800
##
## $word_freq_remove
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1142 0.0000 7.2700
##
## $word_freq_internet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1053 0.0000 11.1100
##
## $word_freq_order
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09007 0.00000 5.26000
##
## $word_freq_mail
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2394 0.1600 18.1800
##
## $word_freq_receive
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05982 0.00000 2.61000
##
## $word_freq_will
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1000 0.5417 0.8000 9.6700
##
## $word_freq_people
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09393 0.00000 5.55000
##
## $word_freq_report
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05863 0.00000 10.00000
##
## $word_freq_addresses
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0492 0.0000 4.4100
##
## $word_freq_free
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2488 0.1000 20.0000
##
## $word_freq_business
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1426 0.0000 7.1400
##
## $word_freq_email
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1847 0.0000 9.0900
##
## $word_freq_you
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.310 1.662 2.640 18.750
##
## $word_freq_credit
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08558 0.00000 18.18000
##
## $word_freq_your
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.2200 0.8098 1.2700 11.1100
##
## $word_freq_font
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1212 0.0000 17.1000
##
## $word_freq_000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1016 0.0000 5.4500
##
## $word_freq_money
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09427 0.00000 12.50000
##
## $word_freq_hp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5495 0.0000 20.8300
##
## $word_freq_hpl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2654 0.0000 16.6600
##
## $word_freq_george
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7673 0.0000 33.3300
##
## $word_freq_650
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1248 0.0000 9.0900
##
## $word_freq_lab
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09892 0.00000 14.28000
##
## $word_freq_labs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1029 0.0000 5.8800
##
## $word_freq_telnet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06475 0.00000 12.50000
##
## $word_freq_857
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04705 0.00000 4.76000
##
## $word_freq_data
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09723 0.00000 18.18000
##
## $word_freq_415
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04784 0.00000 4.76000
##
## $word_freq_85
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1054 0.0000 20.0000
##
## $word_freq_technology
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09748 0.00000 7.69000
##
## $word_freq_1999
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.137 0.000 6.890
##
## $word_freq_parts
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0132 0.0000 8.3300
##
## $word_freq_pm
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07863 0.00000 11.11000
##
## $word_freq_direct
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06483 0.00000 4.76000
##
## $word_freq_cs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04367 0.00000 7.14000
##
## $word_freq_meeting
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1323 0.0000 14.2800
##
## $word_freq_original
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0461 0.0000 3.5700
##
## $word_freq_project
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0792 0.0000 20.0000
##
## $word_freq_re
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3012 0.1100 21.4200
##
## $word_freq_edu
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1798 0.0000 22.0500
##
## $word_freq_table
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.005444 0.000000 2.170000
##
## $word_freq_conference
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03187 0.00000 10.00000
##
## $char_freq_.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03857 0.00000 4.38500
##
## $char_freq_..1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.065 0.139 0.188 9.752
##
## $char_freq_..2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01698 0.00000 4.08100
##
## $char_freq_..3
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2691 0.3150 32.4800
##
## $char_freq_..4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07581 0.05200 6.00300
##
## $char_freq_..5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04424 0.00000 19.83000
##
## $capital_run_length_average
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.588 2.276 5.192 3.706 1102.000
##
## $capital_run_length_longest
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 15.00 52.17 43.00 9989.00
##
## $capital_run_length_total
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 35.0 95.0 283.3 266.0 15840.0
##
## $status
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.394 1.000 1.000
Missing Data
# Count the NA values in each column using detectNA().
lapply(dfrSpamMailData, FUN=detectNA)
## $word_freq_make
## [1] 0
##
## $word_freq_address
## [1] 0
##
## $word_freq_all
## [1] 0
##
## $word_freq_3d
## [1] 0
##
## $word_freq_our
## [1] 0
##
## $word_freq_over
## [1] 0
##
## $word_freq_remove
## [1] 0
##
## $word_freq_internet
## [1] 0
##
## $word_freq_order
## [1] 0
##
## $word_freq_mail
## [1] 0
##
## $word_freq_receive
## [1] 0
##
## $word_freq_will
## [1] 0
##
## $word_freq_people
## [1] 0
##
## $word_freq_report
## [1] 0
##
## $word_freq_addresses
## [1] 0
##
## $word_freq_free
## [1] 0
##
## $word_freq_business
## [1] 0
##
## $word_freq_email
## [1] 0
##
## $word_freq_you
## [1] 0
##
## $word_freq_credit
## [1] 0
##
## $word_freq_your
## [1] 0
##
## $word_freq_font
## [1] 0
##
## $word_freq_000
## [1] 0
##
## $word_freq_money
## [1] 0
##
## $word_freq_hp
## [1] 0
##
## $word_freq_hpl
## [1] 0
##
## $word_freq_george
## [1] 0
##
## $word_freq_650
## [1] 0
##
## $word_freq_lab
## [1] 0
##
## $word_freq_labs
## [1] 0
##
## $word_freq_telnet
## [1] 0
##
## $word_freq_857
## [1] 0
##
## $word_freq_data
## [1] 0
##
## $word_freq_415
## [1] 0
##
## $word_freq_85
## [1] 0
##
## $word_freq_technology
## [1] 0
##
## $word_freq_1999
## [1] 0
##
## $word_freq_parts
## [1] 0
##
## $word_freq_pm
## [1] 0
##
## $word_freq_direct
## [1] 0
##
## $word_freq_cs
## [1] 0
##
## $word_freq_meeting
## [1] 0
##
## $word_freq_original
## [1] 0
##
## $word_freq_project
## [1] 0
##
## $word_freq_re
## [1] 0
##
## $word_freq_edu
## [1] 0
##
## $word_freq_table
## [1] 0
##
## $word_freq_conference
## [1] 0
##
## $char_freq_.
## [1] 0
##
## $char_freq_..1
## [1] 0
##
## $char_freq_..2
## [1] 0
##
## $char_freq_..3
## [1] 0
##
## $char_freq_..4
## [1] 0
##
## $char_freq_..5
## [1] 0
##
## $capital_run_length_average
## [1] 0
##
## $capital_run_length_longest
## [1] 0
##
## $capital_run_length_total
## [1] 0
##
## $status
## [1] 0
Observation
There are no missing values
Check output
# Count the number of e-mails in each class (status 0 or 1).
# dplyr:: is spelled out because plyr is attached as well and exports its
# own summarise(), so the result does not depend on package load order.
dfrSpamMailFreq <- dplyr::summarise(dplyr::group_by(dfrSpamMailData, status),
                                    count = dplyr::n())
# Bar chart of the class frequencies. status is still an integer column at
# this point, so it is wrapped in factor() to get a discrete axis with one
# bar per class.
ggplot(dfrSpamMailFreq, aes(x = factor(status), y = count)) +
  geom_bar(stat = "identity", aes(fill = count)) +
  labs(title = "Status Frequency Distribution", x = "Status", y = "Counts")
Find Correlations
## find correlations
# Absolute Spearman correlation of every column with status (status itself
# is included and therefore scores 1). vapply() is used instead of sapply()
# so the result is guaranteed to be a named numeric vector with exactly one
# value per column.
vcnCorsData <- abs(vapply(colnames(dfrSpamMailData), detectCor, numeric(1)))
summary(vcnCorsData)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002525 0.148800 0.253300 0.276900 0.354700 1.000000
Observation
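No multicollinearity is apparent from this summary. Note that these values are correlations of each feature with status only (the maximum of 1 is status with itself); correlations among the features themselves are shown in the corrgram below.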
Show Correlations
# Print the absolute correlation of each column with status.
vcnCorsData
## word_freq_make word_freq_address
## 0.24069974 0.29750940
## word_freq_all word_freq_3d
## 0.33283147 0.09077776
## word_freq_our word_freq_over
## 0.40913946 0.31864550
## word_freq_remove word_freq_internet
## 0.51877779 0.34379623
## word_freq_order word_freq_mail
## 0.30073703 0.29682394
## word_freq_receive word_freq_will
## 0.35496682 0.14847653
## word_freq_people word_freq_report
## 0.21287588 0.14977533
## word_freq_addresses word_freq_free
## 0.26515743 0.50416922
## word_freq_business word_freq_email
## 0.35290749 0.29909391
## word_freq_you word_freq_credit
## 0.36110406 0.32418657
## word_freq_your word_freq_font
## 0.50159062 0.13797471
## word_freq_000 word_freq_money
## 0.42580256 0.47215455
## word_freq_hp word_freq_hpl
## 0.39981558 0.34188069
## word_freq_george word_freq_650
## 0.35393063 0.22619064
## word_freq_lab word_freq_labs
## 0.22068802 0.24580530
## word_freq_telnet word_freq_857
## 0.20467400 0.16983798
## word_freq_data word_freq_415
## 0.15756347 0.15802818
## word_freq_85 word_freq_technology
## 0.21413087 0.16680254
## word_freq_1999 word_freq_parts
## 0.26070752 0.00252536
## word_freq_pm word_freq_direct
## 0.14721389 0.02813193
## word_freq_cs word_freq_meeting
## 0.14453750 0.19574176
## word_freq_original word_freq_project
## 0.10781412 0.14453744
## word_freq_re word_freq_edu
## 0.07176763 0.19702549
## word_freq_table word_freq_conference
## 0.02266674 0.13903044
## char_freq_. char_freq_..1
## 0.05683530 0.03263555
## char_freq_..2 char_freq_..3
## 0.11122690 0.59785363
## char_freq_..4 char_freq_..5
## 0.56563314 0.26668614
## capital_run_length_average capital_run_length_longest
## 0.48794983 0.51515693
## capital_run_length_total status
## 0.44397367 1.00000000
Plot Correlations
# Draw a correlogram of the columns of the data frame.
corrgram(dfrSpamMailData)
High Correlations
# Keep only the entries whose absolute correlation with status exceeds 0.8.
# status is part of vcnCorsData, so it always appears here with value 1.
vcnCorsData[vcnCorsData>0.8]
## status
## 1
Converting the dependent variable to a factor
# Check the type of status before the conversion.
class(dfrSpamMailData$status)
## [1] "integer"
# Convert status from integer to factor, then confirm the new type.
dfrSpamMailData$status <- as.factor(dfrSpamMailData$status)
class(dfrSpamMailData$status)
## [1] "factor"
# Number of rows in each status level.
table(dfrSpamMailData$status)
##
## 0 1
## 2788 1813
Dataset Split
# Fix the random seed so the partition is reproducible.
set.seed(707)
# Row indices for the training set: p=0.8 requests 80% of the rows, and
# list=FALSE returns them in a form that can be used directly as a row index.
vctTrnRecs <- createDataPartition(y=dfrSpamMailData$status, p=0.8, list=FALSE)
# The selected rows form the training set ...
dfrTrnData <- dfrSpamMailData[vctTrnRecs,]
# ... and negative indexing keeps all remaining rows for testing.
dfrTstData <- dfrSpamMailData[-vctTrnRecs,]
Training Dataset RowCount & ColCount
# Number of rows and columns in the training set.
dim(dfrTrnData)
## [1] 3682 58
Testing Dataset Head
# Preview the first rows of the testing set; row names are the original row numbers.
head(dfrTstData)
## word_freq_make word_freq_address word_freq_all word_freq_3d
## 3 0.06 0.00 0.71 0
## 4 0.00 0.00 0.00 0
## 5 0.00 0.00 0.00 0
## 13 0.00 0.69 0.34 0
## 14 0.00 0.00 0.00 0
## 23 0.00 0.00 0.00 0
## word_freq_our word_freq_over word_freq_remove word_freq_internet
## 3 1.23 0.19 0.19 0.12
## 4 0.63 0.00 0.31 0.63
## 5 0.63 0.00 0.31 0.63
## 13 0.34 0.00 0.00 0.00
## 14 0.90 0.00 0.90 0.00
## 23 2.94 0.00 0.00 0.00
## word_freq_order word_freq_mail word_freq_receive word_freq_will
## 3 0.64 0.25 0.38 0.45
## 4 0.31 0.63 0.31 0.31
## 5 0.31 0.63 0.31 0.31
## 13 0.00 0.00 0.00 0.69
## 14 0.00 0.90 0.90 0.00
## 23 0.00 0.00 0.00 0.00
## word_freq_people word_freq_report word_freq_addresses word_freq_free
## 3 0.12 0 1.75 0.06
## 4 0.31 0 0.00 0.31
## 5 0.31 0 0.00 0.31
## 13 0.00 0 0.00 0.34
## 14 0.90 0 0.00 0.00
## 23 0.00 0 0.00 2.94
## word_freq_business word_freq_email word_freq_you word_freq_credit
## 3 0.06 1.03 1.36 0.32
## 4 0.00 0.00 3.18 0.00
## 5 0.00 0.00 3.18 0.00
## 13 0.00 1.39 2.09 0.00
## 14 0.00 0.00 2.72 0.00
## 23 0.00 0.00 0.00 0.00
## word_freq_your word_freq_font word_freq_000 word_freq_money
## 3 0.51 0 1.16 0.06
## 4 0.31 0 0.00 0.00
## 5 0.31 0 0.00 0.00
## 13 1.04 0 0.00 0.00
## 14 0.90 0 0.00 0.00
## 23 0.00 0 0.00 0.00
## word_freq_hp word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 13 0 0 0 0 0
## 14 0 0 0 0 0
## 23 0 0 0 0 0
## word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 13 0 0 0 0
## 14 0 0 0 0
## 23 0 0 0 0
## word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 13 0 0 0 0
## 14 0 0 0 0
## 23 0 0 0 0
## word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 3 0 0 0.06 0
## 4 0 0 0.00 0
## 5 0 0 0.00 0
## 13 0 0 0.00 0
## 14 0 0 0.00 0
## 23 0 0 0.00 0
## word_freq_meeting word_freq_original word_freq_project word_freq_re
## 3 0 0.12 0 0.06
## 4 0 0.00 0 0.00
## 5 0 0.00 0 0.00
## 13 0 0.00 0 0.00
## 14 0 0.00 0 0.00
## 23 0 0.00 0 0.00
## word_freq_edu word_freq_table word_freq_conference char_freq_.
## 3 0.06 0 0 0.010
## 4 0.00 0 0 0.000
## 5 0.00 0 0 0.000
## 13 0.00 0 0 0.000
## 14 0.00 0 0 0.000
## 23 0.00 0 0 0.404
## char_freq_..1 char_freq_..2 char_freq_..3 char_freq_..4 char_freq_..5
## 3 0.143 0 0.276 0.184 0.01
## 4 0.137 0 0.137 0.000 0.00
## 5 0.135 0 0.135 0.000 0.00
## 13 0.056 0 0.786 0.000 0.00
## 14 0.000 0 0.000 0.000 0.00
## 23 0.404 0 0.809 0.000 0.00
## capital_run_length_average capital_run_length_longest
## 3 9.821 485
## 4 3.537 40
## 5 3.537 40
## 13 3.728 61
## 14 2.083 7
## 23 4.857 12
## capital_run_length_total status
## 3 2259 1
## 4 191 1
## 5 191 1
## 13 261 1
## 14 25 1
## 23 34 1
Training Dataset Summary
# Per-column summary of the training set; status is a factor here, so its summary is a count per level.
lapply(dfrTrnData, FUN=summary)
## $word_freq_make
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1068 0.0000 4.3400
##
## $word_freq_address
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2149 0.0000 14.2800
##
## $word_freq_all
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2828 0.4200 5.1000
##
## $word_freq_3d
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06731 0.00000 42.81000
##
## $word_freq_our
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3125 0.3800 10.0000
##
## $word_freq_over
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09717 0.00000 5.88000
##
## $word_freq_remove
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1178 0.0000 7.2700
##
## $word_freq_internet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1048 0.0000 6.0600
##
## $word_freq_order
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09014 0.00000 3.33000
##
## $word_freq_mail
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.229 0.140 11.110
##
## $word_freq_receive
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06156 0.00000 2.61000
##
## $word_freq_will
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1200 0.5519 0.8000 9.6700
##
## $word_freq_people
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09498 0.00000 5.55000
##
## $word_freq_report
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06067 0.00000 10.00000
##
## $word_freq_addresses
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04662 0.00000 4.41000
##
## $word_freq_free
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2533 0.1000 20.0000
##
## $word_freq_business
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1436 0.0000 7.1400
##
## $word_freq_email
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1788 0.0000 6.6600
##
## $word_freq_you
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.300 1.671 2.650 18.750
##
## $word_freq_credit
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08922 0.00000 18.18000
##
## $word_freq_your
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.1900 0.8071 1.2800 10.7100
##
## $word_freq_font
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1231 0.0000 17.1000
##
## $word_freq_000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1018 0.0000 5.4500
##
## $word_freq_money
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09628 0.00000 12.50000
##
## $word_freq_hp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5557 0.0000 20.8300
##
## $word_freq_hpl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2683 0.0000 16.6600
##
## $word_freq_george
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7617 0.0000 33.3300
##
## $word_freq_650
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1215 0.0000 5.8800
##
## $word_freq_lab
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1063 0.0000 14.2800
##
## $word_freq_labs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1066 0.0000 5.8800
##
## $word_freq_telnet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06621 0.00000 4.76000
##
## $word_freq_857
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0508 0.0000 4.7600
##
## $word_freq_data
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09997 0.00000 18.18000
##
## $word_freq_415
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05179 0.00000 4.76000
##
## $word_freq_85
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1079 0.0000 20.0000
##
## $word_freq_technology
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1011 0.0000 7.6900
##
## $word_freq_1999
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.136 0.000 6.890
##
## $word_freq_parts
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01449 0.00000 8.33000
##
## $word_freq_pm
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07446 0.00000 9.75000
##
## $word_freq_direct
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06766 0.00000 4.76000
##
## $word_freq_cs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04687 0.00000 7.14000
##
## $word_freq_meeting
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.144 0.000 14.280
##
## $word_freq_original
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04646 0.00000 3.57000
##
## $word_freq_project
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0761 0.0000 20.0000
##
## $word_freq_re
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2919 0.1000 20.0000
##
## $word_freq_edu
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1773 0.0000 16.7000
##
## $word_freq_table
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.005185 0.000000 2.120000
##
## $word_freq_conference
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0332 0.0000 8.3300
##
## $char_freq_.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04055 0.00000 4.38500
##
## $char_freq_..1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0640 0.1375 0.1887 5.2770
##
## $char_freq_..2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01619 0.00000 2.77700
##
## $char_freq_..3
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2719 0.3290 32.4800
##
## $char_freq_..4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07571 0.05300 6.00300
##
## $char_freq_..5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04609 0.00000 19.83000
##
## $capital_run_length_average
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.600 2.272 5.138 3.707 1102.000
##
## $capital_run_length_longest
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 14.50 50.16 43.00 2204.00
##
## $capital_run_length_total
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 34.25 95.00 285.10 269.50 15840.00
##
## $status
## 0 1
## 2231 1451
Testing Dataset Summary
# Per-column summary of the testing set; status is a factor here, so its summary is a count per level.
lapply(dfrTstData, FUN=summary)
## $word_freq_make
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09565 0.00000 4.54000
##
## $word_freq_address
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2056 0.0000 14.2800
##
## $word_freq_all
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2722 0.4050 4.5400
##
## $word_freq_3d
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05789 0.00000 42.73000
##
## $word_freq_our
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3111 0.4100 7.6900
##
## $word_freq_over
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09081 0.00000 1.86000
##
## $word_freq_remove
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09997 0.00000 7.27000
##
## $word_freq_internet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1075 0.0000 11.1100
##
## $word_freq_order
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08978 0.00000 5.26000
##
## $word_freq_mail
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.281 0.220 18.180
##
## $word_freq_receive
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05285 0.00000 2.00000
##
## $word_freq_will
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.501 0.780 5.000
##
## $word_freq_people
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08971 0.00000 2.63000
##
## $word_freq_report
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05045 0.00000 2.32000
##
## $word_freq_addresses
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05955 0.00000 2.38000
##
## $word_freq_free
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2309 0.1250 6.4500
##
## $word_freq_business
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1386 0.0000 4.8700
##
## $word_freq_email
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2084 0.0000 9.0900
##
## $word_freq_you
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.310 1.628 2.590 10.630
##
## $word_freq_credit
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07098 0.00000 5.19000
##
## $word_freq_your
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.3100 0.8203 1.2500 11.1100
##
## $word_freq_font
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1135 0.0000 15.4300
##
## $word_freq_000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1012 0.0000 3.3800
##
## $word_freq_money
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08619 0.00000 9.09000
##
## $word_freq_hp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5247 0.0000 16.6600
##
## $word_freq_hpl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2536 0.0000 8.0000
##
## $word_freq_george
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7897 0.0000 33.3300
##
## $word_freq_650
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1383 0.0000 9.0900
##
## $word_freq_lab
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06946 0.00000 9.09000
##
## $word_freq_labs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08775 0.00000 4.34000
##
## $word_freq_telnet
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05892 0.00000 12.50000
##
## $word_freq_857
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.032 0.000 4.700
##
## $word_freq_data
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08626 0.00000 4.76000
##
## $word_freq_415
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.032 0.000 4.700
##
## $word_freq_85
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09528 0.00000 4.65000
##
## $word_freq_technology
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08283 0.00000 4.16000
##
## $word_freq_1999
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1408 0.0000 4.5400
##
## $word_freq_parts
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.008041 0.000000 4.000000
##
## $word_freq_pm
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09532 0.00000 11.11000
##
## $word_freq_direct
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05349 0.00000 4.16000
##
## $word_freq_cs
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03082 0.00000 7.14000
##
## $word_freq_meeting
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.08565 0.00000 9.09000
##
## $word_freq_original
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.04465 0.00000 1.75000
##
## $word_freq_project
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09159 0.00000 16.66000
##
## $word_freq_re
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3385 0.1350 21.4200
##
## $word_freq_edu
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1898 0.0000 22.0500
##
## $word_freq_table
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.006485 0.000000 2.170000
##
## $word_freq_conference
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02653 0.00000 10.00000
##
## $char_freq_.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03068 0.00000 3.83800
##
## $char_freq_..1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0670 0.1450 0.1845 9.7520
##
## $char_freq_..2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02011 0.00000 4.08100
##
## $char_freq_..3
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2579 0.2660 19.1300
##
## $char_freq_..4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.07621 0.04650 3.30500
##
## $char_freq_..5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0368 0.0000 7.4070
##
## $capital_run_length_average
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.558 2.309 5.406 3.704 1022.000
##
## $capital_run_length_longest
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.50 15.00 60.22 41.50 9989.00
##
## $capital_run_length_total
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 37.0 94.0 275.9 260.0 10060.0
##
## $status
## 0 1
## 557 362
Create Model - Random Forest (Default)
## set seed
set.seed(707)
# mtry: square root of the number of predictors (every column except status).
# It is rounded explicitly so the value stored here is the whole number that
# the fitted model actually reports as "variables tried at each split".
myMtry <- round(sqrt(ncol(dfrTrnData) - 1))
myNtrees <- 500
# start time
vctProcStrt <- proc.time()
# random forest with status as the response and all other columns as predictors
mdlRndForDef <- randomForest(status ~ ., data = dfrTrnData,
                             mtry = myMtry, ntree = myNtrees)
# end time
vctProcEnds <- proc.time()
# proc.time() returns several timings; the first element (user CPU time of
# this R process) is the one reported below
# print
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 12.83"
View Model - Default Random Forest
# Print the fitted forest: call, forest type, tree count, mtry used,
# OOB error estimate and confusion matrix
mdlRndForDef
##
## Call:
## randomForest(formula = status ~ ., data = dfrTrnData, mtry = myMtry, ntree = myNtrees)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 4.89%
## Confusion matrix:
## 0 1 class.error
## 0 2161 70 0.03137606
## 1 110 1341 0.07580979
Observation
The default model has 4.89% error
View Model Summary - Default Random Forest
# summary() on the fitted object lists its components (length, class, mode),
# not fit statistics
summary(mdlRndForDef)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 3682 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 7364 matrix numeric
## oob.times 3682 -none- numeric
## classes 2 -none- character
## importance 57 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 3682 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
Create Model - Random Forest (RFM)
# Fit a random forest through caret::train() with 10-fold cross-validation
# and time the fit.
set.seed(707)
# start time
vctProcStrt <- proc.time()
# Plain k-fold CV. `repeats` only applies to method = "repeatedcv", so it is
# not passed here (it was ignored with method = "cv").
myControl <- trainControl(method = "cv", number = 10)
myMetric <- "Accuracy"
# Single mtry candidate, so no tuning takes place.
# NOTE(review): the value is fractional; randomForest appears to round it
# when fitting.
myMtry <- sqrt(ncol(dfrTrnData) - 1)
myNtrees <- 500
myTuneGrid <- expand.grid(.mtry = myMtry)
mdlRndForRfm <- train(status ~ ., data = dfrTrnData, method = "rf",
                      verbose = FALSE, metric = myMetric,
                      trControl = myControl,
                      tuneGrid = myTuneGrid, ntree = myNtrees)
# end time
vctProcEnds <- proc.time()
# print user CPU time (element 1 of proc.time()) spent on the fit
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 120.08"
View Model - Random Forest (RFM)
# Print the caret fit: sample/predictor counts, resampling scheme and the
# cross-validated Accuracy and Kappa
mdlRndForRfm
## Random Forest
##
## 3682 samples
## 57 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3313, 3314, 3314, 3314, 3314, 3314, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9513837 0.8976903
##
## Tuning parameter 'mtry' was held constant at a value of 7.549834
Observation
The random forest model with 10-fold cross-validation has 4.86% error (accuracy 0.9514)
Create Model - Gradient Boosting (GBM)
# Fit a stochastic gradient boosting model (gbm) through caret::train() with
# 10-fold cross-validation repeated 3 times, and time the fit.
set.seed(707)
# start time
vctProcStrt <- proc.time()
# repeated k-fold CV: 10 folds, 3 repeats
myControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
myMetric <- "Accuracy"
# No tuneGrid is supplied, so caret's default grid for gbm is used.
# mtry and ntree are random forest parameters and do not apply here.
mdlRndForGbm <- train(status ~ ., data = dfrTrnData, method = "gbm",
                      verbose = FALSE, metric = myMetric,
                      trControl = myControl)
# end time
vctProcEnds <- proc.time()
# print user CPU time (element 1 of proc.time()) spent on the fit
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 127.28"
View Model - Gradient Boosting (GBM)
# Print the caret fit: resampled Accuracy and Kappa for each tuning
# combination, and the final values selected
mdlRndForGbm
## Stochastic Gradient Boosting
##
## 3682 samples
## 57 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 3313, 3314, 3314, 3314, 3314, 3314, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.9166200 0.8217745
## 1 100 0.9317404 0.8553362
## 1 150 0.9391638 0.8713401
## 2 50 0.9316474 0.8552624
## 2 100 0.9409724 0.8753850
## 2 150 0.9464058 0.8869276
## 3 50 0.9370819 0.8670327
## 3 100 0.9442331 0.8823408
## 3 150 0.9472207 0.8888336
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
Observation
The gradient boosting (GBM) model with 10-fold cross-validation repeated 3 times has 5.3% error
Create Model - Random Forest (OOB)
# Fit a random forest through caret::train(), using the out-of-bag estimate
# instead of resampling, and time the fit.
set.seed(707)
# start time
vctProcStrt <- proc.time()
# OOB evaluation does no resampling, so `number` and `repeats` are not passed.
myControl <- trainControl(method = "oob")
myMetric <- "Accuracy"
# Single mtry candidate, so no tuning takes place.
# NOTE(review): the value is fractional; randomForest appears to round it
# when fitting.
myMtry <- sqrt(ncol(dfrTrnData) - 1)
myNtrees <- 500
myTuneGrid <- expand.grid(.mtry = myMtry)
# method = "rf" is train()'s default; stated explicitly for clarity
mdlRndForOob <- train(status ~ ., data = dfrTrnData, method = "rf",
                      verbose = FALSE, metric = myMetric,
                      trControl = myControl,
                      tuneGrid = myTuneGrid, ntree = myNtrees)
# end time
vctProcEnds <- proc.time()
# print user CPU time (element 1 of proc.time()) spent on the fit
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 24.56"
View Model - Random Forest (OOB)
# Print the caret fit: sample/predictor counts and the Accuracy and Kappa
# it reports
mdlRndForOob
## Random Forest
##
## 3682 samples
## 57 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling results:
##
## Accuracy Kappa
## 0.9508419 0.8964783
##
## Tuning parameter 'mtry' was held constant at a value of 7.549834
Observation
The random forest(OOB) model has 4.92% error
Create Model - Random Forest (Mod1)
# Fit a random forest with a smaller, hand-picked mtry and time the fit.
# Fix the RNG seed so the fit is reproducible.
set.seed(707)
# mtry must be a whole number of variables. The previous value of 6.5 was
# rounded to 6 by randomForest (see "No. of variables tried at each split"),
# so 6 is stated directly.
myMtry <- 6
myNtrees <- 500
# start time
vctProcStrt <- proc.time()
# random forest: status as response, every other column as predictor
mdlRndForMod1 <- randomForest(status ~ ., data = dfrTrnData,
                              mtry = myMtry, ntree = myNtrees)
# end time
vctProcEnds <- proc.time()
# proc.time() reports user, system and elapsed times;
# element 1 (used below) is the user CPU time
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 11.97"
View Model - Mod1 Random Forest
# Print the fitted forest: call, forest type, tree count, mtry used,
# OOB error estimate and confusion matrix
mdlRndForMod1
##
## Call:
## randomForest(formula = status ~ ., data = dfrTrnData, mtry = myMtry, ntree = myNtrees)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 4.48%
## Confusion matrix:
## 0 1 class.error
## 0 2174 57 0.02554908
## 1 108 1343 0.07443143
Observation
The random forest (Mod1) model has a 4.48% OOB error, the lowest of the models tried; hence we will use this model for classification and prediction.
Prediction - Test Data - Random Forest (Mod1)
# Score the held-out test set with the Mod1 forest (the variable names still
# carry the "Def" suffix).
vctRndForDef <- predict(object = mdlRndForMod1, newdata = dfrTstData)
# Cross-tabulate predicted against actual status and derive the statistics.
cmxRndForDef <- confusionMatrix(data = vctRndForDef,
                                reference = dfrTstData$status)
print(cmxRndForDef)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 539 29
## 1 18 333
##
## Accuracy : 0.9489
## 95% CI : (0.9326, 0.9622)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8923
## Mcnemar's Test P-Value : 0.1447
##
## Sensitivity : 0.9677
## Specificity : 0.9199
## Pos Pred Value : 0.9489
## Neg Pred Value : 0.9487
## Prevalence : 0.6061
## Detection Rate : 0.5865
## Detection Prevalence : 0.6181
## Balanced Accuracy : 0.9438
##
## 'Positive' Class : 0
##
Observation
The prediction was done with 94.89% accuracy.
End Of Report