Load Libs
library(plyr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(knitr)
library(RColorBrewer)
library(gridBase)
library(caret)
library(randomForest)
library(gbm)
library(corrgram)
library(gplots)
library(grid)
library(RColorBrewer)
library(gridBase)
library(ROCR)
library(foreign)
# Palette generator built by interpolating 10 colours of the ColorBrewer
# "Set3" palette. It is called later in this script as CUSTOM_COLORS_PLOT(n)
# to obtain n fill colours for the bar and pie charts.
CUSTOM_COLORS_PLOT <- colorRampPalette(brewer.pal(10, "Set3"))
Functions
# Count the missing (NA) entries in `inp`.
#
# inp: a vector (for example one data-frame column handed in by lapply()).
# Returns a single integer: the number of elements of `inp` that are NA.
detectNA <- function(inp) {
  missing_idx <- which(is.na(inp))
  length(missing_idx)
}
# Spearman rank correlation between one column and a target column.
#
# x:      name or index of a single column of `data`.
# data:   data frame to read from. Defaults to the global `dfrDataset`, which
#         the original implementation was hard-wired to. That object is not
#         created anywhere in the visible script, so pass `data` explicitly
#         (e.g. `spam.data`) unless it is defined elsewhere.
# target: name of the column to correlate against (default "quality").
#
# Both columns are coerced with as.numeric() before the correlation is taken.
# Returns a single numeric value (Spearman's rho).
detectCor <- function(x, data = dfrDataset, target = "quality") {
  # Fail with a clear message instead of an obscure cor() error.
  if (!(target %in% names(data))) {
    stop("target column '", target, "' not found in data", call. = FALSE)
  }
  cor(as.numeric(data[[x]]),
      as.numeric(data[[target]]),
      method = "spearman")
}
**Loading Data and Naming Columns**
# Download the Spambase data. The file has no header row, so the columns
# are read without names (header = FALSE).
spam.data <- read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data", header = FALSE)
# The column names come from a separate semicolon-delimited file; the first
# field of each row is used as a column name. (The variable is called `names`
# like the base function; R still resolves `names(...) <-` to the function.)
names <- read.csv("http://thinktostart.com/data/names.csv", header = FALSE, sep = ";")
# Stop early if the names file does not describe exactly one name per column,
# rather than silently producing NA or misaligned column names.
stopifnot(nrow(names) == ncol(spam.data))
names(spam.data) <- as.character(names[, 1])
Observation: The data frame we just created does not have useful column names, because none are defined in the CSV file, so we have to rename the columns properly.
**Exploratory analysis**
head(spam.data)
## word_freq_make word_freq_address word_freq_all word_freq_3d
## 1 0.00 0.64 0.64 0
## 2 0.21 0.28 0.50 0
## 3 0.06 0.00 0.71 0
## 4 0.00 0.00 0.00 0
## 5 0.00 0.00 0.00 0
## 6 0.00 0.00 0.00 0
## word_freq_our word_freq_over word_freq_remove word_freq_internet
## 1 0.32 0.00 0.00 0.00
## 2 0.14 0.28 0.21 0.07
## 3 1.23 0.19 0.19 0.12
## 4 0.63 0.00 0.31 0.63
## 5 0.63 0.00 0.31 0.63
## 6 1.85 0.00 0.00 1.85
## word_freq_order word_freq_mail word_freq_receive word_freq_will
## 1 0.00 0.00 0.00 0.64
## 2 0.00 0.94 0.21 0.79
## 3 0.64 0.25 0.38 0.45
## 4 0.31 0.63 0.31 0.31
## 5 0.31 0.63 0.31 0.31
## 6 0.00 0.00 0.00 0.00
## word_freq_people word_freq_report word_freq_addresses word_freq_free
## 1 0.00 0.00 0.00 0.32
## 2 0.65 0.21 0.14 0.14
## 3 0.12 0.00 1.75 0.06
## 4 0.31 0.00 0.00 0.31
## 5 0.31 0.00 0.00 0.31
## 6 0.00 0.00 0.00 0.00
## word_freq_business word_freq_email word_freq_you word_freq_credit
## 1 0.00 1.29 1.93 0.00
## 2 0.07 0.28 3.47 0.00
## 3 0.06 1.03 1.36 0.32
## 4 0.00 0.00 3.18 0.00
## 5 0.00 0.00 3.18 0.00
## 6 0.00 0.00 0.00 0.00
## word_freq_your word_freq_font word_freq_000 word_freq_money word_freq_hp
## 1 0.96 0 0.00 0.00 0
## 2 1.59 0 0.43 0.43 0
## 3 0.51 0 1.16 0.06 0
## 4 0.31 0 0.00 0.00 0
## 5 0.31 0 0.00 0.00 0
## 6 0.00 0 0.00 0.00 0
## word_freq_hpl word_freq_george word_freq_650 word_freq_lab
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## word_freq_labs word_freq_telnet word_freq_857 word_freq_data
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## word_freq_415 word_freq_85 word_freq_technology word_freq_1999
## 1 0 0 0 0.00
## 2 0 0 0 0.07
## 3 0 0 0 0.00
## 4 0 0 0 0.00
## 5 0 0 0 0.00
## 6 0 0 0 0.00
## word_freq_parts word_freq_pm word_freq_direct word_freq_cs
## 1 0 0 0.00 0
## 2 0 0 0.00 0
## 3 0 0 0.06 0
## 4 0 0 0.00 0
## 5 0 0 0.00 0
## 6 0 0 0.00 0
## word_freq_meeting word_freq_original word_freq_project word_freq_re
## 1 0 0.00 0 0.00
## 2 0 0.00 0 0.00
## 3 0 0.12 0 0.06
## 4 0 0.00 0 0.00
## 5 0 0.00 0 0.00
## 6 0 0.00 0 0.00
## word_freq_edu word_freq_table word_freq_conference char_freq_;
## 1 0.00 0 0 0.00
## 2 0.00 0 0 0.00
## 3 0.06 0 0 0.01
## 4 0.00 0 0 0.00
## 5 0.00 0 0 0.00
## 6 0.00 0 0 0.00
## char_freq_( char_freq_[ char_freq_! char_freq_$ char_freq_#
## 1 0.000 0 0.778 0.000 0.000
## 2 0.132 0 0.372 0.180 0.048
## 3 0.143 0 0.276 0.184 0.010
## 4 0.137 0 0.137 0.000 0.000
## 5 0.135 0 0.135 0.000 0.000
## 6 0.223 0 0.000 0.000 0.000
## capital_run_length_average capital_run_length_longest
## 1 3.756 61
## 2 5.114 101
## 3 9.821 485
## 4 3.537 40
## 5 3.537 40
## 6 3.000 15
## capital_run_length_total y
## 1 278 1
## 2 1028 1
## 3 2259 1
## 4 191 1
## 5 191 1
## 6 54 1
dim(spam.data)
## [1] 4601 58
str(spam.data)
## 'data.frame': 4601 obs. of 58 variables:
## $ word_freq_make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ word_freq_address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ word_freq_all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ word_freq_3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ word_freq_over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ word_freq_remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ word_freq_internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ word_freq_order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ word_freq_mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ word_freq_receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ word_freq_will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ word_freq_people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ word_freq_report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ word_freq_addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ word_freq_free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ word_freq_business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ word_freq_you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ word_freq_credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ word_freq_your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ word_freq_font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ word_freq_money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ word_freq_hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ word_freq_415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ word_freq_parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ word_freq_project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ word_freq_re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ word_freq_table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ word_freq_conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ char_freq_; : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ char_freq_( : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ char_freq_[ : num 0 0 0 0 0 0 0 0 0 0 ...
## $ char_freq_! : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ char_freq_$ : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ char_freq_# : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capital_run_length_average: num 3.76 5.11 9.82 3.54 3.54 ...
## $ capital_run_length_longest: int 61 101 485 40 40 15 4 11 445 43 ...
## $ capital_run_length_total : int 278 1028 2259 191 191 54 112 49 1257 749 ...
## $ y : int 1 1 1 1 1 1 1 1 1 1 ...
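Observation: The first 54 columns hold frequencies of appearance in a mail: 48 word frequencies (word_freq_*) followed by 6 character frequencies (char_freq_*). The next three columns describe the lengths of runs of capital letters (average, longest, total), and the last column, y, is the class label.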
Dataframe Summary
summary(spam.data$capital_run_length_total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 35.0 95.0 283.3 266.0 15841.0
summary(spam.data$capital_run_length_longest)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 15.00 52.17 43.00 9989.00
summary(spam.data$capital_run_length_average)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.588 2.276 5.191 3.706 1102.500
Observation: There was no point in viewing the summary of the whole dataset; only the summary of the important variables is shown.
Missing Data
lapply(spam.data, FUN=detectNA)
## $word_freq_make
## [1] 0
##
## $word_freq_address
## [1] 0
##
## $word_freq_all
## [1] 0
##
## $word_freq_3d
## [1] 0
##
## $word_freq_our
## [1] 0
##
## $word_freq_over
## [1] 0
##
## $word_freq_remove
## [1] 0
##
## $word_freq_internet
## [1] 0
##
## $word_freq_order
## [1] 0
##
## $word_freq_mail
## [1] 0
##
## $word_freq_receive
## [1] 0
##
## $word_freq_will
## [1] 0
##
## $word_freq_people
## [1] 0
##
## $word_freq_report
## [1] 0
##
## $word_freq_addresses
## [1] 0
##
## $word_freq_free
## [1] 0
##
## $word_freq_business
## [1] 0
##
## $word_freq_email
## [1] 0
##
## $word_freq_you
## [1] 0
##
## $word_freq_credit
## [1] 0
##
## $word_freq_your
## [1] 0
##
## $word_freq_font
## [1] 0
##
## $word_freq_000
## [1] 0
##
## $word_freq_money
## [1] 0
##
## $word_freq_hp
## [1] 0
##
## $word_freq_hpl
## [1] 0
##
## $word_freq_george
## [1] 0
##
## $word_freq_650
## [1] 0
##
## $word_freq_lab
## [1] 0
##
## $word_freq_labs
## [1] 0
##
## $word_freq_telnet
## [1] 0
##
## $word_freq_857
## [1] 0
##
## $word_freq_data
## [1] 0
##
## $word_freq_415
## [1] 0
##
## $word_freq_85
## [1] 0
##
## $word_freq_technology
## [1] 0
##
## $word_freq_1999
## [1] 0
##
## $word_freq_parts
## [1] 0
##
## $word_freq_pm
## [1] 0
##
## $word_freq_direct
## [1] 0
##
## $word_freq_cs
## [1] 0
##
## $word_freq_meeting
## [1] 0
##
## $word_freq_original
## [1] 0
##
## $word_freq_project
## [1] 0
##
## $word_freq_re
## [1] 0
##
## $word_freq_edu
## [1] 0
##
## $word_freq_table
## [1] 0
##
## $word_freq_conference
## [1] 0
##
## $`char_freq_;`
## [1] 0
##
## $`char_freq_(`
## [1] 0
##
## $`char_freq_[`
## [1] 0
##
## $`char_freq_!`
## [1] 0
##
## $`char_freq_$`
## [1] 0
##
## $`char_freq_#`
## [1] 0
##
## $capital_run_length_average
## [1] 0
##
## $capital_run_length_longest
## [1] 0
##
## $capital_run_length_total
## [1] 0
##
## $y
## [1] 0
Observation: There is no missing data in the data frame.
Let's see the number of spam mails in the dataset
sum(spam.data$y)
## [1] 1813
print(paste0("Percentage: ", round((sum(spam.data$y)/nrow(spam.data)) * 100, 2), "%"))
## [1] "Percentage: 39.4%"
Converting numeric to factor
spam.data$y <- as.factor(spam.data$y)
Visualization
# Class balance of the label y, drawn as a bar chart and a pie chart side by
# side.
resTable <- table(spam.data$y)
# One row, two panels. The margins are set to c(5, 4, 4, 2) + 0.1, i.e. R's
# default values, so no margin is actually enlarged here.
par(mfrow = c(1, 2), mar = c(5, 4, 4, 2) + 0.1)
# The value returned by plot() is used as the x positions of the count labels
# drawn by text(); it is stored under its own name so it does not shadow the
# plot() function.
barMids <- plot(spam.data$y, col = CUSTOM_COLORS_PLOT(2), main = "Email vs. Spam",
                ylim = c(0, 4000), ylab = "Examples Number")
# Print each class count 200 units above the top of its bar.
text(x = barMids, y = resTable + 200, labels = resTable)
percentage <- round(resTable / sum(resTable) * 100)
# Pie labels of the form "<level> <percent>%".
labels <- paste0(row.names(resTable), " ", percentage, "%")
pie(resTable, labels = labels, col = CUSTOM_COLORS_PLOT(2), main = "Email vs. Spam")
# Per-column mean rounded to 2 decimals; NA for any non-numeric column.
#
# rows: data frame holding the rows of one class.
# Returns a named numeric vector with one entry per column of `rows`.
classFeatureMeans <- function(rows) {
  vapply(rows,
         function(x) if (is.numeric(x)) round(mean(x), 2) else NA_real_,
         numeric(1))
}

# Bar chart of a named numeric vector with rotated labels under the bars.
#
# freqs: named numeric vector, one bar per element.
# title: main title of the chart.
# The axis names are suppressed in barplot() (names.arg = "") and drawn
# instead with grid.text() at the bar midpoints, rotated by 50 degrees.
# Returns the bar midpoints invisibly.
plotTopFrequencies <- function(freqs, title) {
  barMids <- barplot(freqs, col = CUSTOM_COLORS_PLOT(length(freqs)), main = title,
                     names.arg = "", ylab = "Percentage Relative (%)")
  # Map the base-graphics plot region onto grid viewports so that grid.text()
  # can place labels in the bar chart's native x coordinates.
  vps <- baseViewports()
  pushViewport(vps$inner, vps$figure, vps$plot)
  grid.text(names(freqs), x = unit(barMids, "native"), y = unit(-1, "lines"),
            just = "right", rot = 50)
  popViewport(3)
  invisible(barMids)
}

# Columns 1:54 are the word_freq_* and char_freq_* features.
dataset.email <- classFeatureMeans(spam.data[which(spam.data$y == "0"), 1:54])
dataset.spam <- classFeatureMeans(spam.data[which(spam.data$y == "1"), 1:54])
# Keep the 12 features with the highest mean in each class.
dataset.email.order <- dataset.email[order(-dataset.email)[1:12]]
dataset.spam.order <- dataset.spam[order(-dataset.spam)[1:12]]
par(mfrow = c(1, 2))
# Larger bottom margin to leave room for the rotated labels.
par(mar = c(8, 4, 4, 2) + 0.5)
plotTopFrequencies(dataset.email.order, "Email: Average Percentage")
plotTopFrequencies(dataset.spam.order, "Spam: Average Percentage")
Observations: Frequency characteristics of spam and non-spam mail are plotted. The plots show the average relative percentage of occurrence of words, in decreasing order. It can be interpreted that words such as “you”, “your”, “free”, “our” and so on tend to occur in spam mails.
*Let's create the train and test datasets*
Dataset Split
# Fix the RNG seed so the partition is reproducible.
set.seed(1234)
# Row indices of the training partition: 70% of the rows, drawn with respect
# to the class label y; returned as a matrix rather than a list.
vctTrnRecs <- createDataPartition(spam.data$y, p = 0.7, list = FALSE)
# Selected rows form the training set, all remaining rows the test set.
TrnData <- spam.data[vctTrnRecs, ]
TstData <- spam.data[-vctTrnRecs, ]
# Class counts in the training set.
table(TrnData$y)
##
## 0 1
## 1952 1270
Observation: The data is split between train and test datasets. 70% of the data will be used for training.
Training Dataset
dim(TrnData)
## [1] 3222 58
Testing Dataset RowCount & ColCount
dim(TstData)
## [1] 1379 58
Create Model - Random Forest (RFM)
# Fix the RNG seed so resampling and the forest are reproducible.
set.seed(707)
# Start of timing.
vctProcStrt <- proc.time()
# Random forest evaluated with 10-fold cross-validation repeated 3 times.
myControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
myMetric <- "Accuracy"
# mtry must be a whole number of predictors. TrnData has one label column (y),
# so the predictor count is ncol(TrnData) - 1; use floor(sqrt(p)) of that.
# The previous expression, sqrt(ncol(TrnData) - 2), produced the fractional
# value 7.483315.
myMtry <- floor(sqrt(ncol(TrnData) - 1))
myNtrees <- 100
# Single-row grid: mtry is held constant rather than tuned.
myTuneGrid <- expand.grid(.mtry = myMtry)
mdlRndForRfm <- train(y ~ ., data = TrnData, method = "rf",
                      verbose = FALSE, metric = myMetric, trControl = myControl,
                      tuneGrid = myTuneGrid, ntree = myNtrees)
# End of timing.
vctProcEnds <- proc.time()
# Element 1 of proc.time() is the user CPU time, so the number printed is
# CPU seconds spent in this process, not wall-clock time.
print(paste("Model Created ...", vctProcEnds[1] - vctProcStrt[1]))
## [1] "Model Created ... 85.23"
View Model - Random Forest (RFM)
mdlRndForRfm
## Random Forest
##
## 3222 samples
## 57 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2900, 2900, 2900, 2900, 2899, 2900, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9515759 0.8978342
##
## Tuning parameter 'mtry' was held constant at a value of 7.483315
View Model Summary - Random Forest (RFM)
summary(mdlRndForRfm)
## Length Class Mode
## call 6 -none- call
## type 1 -none- character
## predicted 3222 factor numeric
## err.rate 300 -none- numeric
## confusion 6 -none- numeric
## votes 6444 matrix numeric
## oob.times 3222 -none- numeric
## classes 2 -none- character
## importance 57 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 3222 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 57 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## param 2 -none- list
Observation: The model is created.
Prediction - Test Data - Random Forest (RFM)
# Predict the class of every held-out test row with the trained model.
vctRndForRfm <- predict(mdlRndForRfm, newdata = TstData)
# Cross-tabulate predictions against the true labels of the test set.
# NOTE(review): no `positive` class is passed, and the printed result reports
# 'Positive' Class : 0, so sensitivity/specificity describe class "0" -
# confirm this is intended for a spam detector.
cmxRndForRfm <- confusionMatrix(data = vctRndForRfm, reference = TstData$y)
print(cmxRndForRfm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 796 40
## 1 40 503
##
## Accuracy : 0.942
## 95% CI : (0.9283, 0.9537)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8785
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9522
## Specificity : 0.9263
## Pos Pred Value : 0.9522
## Neg Pred Value : 0.9263
## Prevalence : 0.6062
## Detection Rate : 0.5772
## Detection Prevalence : 0.6062
## Balanced Accuracy : 0.9392
##
## 'Positive' Class : 0
##
Observations: Model results are demonstrated on the test dataset. The accuracy of the model was checked while changing the “myNtrees” parameter (there is a trade-off with processing time); with myNtrees = 100 the accuracy on the test set is 94.2%. Thanks.