This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load packages ----
# Make sure pacman is available first; it is then used to load every other
# package this analysis relies on.
if (!require("pacman")) {
  install.packages("pacman")
}
## Loading required package: pacman
pacman::p_load(
  tidyverse, reshape, gplots, ggmap, MASS,
  mlbench, data.table, leaps, pivottabler, forecast, dplyr, caret
)
# Read data ----
# Read the raw file from the working directory, convert the result to a plain
# data.frame, and inspect its structure.
Spam_Data <- data.table::fread("spambase.data")
Spam_DF <- data.table::setDF(Spam_Data)
utils::str(Spam_DF)
## 'data.frame': 4601 obs. of 58 variables:
## $ V1 : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ V2 : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ V3 : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ V4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ V5 : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ V6 : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ V7 : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ V8 : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ V9 : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ V10: num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ V11: num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ V12: num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ V13: num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ V14: num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ V15: num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ V16: num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ V17: num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ V18: num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ V19: num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ V20: num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ V21: num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ V22: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V23: num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ V24: num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ V25: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V26: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V27: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V28: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V29: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V30: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V31: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V32: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V33: num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ V34: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V35: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V36: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V37: num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ V38: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V39: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V40: num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ V41: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V42: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V43: num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ V44: num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ V45: num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ V46: num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ V47: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V48: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V49: num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ V50: num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ V51: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V52: num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ V53: num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ V54: num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ V55: num 3.76 5.11 9.82 3.54 3.54 ...
## $ V56: int 61 101 485 40 40 15 4 11 445 43 ...
## $ V57: int 278 1028 2259 191 191 54 112 49 1257 749 ...
## $ V58: int 1 1 1 1 1 1 1 1 1 1 ...
# Confirm that the converted object is a plain data.frame.
class(Spam_DF)
## [1] "data.frame"
# Examine how each predictor differs between spam and non-spam email by
# comparing class averages.
library(dplyr)
# One row per class (V58), one column per predictor holding that class's mean.
# across() replaces the deprecated summarise_all(funs(mean)) form, so the
# funs() deprecation warning is no longer emitted.
Pivot <- Spam_DF %>%
  group_by(V58) %>%
  summarise(across(everything(), mean))
head(Pivot)
# Identify the 10 variables with the highest difference between class averages
# Transpose the class means so that each row is one variable. t() returns a
# matrix without column names, which data.frame() labels X1 and X2 (one per
# class row of Pivot).
Var_Table <- data.frame(r1 = names(Pivot), t(Pivot))
# Absolute gap between the two class means, rounded to 4 decimals.
Var_Table$Difference <- round(abs(Var_Table$X1 - Var_Table$X2), 4)
view(Var_Table)
# Drop the first row: it holds the grouping column V58, not a predictor.
Final_Table <- Var_Table[-1, ]
# Show only the ten predictors with the largest gap (Final_Table itself is
# left unsorted).
head(Final_Table[order(-Final_Table$Difference), ], 10)
class(Final_Table)
## [1] "data.frame"
# Use only the 10 predictors with the highest difference, plus the class label
# (V58, kept as the last column); the data are partitioned and normalized below.
Working_Table <- Spam_DF[
  ,
  c(57, 56, 55, 27, 19, 21, 25, 16, 26, 52, 58)
]
view(Working_Table)
# Store the class label as character instead of integer.
Working_Data <- Working_Table
Working_Data$V58 <- as.character(Working_Data$V58)
str(Working_Data)
## 'data.frame': 4601 obs. of 11 variables:
## $ V57: int 278 1028 2259 191 191 54 112 49 1257 749 ...
## $ V56: int 61 101 485 40 40 15 4 11 445 43 ...
## $ V55: num 3.76 5.11 9.82 3.54 3.54 ...
## $ V27: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V19: num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ V21: num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ V25: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V16: num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ V26: num 0 0 0 0 0 0 0 0 0 0 ...
## $ V52: num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ V58: chr "1" "1" "1" "1" ...
# Fixed seed so the random split is reproducible.
set.seed(30)
# Draw 80% of the row names for training; the remaining rows form the
# validation set.
training.index <- sample(row.names(Working_Data), 0.8 * nrow(Working_Data))
valid.index <- setdiff(row.names(Working_Data), training.index)
train.df <- Working_Data[training.index, ]
valid.df <- Working_Data[valid.index, ]
# Centering and scaling parameters are estimated on the training rows only and
# then applied to both partitions.
norm.values <- caret::preProcess(train.df, method = c("center", "scale"))
spambase.train.norm <- predict(norm.values, newdata = train.df)
spambase.valid.norm <- predict(norm.values, newdata = valid.df)
# Fit a linear discriminant model on the normalized training data, using every
# remaining column as a predictor of the class label V58, and print the fit.
lda2 <- lda(V58 ~ ., data = spambase.train.norm)
print(lda2)
## Call:
## lda(V58 ~ ., data = spambase.train.norm)
##
## Prior probabilities of groups:
## 0 1
## 0.6119565 0.3880435
##
## Group means:
## V57 V56 V55 V27 V19 V21 V25
## 0 -0.2090370 -0.1617526 -0.09068574 0.1428907 -0.2230743 -0.2982154 0.1992864
## 1 0.3296577 0.2550888 0.14301421 -0.2253430 0.3517950 0.4702949 -0.3142808
## V16 V26 V52
## 0 -0.2202027 0.1798576 -0.2461680
## 1 0.3472665 -0.2836409 0.3882145
##
## Coefficients of linear discriminants:
## LD1
## V57 0.41495419
## V56 0.06008164
## V55 0.07093636
## V27 -0.18894208
## V19 0.21561656
## V21 0.51801297
## V25 -0.22560523
## V16 0.40532249
## V26 -0.15133245
## V52 0.41660424
# Score both partitions with the model fitted on the training data, then plot
# the fitted model.
pred2.train <- predict(lda2, newdata = spambase.train.norm)
pred2.valid <- predict(lda2, newdata = spambase.valid.norm)
plot(lda2)
# What are the prior probabilities?
# (stored on the fitted model as the "prior" component)
lda2[["prior"]]
## 0 1
## 0.6119565 0.3880435
/* As per the dataset description, spam and non-spam are coded as 1 and 0, respectively. The prior probabilities are the class proportions in the training data used to fit the model (not the entire data set): about 0.6120 for non-spam and 0.3880 for spam. Non-spam cases are therefore more common than spam cases. */
# Coefficients of the linear discriminant (the "scaling" component of the fit).
lda2[["scaling"]]
## LD1
## V57 0.41495419
## V56 0.06008164
## V55 0.07093636
## V27 -0.18894208
## V19 0.21561656
## V21 0.51801297
## V25 -0.22560523
## V16 0.40532249
## V26 -0.15133245
## V52 0.41660424
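/* The coefficients of linear discriminants are the weights that combine the normalized predictors into the discriminant score LD1. V55, V56 and V57 measure the length of sequences of consecutive capital letters (average, longest and total run length). Per the Spambase attribute list, V27, V19, V21, V25, V16 and V26 are the frequencies of the words "george", "you", "your", "hp", "free" and "hpl", and V52 is the frequency of the character "!". */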
# Show the discriminant coefficients again; their signs are interpreted below.
lda2[["scaling"]]
## LD1
## V57 0.41495419
## V56 0.06008164
## V55 0.07093636
## V27 -0.18894208
## V19 0.21561656
## V21 0.51801297
## V25 -0.22560523
## V16 0.40532249
## V26 -0.15133245
## V52 0.41660424
/* We can infer from the coefficients above that higher values of V27, V25 and V26 (negative coefficients) push an email towards the non-spam class, whereas higher values of the other predictors (V57, V56, V55, V19, V21, V16, V52; positive coefficients) push it towards the spam class. Emails with negative discriminant scores tend to be classified as non-spam, and emails with positive scores tend to be classified as spam. */
# Show the coefficient matrix once more: it has a single column, LD1.
lda2[["scaling"]]
## LD1
## V57 0.41495419
## V56 0.06008164
## V55 0.07093636
## V27 -0.18894208
## V19 0.21561656
## V21 0.51801297
## V25 -0.22560523
## V16 0.40532249
## V26 -0.15133245
## V52 0.41660424
/* There is only one linear discriminant in the model, LD1, because there are only two classes (spam and non-spam): LDA yields at most (number of classes - 1) discriminants. */
# Plot the model that was fitted on the training data.
plot(lda2, col = "blue", main = "Training DataSet")
# Fit a second discriminant model directly on the normalized validation
# partition and print it for comparison.
lda3 <- lda(V58 ~ ., data = spambase.valid.norm)
print(lda3)
## Call:
## lda(V58 ~ ., data = spambase.valid.norm)
##
## Prior probabilities of groups:
## 0 1
## 0.5819761 0.4180239
##
## Group means:
## V57 V56 V55 V27 V19 V21 V25
## 0 -0.2224732 -0.1651376 -0.09253522 0.1969354 -0.1509265 -0.3531615 0.2618924
## 1 0.3901886 0.2718384 0.25601141 -0.2253298 0.3513156 0.4499671 -0.3209704
## V16 V26 V52
## 0 -0.1905350 0.1985788 -0.1250177
## 1 0.3246399 -0.2914700 0.3056053
##
## Coefficients of linear discriminants:
## LD1
## V57 0.17671419
## V56 0.60509817
## V55 -0.09161010
## V27 -0.25420448
## V19 0.21821258
## V21 0.63930103
## V25 -0.24553402
## V16 0.29732238
## V26 -0.22522073
## V52 0.05329203
# Plot the model that was fitted on the validation partition.
plot(lda3)
# Cross-tabulate the validation predictions from lda2 (rows) against the actual
# class labels (columns), then summarise the table with caret.
CMat <- table(
  pred2.valid[["class"]],
  spambase.valid.norm[["V58"]]
)
caret::confusionMatrix(CMat)
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 497 128
## 1 39 257
##
## Accuracy : 0.8187
## 95% CI : (0.7922, 0.8431)
## No Information Rate : 0.582
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6148
##
## Mcnemar's Test P-Value : 9.784e-12
##
## Sensitivity : 0.9272
## Specificity : 0.6675
## Pos Pred Value : 0.7952
## Neg Pred Value : 0.8682
## Prevalence : 0.5820
## Detection Rate : 0.5396
## Detection Prevalence : 0.6786
## Balanced Accuracy : 0.7974
##
## 'Positive' Class : 0
##
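/* Sensitivity is 92.72% and specificity is 66.75%. Note that the 'positive' class here is 0 (non-spam), so sensitivity is the share of non-spam emails correctly identified, and specificity is the share of spam emails correctly identified. */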