R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#Load Packages

# Install pacman on first use. requireNamespace() only checks that the package
# is available (it does not attach it); pacman is then called via `pacman::`.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# Load every package used in this analysis through pacman.
pacman::p_load(
  tidyverse, reshape, gplots, ggmap, MASS,
  mlbench, data.table, leaps, pivottabler, forecast, dplyr, caret
)

#Read Data

# Read the raw spambase file with data.table's reader.
Spam_Data <- fread("spambase.data")
# Convert the result to a plain data.frame.
Spam_DF <- setDF(Spam_Data)
# Inspect the column types and the first few values of every column.
str(Spam_DF)
## 'data.frame':    4601 obs. of  58 variables:
##  $ V1 : num  0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
##  $ V2 : num  0.64 0.28 0 0 0 0 0 0 0 0.12 ...
##  $ V3 : num  0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
##  $ V4 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V5 : num  0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
##  $ V6 : num  0 0.28 0.19 0 0 0 0 0 0 0.32 ...
##  $ V7 : num  0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
##  $ V8 : num  0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
##  $ V9 : num  0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
##  $ V10: num  0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
##  $ V11: num  0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
##  $ V12: num  0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
##  $ V13: num  0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
##  $ V14: num  0 0.21 0 0 0 0 0 0 0 0 ...
##  $ V15: num  0 0.14 1.75 0 0 0 0 0 0 0.12 ...
##  $ V16: num  0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
##  $ V17: num  0 0.07 0.06 0 0 0 0 0 0 0 ...
##  $ V18: num  1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
##  $ V19: num  1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
##  $ V20: num  0 0 0.32 0 0 0 0 0 3.53 0.06 ...
##  $ V21: num  0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
##  $ V22: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V23: num  0 0.43 1.16 0 0 0 0 0 0 0.19 ...
##  $ V24: num  0 0.43 0.06 0 0 0 0 0 0.15 0 ...
##  $ V25: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V26: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V27: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V28: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V29: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V30: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V31: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V32: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V33: num  0 0 0 0 0 0 0 0 0.15 0 ...
##  $ V34: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V35: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V36: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V37: num  0 0.07 0 0 0 0 0 0 0 0 ...
##  $ V38: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V39: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V40: num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ V41: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V42: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V43: num  0 0 0.12 0 0 0 0 0 0.3 0 ...
##  $ V44: num  0 0 0 0 0 0 0 0 0 0.06 ...
##  $ V45: num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ V46: num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ V47: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V48: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V49: num  0 0 0.01 0 0 0 0 0 0 0.04 ...
##  $ V50: num  0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
##  $ V51: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V52: num  0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
##  $ V53: num  0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
##  $ V54: num  0 0.048 0.01 0 0 0 0 0 0.022 0 ...
##  $ V55: num  3.76 5.11 9.82 3.54 3.54 ...
##  $ V56: int  61 101 485 40 40 15 4 11 445 43 ...
##  $ V57: int  278 1028 2259 191 191 54 112 49 1257 749 ...
##  $ V58: int  1 1 1 1 1 1 1 1 1 1 ...
# Confirm that the object is a plain data.frame.
class(Spam_DF)
## [1] "data.frame"

#Examine how each predictor differs between spam & non-spam email by comparing class averages.

library(dplyr)
# Mean of every predictor within each class (V58: 0 = non-spam, 1 = spam).
# A bare function is passed instead of the deprecated funs() wrapper; the
# output columns keep their original names.
Pivot <- Spam_DF %>%
  group_by(V58) %>%
  summarise_all(mean)
head(Pivot)

#Identify the 10 variables with highest difference between class average

# Transpose the class-average table so that each row is one variable and the
# two class means sit side by side in columns X1 and X2.
Var_Table <- data.frame(r1 = names(Pivot), t(Pivot))
# Absolute gap between the two class means, rounded to 4 decimals.
Var_Table$Difference <- round(abs(Var_Table$X1 - Var_Table$X2), 4)
view(Var_Table)

# Drop the first row: it holds the grouping column V58, not a predictor.
Final_Table <- Var_Table[-1, ]
# Keep the ranking instead of only printing it, then show the 10 predictors
# with the largest gap between class means.
Ranked_Table <- Final_Table[order(-Final_Table$Difference), ]
head(Ranked_Table, 10)
class(Final_Table)
## [1] "data.frame"

#Using the dataset with only the 10 predictors with the highest difference - Data Partition - Normalize the data

# The ten selected predictors followed by the class label V58, chosen by
# column name.
keep_cols <- c(
  "V57", "V56", "V55", "V27", "V19", "V21", "V25", "V16", "V26", "V52",
  "V58"
)
Working_Table <- Spam_DF[, keep_cols]
view(Working_Table)
# Store the class label as character rather than integer.
Working_Data <- Working_Table
Working_Data$V58 <- as.character(Working_Data$V58)
str(Working_Data)
## 'data.frame':    4601 obs. of  11 variables:
##  $ V57: int  278 1028 2259 191 191 54 112 49 1257 749 ...
##  $ V56: int  61 101 485 40 40 15 4 11 445 43 ...
##  $ V55: num  3.76 5.11 9.82 3.54 3.54 ...
##  $ V27: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V19: num  1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
##  $ V21: num  0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
##  $ V25: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V16: num  0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
##  $ V26: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ V52: num  0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
##  $ V58: chr  "1" "1" "1" "1" ...
# Fix the RNG seed so the split is reproducible.
set.seed(30)
all_rows <- row.names(Working_Data)
n_train <- 0.8 * nrow(Working_Data)
# Draw 80% of the row names for training; the remaining rows form the
# validation set.
training.index <- sample(all_rows, n_train)
valid.index <- setdiff(all_rows, training.index)
train.df <- Working_Data[training.index, ]
valid.df <- Working_Data[valid.index, ]

# Estimate the centering/scaling parameters on the training rows only, then
# apply the same transformation to both partitions.
norm.values <- preProcess(train.df, method = c("center", "scale"))
spambase.train.norm <- predict(norm.values, train.df)
spambase.valid.norm <- predict(norm.values, valid.df)

# Run LDA & run prediction using the training data set & plot

# Fit a linear discriminant model of the class label V58 on all other columns
# of the normalized training data, then print the fitted model.
lda2 <- lda(V58~., data = spambase.train.norm)
lda2
## Call:
## lda(V58 ~ ., data = spambase.train.norm)
## 
## Prior probabilities of groups:
##         0         1 
## 0.6119565 0.3880435 
## 
## Group means:
##          V57        V56         V55        V27        V19        V21        V25
## 0 -0.2090370 -0.1617526 -0.09068574  0.1428907 -0.2230743 -0.2982154  0.1992864
## 1  0.3296577  0.2550888  0.14301421 -0.2253430  0.3517950  0.4702949 -0.3142808
##          V16        V26        V52
## 0 -0.2202027  0.1798576 -0.2461680
## 1  0.3472665 -0.2836409  0.3882145
## 
## Coefficients of linear discriminants:
##             LD1
## V57  0.41495419
## V56  0.06008164
## V55  0.07093636
## V27 -0.18894208
## V19  0.21561656
## V21  0.51801297
## V25 -0.22560523
## V16  0.40532249
## V26 -0.15133245
## V52  0.41660424
# Score both partitions with the model fitted on the training data.
pred2.train <- predict(lda2, newdata = spambase.train.norm)
pred2.valid <- predict(lda2, newdata = spambase.valid.norm)

# Plot the fitted model.
plot(lda2)

# What are the prior probabilities?

# Class proportions stored in the fitted model.
lda2[["prior"]]
##         0         1 
## 0.6119565 0.3880435

/* As per the dataset description, spam and non-spam are coded as 1 and 0 respectively. The prior probabilities give the proportion of each class in the training data on which the model was fitted. Here non-spam has a prior probability of 0.6119 and spam 0.3880, so non-spam cases outnumber spam cases. */

# What are the coefficients of linear discriminants? Explain.

# Coefficients (weights) of the linear discriminant for each predictor.
lda2[["scaling"]]
##             LD1
## V57  0.41495419
## V56  0.06008164
## V55  0.07093636
## V27 -0.18894208
## V19  0.21561656
## V21  0.51801297
## V25 -0.22560523
## V16  0.40532249
## V26 -0.15133245
## V52  0.41660424

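/* The coefficients of linear discriminants are the weights, listed above, that are applied to each normalized predictor to compute the LD1 score. V55, V56 and V57 measure the length of sequences of consecutive capital letters; V27, V19 and V21 are word-frequency predictors. */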

# Generate linear discriminants using your analysis. How are they used in classifying spams and non-spams?

# NOTE(review): this reprints the LD1 coefficients; the per-email discriminant
# scores themselves would presumably come from the predict() results
# (pred2.train / pred2.valid) — confirm which output is wanted here.
lda2[["scaling"]]
##             LD1
## V57  0.41495419
## V56  0.06008164
## V55  0.07093636
## V27 -0.18894208
## V19  0.21561656
## V21  0.51801297
## V25 -0.22560523
## V16  0.40532249
## V26 -0.15133245
## V52  0.41660424

/* We can infer from the above coefficients that the predictors with negative coefficients (V27, V25, V26) lower the LD1 score and therefore push an email toward the non-spam class, whereas the predictors with positive coefficients (V57, V56, V55, V19, V21, V16, V52) raise the score and push it toward the spam class. Emails with a negative discriminant score tend to be classified as non-spam, and emails with a positive score as spam. */

# How many linear discriminants are in the model? Why?

# The coefficient matrix has one column per linear discriminant.
lda2[["scaling"]]
##             LD1
## V57  0.41495419
## V56  0.06008164
## V55  0.07093636
## V27 -0.18894208
## V19  0.21561656
## V21  0.51801297
## V25 -0.22560523
## V16  0.40532249
## V26 -0.15133245
## V52  0.41660424

/* There is only one linear discriminant in the model, LD1, because there are only two classes (spam and non-spam): LDA yields at most one fewer discriminant than there are classes. */

# Generate LDA plot using the training and validation data. What information is presented in these plots? How are they different?

# Plot the model that was fitted on the training partition.
plot(lda2,col="blue",main="Training DataSet")

# NOTE(review): this fits a second, separate LDA model on the validation
# partition instead of applying lda2 to it, so the validation plot reflects
# different coefficients from the training model — confirm this is intended.
lda3 <- lda(V58~., data = spambase.valid.norm)
lda3
## Call:
## lda(V58 ~ ., data = spambase.valid.norm)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5819761 0.4180239 
## 
## Group means:
##          V57        V56         V55        V27        V19        V21        V25
## 0 -0.2224732 -0.1651376 -0.09253522  0.1969354 -0.1509265 -0.3531615  0.2618924
## 1  0.3901886  0.2718384  0.25601141 -0.2253298  0.3513156  0.4499671 -0.3209704
##          V16        V26        V52
## 0 -0.1905350  0.1985788 -0.1250177
## 1  0.3246399 -0.2914700  0.3056053
## 
## Coefficients of linear discriminants:
##             LD1
## V57  0.17671419
## V56  0.60509817
## V55 -0.09161010
## V27 -0.25420448
## V19  0.21821258
## V21  0.63930103
## V25 -0.24553402
## V16  0.29732238
## V26 -0.22522073
## V52  0.05329203
# Plot the model that was fitted on the validation partition (lda3).
plot(lda3)

# Generate the relevant confusion matrix. What are the sensitivity and specificity?

# Cross-tabulate the validation-set predictions from lda2 (rows) against the
# actual class labels (columns).
CMat <- table(pred2.valid$class, spambase.valid.norm$V58)  # pred v actual
# NOTE(review): the printed summary reports 'Positive' Class : 0, i.e. the
# statistics treat non-spam as the positive class — confirm this is intended.
confusionMatrix(CMat)
## Confusion Matrix and Statistics
## 
##    
##       0   1
##   0 497 128
##   1  39 257
##                                           
##                Accuracy : 0.8187          
##                  95% CI : (0.7922, 0.8431)
##     No Information Rate : 0.582           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6148          
##                                           
##  Mcnemar's Test P-Value : 9.784e-12       
##                                           
##             Sensitivity : 0.9272          
##             Specificity : 0.6675          
##          Pos Pred Value : 0.7952          
##          Neg Pred Value : 0.8682          
##              Prevalence : 0.5820          
##          Detection Rate : 0.5396          
##    Detection Prevalence : 0.6786          
##       Balanced Accuracy : 0.7974          
##                                           
##        'Positive' Class : 0               
## 

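/* Sensitivity is 92.72% and specificity is 66.75%. Note that the confusion matrix treats class 0 (non-spam) as the positive class, so sensitivity here is the share of non-spam emails correctly identified and specificity is the share of spam emails correctly identified. */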