knitr::opts_chunk$set(echo = TRUE,
                      fig.align = "center")
# load packages
pacman::p_load(tidyverse, class, skimr, caret, rpart, rpart.plot)
# Changing the default theme
theme_set(theme_bw())
The data set “Spam_Email.csv” contains columns that measure how frequently certain characters (;, (, [, !, $, and #) appear in emails, along with three measures of capitalization (the average and longest runs of capital letters and the total number of capitals). The goal is to classify each email as either “spam” or “not spam” using these 9 columns.
Read in the data “Spam_Email.csv”. Skim the data frame, and make sure that the classification feature is a factor (change it if necessary).
What percentage/proportion of the emails in the data are unwanted spam?
# Read in the email data
email <- read.csv("Spam_Email.csv")
# recode spam as a factor
email <-
email |>
mutate(spam = factor(spam))
# summarize data
skim(email)
| | |
|---|---|
| Name | email |
| Number of rows | 4601 |
| Number of columns | 10 |
| Column type frequency: | |
| factor | 1 |
| numeric | 9 |
| Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
spam | 0 | 1 | FALSE | 2 | not: 2788, spa: 1813 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
char_semicolon | 0 | 1 | 0.04 | 0.24 | 0 | 0.00 | 0.00 | 0.00 | 4.38 | ▇▁▁▁▁ |
char_parentheses | 0 | 1 | 0.14 | 0.27 | 0 | 0.00 | 0.06 | 0.19 | 9.75 | ▇▁▁▁▁ |
char_bracket | 0 | 1 | 0.02 | 0.11 | 0 | 0.00 | 0.00 | 0.00 | 4.08 | ▇▁▁▁▁ |
char_exclamation | 0 | 1 | 0.27 | 0.82 | 0 | 0.00 | 0.00 | 0.32 | 32.48 | ▇▁▁▁▁ |
char_dollar | 0 | 1 | 0.08 | 0.25 | 0 | 0.00 | 0.00 | 0.05 | 6.00 | ▇▁▁▁▁ |
char_pound | 0 | 1 | 0.04 | 0.43 | 0 | 0.00 | 0.00 | 0.00 | 19.83 | ▇▁▁▁▁ |
capital_run_average | 0 | 1 | 5.19 | 31.73 | 1 | 1.59 | 2.28 | 3.71 | 1102.50 | ▇▁▁▁▁ |
capital_run_longest | 0 | 1 | 52.17 | 194.89 | 1 | 6.00 | 15.00 | 43.00 | 9989.00 | ▇▁▁▁▁ |
capital_total | 0 | 1 | 283.29 | 606.35 | 1 | 35.00 | 95.00 | 266.00 | 15841.00 | ▇▁▁▁▁ |
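From the factor counts above, 1813 of the 4601 emails are spam, or about 39.4%. A quick way to compute the proportion directly from the data:
# proportion of spam vs. not spam emails
email |>
  count(spam) |>
  mutate(prop = n / sum(n))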
Create a new data set using min-max normalization on all of the quantitative variables. Use skim() to make sure it worked.
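Min-max normalization rescales each variable onto the $[0, 1]$ interval:

$$x' = \frac{x - \min(x)}{\max(x) - \min(x)}$$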
# create min-max normalization function
normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
# normalize the email data
email_norm <-
email |>
mutate(
across(.cols = where(is.numeric),
.fns = normalize))
# confirm that normalization worked
skim(email_norm) |>
as_tibble() |>
select(skim_variable, numeric.p0, numeric.p100)
## # A tibble: 10 × 3
## skim_variable numeric.p0 numeric.p100
## <chr> <dbl> <dbl>
## 1 spam NA NA
## 2 char_semicolon 0 1
## 3 char_parentheses 0 1
## 4 char_bracket 0 1
## 5 char_exclamation 0 1
## 6 char_dollar 0 1
## 7 char_pound 0 1
## 8 capital_run_average 0 1
## 9 capital_run_longest 0 1
## 10 capital_total 0 1
Repeat 1b), but standardize the data instead of normalizing it.
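Standardization (z-scoring) rescales each variable to have mean 0 and standard deviation 1:

$$z = \frac{x - \bar{x}}{s_x}$$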
# create standardization function
standardize <- function(x) {
  (x - mean(x)) / sd(x)
}
# standardize the email data
email_stan <-
email |>
mutate(
across(.cols = where(is.numeric),
.fns = standardize)
)
# confirm that standardization worked
skim(email_stan) |>
as_tibble() |>
select(skim_variable, numeric.mean, numeric.sd) |>
mutate(numeric.mean = round(numeric.mean, digits = 4))
## # A tibble: 10 × 3
## skim_variable numeric.mean numeric.sd
## <chr> <dbl> <dbl>
## 1 spam NA NA
## 2 char_semicolon 0 1
## 3 char_parentheses 0 1
## 4 char_bracket 0 1
## 5 char_exclamation 0 1
## 6 char_dollar 0 1
## 7 char_pound 0 1
## 8 capital_run_average 0 1
## 9 capital_run_longest 0 1
## 10 capital_total 0 1
Find the best choice of k for the kNN algorithm for both the normalized and the standardized data (separately), searching over k = 1 to 50. knn.cv() performs leave-one-out cross-validation: each email is classified by its k nearest neighbors among all of the remaining emails.
# Keep this at the top of the code chunk
RNGversion("4.1.0")
set.seed(187)
k_vec <- 1:50
knn_mat <- data.frame(k = k_vec,
norm = rep(-1, length(k_vec)),
stan = rep(-1, length(k_vec)))
for (i in 1:length(k_vec)){
# Using knn.cv on the normalized data
knn_norm <- knn.cv(train = email_norm |> select(-spam),
cl = email$spam,
k = k_vec[i])
# Using knn.cv on the standardized data
knn_stan <- knn.cv(train = email_stan |> select(-spam),
cl = email$spam,
k = k_vec[i])
# Saving the accuracy of each rescaling method:
knn_mat[i, 2:3] <- c(mean(email$spam == knn_norm), # norm accuracy
mean(email$spam == knn_stan)) # standardized accuracy
}
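The same search can be written more compactly with a helper function and sapply(); a sketch below (the knn_acc and knn_mat2 names are ours, and because knn.cv() breaks distance ties at random, the results can differ very slightly from the loop above for a given seed):
# helper: LOOCV accuracy of kNN for a given k and rescaled data set
knn_acc <- function(k, dat) {
  preds <- knn.cv(train = dat |> select(-spam),
                  cl = email$spam,
                  k = k)
  mean(email$spam == preds)
}
knn_mat2 <- data.frame(k = k_vec,
                       norm = sapply(k_vec, knn_acc, dat = email_norm),
                       stan = sapply(k_vec, knn_acc, dat = email_stan))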
Create a graph of the results.
knn_mat |>
pivot_longer(cols = norm:stan,
values_to = "accuracy",
names_to = "rescale") |>
ggplot(mapping = aes(x = k,
y = accuracy,
color = rescale)) +
geom_line(linewidth = 1)
knn_mat |>
pivot_longer(cols = norm:stan,
values_to = "accuracy",
names_to = "rescale") |>
slice_max(accuracy)
## # A tibble: 1 × 3
## k rescale accuracy
## <int> <chr> <dbl>
## 1 5 stan 0.870
Which k and rescaling should we use to predict if an email is spam? The search above points to k = 5 with the standardized data, which achieves the highest leave-one-out accuracy (about 87%).
Form a confusion matrix for the best “model”. How much more accurate is the model compared to just guessing that none of the emails are spam?
best_knn <-
knn.cv(train = email_stan |> select(-spam),
cl = email$spam,
k = 5)
confusionMatrix(data = best_knn,
reference = email$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction not spam spam
## not spam 2559 372
## spam 229 1441
##
## Accuracy : 0.8694
## 95% CI : (0.8593, 0.879)
## No Information Rate : 0.606
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7226
##
## Mcnemar's Test P-Value : 6.943e-09
##
## Sensitivity : 0.9179
## Specificity : 0.7948
## Pos Pred Value : 0.8731
## Neg Pred Value : 0.8629
## Prevalence : 0.6060
## Detection Rate : 0.5562
## Detection Prevalence : 0.6370
## Balanced Accuracy : 0.8563
##
## 'Positive' Class : not spam
##
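Guessing that none of the emails are spam means always predicting the majority class, whose accuracy is the no-information rate reported above. A quick check using the data:
# baseline accuracy: always predict "not spam"
mean(email$spam == "not spam")

This baseline is about 60.6% (2788/4601), so the kNN model's 86.9% accuracy is roughly 26 percentage points better.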
Using the kNN results, can you determine which variables are the most important when deciding if an email is spam or not? No: kNN is an instance-based method that classifies purely by distances to stored observations, so it fits no model and produces no coefficients or importance scores. We cannot tell from kNN which variables matter most.
The ‘titanic8.csv’ file contains the survival outcomes of over 1,000 Titanic passengers along with eight different variables.
Start by reading in the csv and save it as titanic, saving all the character columns as factors:
# Keep this at the top of the code chunk
rm(list = ls())
# Read in the data
titanic <- read.csv("titanic8.csv",
                    stringsAsFactors = TRUE)
Using all the other columns of the data, grow a full classification tree to predict survived. Display the complexity parameter table of the full tree.
# Leave this at the top
RNGversion("4.1.0")
set.seed(187)
# Create the full classification tree below
titanic_full <-
rpart(formula = survived ~ .,
data = titanic,
method = "class",
parms = list(split = "information"),
minsplit = 0,  # allow nodes of any size to be split
minbucket = 0, # allow leaves containing any number of cases
cp = -1)       # a negative cp forces the tree to grow fully
# Display the complexity parameter table of the full tree:
titanic_full$cptable
## CP nsplit rel error xerror xstd
## 1 0.4564705882 0 1.00000000 1.0000000 0.03733856
## 2 0.0247058824 1 0.54352941 0.5435294 0.03155389
## 3 0.0200000000 3 0.49411765 0.5529412 0.03174743
## 4 0.0117647059 5 0.45411765 0.5200000 0.03105281
## 5 0.0070588235 7 0.43058824 0.4847059 0.03025278
## 6 0.0035294118 8 0.42352941 0.4823529 0.03019728
## 7 0.0023529412 11 0.41176471 0.4870588 0.03030800
## 8 0.0017647059 77 0.22117647 0.4917647 0.03041761
## 9 0.0016806723 89 0.20000000 0.5129412 0.03089756
## 10 0.0015686275 96 0.18823529 0.5152941 0.03094957
## 11 0.0014117647 99 0.18352941 0.5270588 0.03120577
## 12 0.0011764706 104 0.17647059 0.5600000 0.03189006
## 13 0.0010084034 165 0.10117647 0.5623529 0.03193713
## 14 0.0008823529 174 0.09176471 0.5552941 0.03179521
## 15 0.0007843137 185 0.08000000 0.5623529 0.03193713
## 16 0.0005882353 206 0.06117647 0.5623529 0.03193713
## 17 0.0004705882 216 0.05411765 0.5623529 0.03193713
## 18 0.0000000000 221 0.05176471 0.5835294 0.03235030
## 19 -1.0000000000 250 0.05176471 0.5835294 0.03235030
Find the xerror cutoff and the cp value to prune the full tree. We use the one-standard-error rule: the cutoff is the smallest cross-validation error (xerror) plus its standard error (xstd), and we keep the simplest tree whose xerror falls below that cutoff.
# xerror cutoff: minimum xerror plus one standard error
xcutoff <-
  titanic_full$cptable |>
  data.frame() |>
  slice_min(xerror) |>
  mutate(xcutoff = xerror + xstd) |>
  slice(1) |>
  pull(xcutoff)
# rows of the CP table whose xerror is below the cutoff
titanic_full$cptable |>
  data.frame() |>
  filter(xerror < xcutoff)
## CP nsplit rel.error xerror xstd
## 5 0.007058824 7 0.4305882 0.4847059 0.03025278
## 6 0.003529412 8 0.4235294 0.4823529 0.03019728
## 7 0.002352941 11 0.4117647 0.4870588 0.03030800
## 8 0.001764706 77 0.2211765 0.4917647 0.03041761
The xerror cutoff is 0.4824 + 0.0302 ≈ 0.513. The simplest tree with an xerror below that cutoff is on row 5, with 7 splits and therefore 8 leaf nodes. To prune to that tree, we can use any cp value between row 5’s CP (about 0.00706) and row 4’s CP (about 0.0118).
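A common way to pick a cp safely inside that interval is the geometric mean of the two neighboring CP values; a quick sketch using the values from the table above (the cp_prune name is ours):
# geometric mean of the CP values on rows 4 and 5
cp_prune <- sqrt(0.0117647059 * 0.0070588235)
cp_prune # about 0.0091, inside the pruning interval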
Prune the full tree from part a) and plot it.
titanic_pruned <-
  prune(tree = titanic_full,
        cp = 0.008) # any cp between 0.00706 and 0.0118 yields the 7-split tree
rpart.plot(x = titanic_pruned,
type = 5,
extra = 101)
Interpret the right-most and left-most leaf nodes of the plot in context.
Create the confusion matrix for the pruned tree. What is the model’s accuracy?
titanic_cm <-
confusionMatrix(data = predict(object = titanic_pruned,
type = "class"),
reference = titanic$survived,
positive = "yes",
dnn = c("predicted", "actual"))
titanic_cm
## Confusion Matrix and Statistics
##
## actual
## predicted no yes
## no 560 125
## yes 58 300
##
## Accuracy : 0.8245
## 95% CI : (0.8001, 0.8472)
## No Information Rate : 0.5925
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6275
##
## Mcnemar's Test P-Value : 1.067e-06
##
## Sensitivity : 0.7059
## Specificity : 0.9061
## Pos Pred Value : 0.8380
## Neg Pred Value : 0.8175
## Prevalence : 0.4075
## Detection Rate : 0.2876
## Detection Prevalence : 0.3432
## Balanced Accuracy : 0.8060
##
## 'Positive' Class : yes
##

The pruned tree correctly classifies about 82.5% of the passengers (accuracy = 0.8245).
Which 3 variables are the most important when predicting if someone survived or died aboard the Titanic?
caret::varImp(titanic_pruned) |>
arrange(desc(Overall))
## Overall
## sex 154.78836
## passenger 117.54217
## fare 96.38869
## embarked 59.49988
## parent_children 33.56088
## siblings 24.81237
## age 23.47021

The three most important variables for predicting survival are sex, passenger, and fare.
Create a new pruned tree using the 3 most important variables from the pruned tree in question 2.
# Leave this at the top
RNGversion("4.1.0")
set.seed(1234)
# Grow a full classification tree using only the top 3 variables
titanic_full3 <-
rpart(formula = survived ~ sex + passenger + fare,
data = titanic,
method = "class",
parms = list(split = "information"),
minsplit = 0,
minbucket = 0,
cp = -1)
# Finding the cp cutoff (one-standard-error rule, as before)
xcutoff <-
  titanic_full3$cptable |>
  data.frame() |>
  slice_min(xerror) |>
  mutate(xcutoff = xerror + xstd) |>
  slice(1) |>
  pull(xcutoff)
# rows of the CP table whose xerror is below the cutoff
titanic_full3$cptable |>
  data.frame() |>
  filter(xerror < xcutoff)
## CP nsplit rel.error xerror xstd
## 4 0.0054901961 5 0.4776471 0.5129412 0.03089756
## 5 0.0031372549 8 0.4611765 0.4894118 0.03036294
## 6 0.0023529412 14 0.4352941 0.5011765 0.03063358
## 7 0.0015686275 46 0.3576471 0.4988235 0.03057999
## 8 0.0011764706 61 0.3317647 0.4941176 0.03047201
## 9 0.0007843137 73 0.3176471 0.5035294 0.03068690
## 10 0.0000000000 117 0.2611765 0.5129412 0.03089756
## 11 -1.0000000000 187 0.2611765 0.5129412 0.03089756
# Pruning to the simplest tree under the cutoff (row 4: 5 splits)
titanic_pruned3 <-
  prune(tree = titanic_full3,
        cp = 0.0055) # just above row 4's CP of 0.00549
rpart.plot(titanic_pruned3,
type = 5,
extra = 101)
Compare the accuracy of the two pruned trees from question 2 and question 3. Which model do you prefer? Briefly explain why.
titanic_cm3 <-
predict(object = titanic_pruned3,
type = "class") |>
confusionMatrix(reference = titanic$survived,
positive = "yes",
dnn = c('predicted', "actual"))
c("full" = titanic_cm$overall["Accuracy"],
"smaller" = titanic_cm3$overall["Accuracy"])
## full.Accuracy smaller.Accuracy
## 0.8245446 0.8053691
Using the model with fewer predictors, we lose about 2 percentage points of accuracy (82.5% vs. 80.5%), but we only need to know 3 things about each passenger instead of the 6 different variables used by the larger pruned tree. That trade-off favors the simpler, more interpretable model.