knitr::opts_chunk$set(echo = TRUE,
                      fig.align = "center")
# load packages
pacman::p_load(tidyverse, class, skimr, caret, rpart, rpart.plot)
# Changing the default theme
theme_set(theme_bw())
The data set “Spam_Email.csv” contains columns that measure how frequently certain characters (;, (, [, !, $, and #) appear in emails, along with three measures of capitalization (the average and longest runs of capital letters and the total number of capitals). The goal is to classify each email as either “spam” or “not spam” using these 9 columns.
Read in the data “Spam_Email.csv”. Skim the data frame, and make sure that the classification feature is a factor (change it if necessary).
What percentage/proportion of the emails in the data are unwanted spam?
# Read in the email data
email <- read.csv("Spam_Email.csv")
# recode spam as a factor
email <-
email |>
mutate(spam = factor(spam))
# summarize data
skim(email)
| | |
|---|---|
| Name | email |
| Number of rows | 4601 |
| Number of columns | 10 |
| Column type frequency: | |
| factor | 1 |
| numeric | 9 |
| Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
spam | 0 | 1 | FALSE | 2 | not: 2788, spa: 1813 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
char_semicolon | 0 | 1 | 0.04 | 0.24 | 0 | 0.00 | 0.00 | 0.00 | 4.38 | ▇▁▁▁▁ |
char_parentheses | 0 | 1 | 0.14 | 0.27 | 0 | 0.00 | 0.06 | 0.19 | 9.75 | ▇▁▁▁▁ |
char_bracket | 0 | 1 | 0.02 | 0.11 | 0 | 0.00 | 0.00 | 0.00 | 4.08 | ▇▁▁▁▁ |
char_exclamation | 0 | 1 | 0.27 | 0.82 | 0 | 0.00 | 0.00 | 0.32 | 32.48 | ▇▁▁▁▁ |
char_dollar | 0 | 1 | 0.08 | 0.25 | 0 | 0.00 | 0.00 | 0.05 | 6.00 | ▇▁▁▁▁ |
char_pound | 0 | 1 | 0.04 | 0.43 | 0 | 0.00 | 0.00 | 0.00 | 19.83 | ▇▁▁▁▁ |
capital_run_average | 0 | 1 | 5.19 | 31.73 | 1 | 1.59 | 2.28 | 3.71 | 1102.50 | ▇▁▁▁▁ |
capital_run_longest | 0 | 1 | 52.17 | 194.89 | 1 | 6.00 | 15.00 | 43.00 | 9989.00 | ▇▁▁▁▁ |
capital_total | 0 | 1 | 283.29 | 606.35 | 1 | 35.00 | 95.00 | 266.00 | 15841.00 | ▇▁▁▁▁ |
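From the factor counts above, 1813 of the 4601 emails are spam, or about 39.4%. A quick way to compute the proportion directly from the data:
# proportion of spam vs. not spam emails
email |>
  count(spam) |>
  mutate(prop = n / sum(n))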
Create a new data set using min-max normalization on all of the quantitative variables. Use skim() to make sure it worked.
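Min-max normalization rescales each variable onto the $[0, 1]$ interval:

$$x' = \frac{x - \min(x)}{\max(x) - \min(x)}$$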
# create min-max normalization function
normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
# normalize the email data
email_norm <-
email |>
mutate(
across(.cols = where(is.numeric),
.fns = normalize))
# confirm that normalization worked
skim(email_norm) |>
as_tibble() |>
select(skim_variable, numeric.p0, numeric.p100)
## # A tibble: 10 × 3
## skim_variable numeric.p0 numeric.p100
## <chr> <dbl> <dbl>
## 1 spam NA NA
## 2 char_semicolon 0 1
## 3 char_parentheses 0 1
## 4 char_bracket 0 1
## 5 char_exclamation 0 1
## 6 char_dollar 0 1
## 7 char_pound 0 1
## 8 capital_run_average 0 1
## 9 capital_run_longest 0 1
## 10 capital_total 0 1
Repeat 1b), but standardize the data instead of normalizing it.
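Standardization (z-scoring) rescales each variable to have mean 0 and standard deviation 1:

$$z = \frac{x - \bar{x}}{s_x}$$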
# create standardization function
standardize <- function(x) {
  (x - mean(x)) / sd(x)
}
# standardize the email data
email_stan <-
email |>
mutate(
across(.cols = where(is.numeric),
.fns = standardize)
)
# confirm that standardization worked
skim(email_stan) |>
as_tibble() |>
select(skim_variable, numeric.mean, numeric.sd) |>
mutate(numeric.mean = round(numeric.mean, digits = 4))
## # A tibble: 10 × 3
## skim_variable numeric.mean numeric.sd
## <chr> <dbl> <dbl>
## 1 spam NA NA
## 2 char_semicolon 0 1
## 3 char_parentheses 0 1
## 4 char_bracket 0 1
## 5 char_exclamation 0 1
## 6 char_dollar 0 1
## 7 char_pound 0 1
## 8 capital_run_average 0 1
## 9 capital_run_longest 0 1
## 10 capital_total 0 1
Find the best choice of k for the kNN algorithm for both the normalized and the standardized data (separately), searching over k = 1 to 50. knn.cv() performs leave-one-out cross-validation: each email is classified by its k nearest neighbors among all of the remaining emails.
# Keep this at the top of the code chunk
RNGversion("4.1.0")
set.seed(187)
k_vec <- 1:50
knn_mat <- data.frame(k = k_vec,
norm = rep(-1, length(k_vec)),
stan = rep(-1, length(k_vec)))
for (i in 1:length(k_vec)){
# Using knn.cv on the normalized data
knn_norm <- knn.cv(train = email_norm |> select(-spam),
cl = email$spam,
k = k_vec[i])
# Using knn.cv on the standardized data
knn_stan <- knn.cv(train = email_stan |> select(-spam),
cl = email$spam,
k = k_vec[i])
# Saving the accuracy of each rescaling method:
knn_mat[i, 2:3] <- c(mean(email$spam == knn_norm), # norm accuracy
mean(email$spam == knn_stan)) # standardized accuracy
}
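The same search can be written more compactly with a helper function and sapply(); a sketch below (the knn_acc and knn_mat2 names are ours, and because knn.cv() breaks distance ties at random, the results can differ very slightly from the loop above for a given seed):
# helper: LOOCV accuracy of kNN for a given k and rescaled data set
knn_acc <- function(k, dat) {
  preds <- knn.cv(train = dat |> select(-spam),
                  cl = email$spam,
                  k = k)
  mean(email$spam == preds)
}
knn_mat2 <- data.frame(k = k_vec,
                       norm = sapply(k_vec, knn_acc, dat = email_norm),
                       stan = sapply(k_vec, knn_acc, dat = email_stan))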
Create a graph of the results.
knn_mat |>
pivot_longer(cols = norm:stan,
values_to = "accuracy",
names_to = "rescale") |>
ggplot(mapping = aes(x = k,
y = accuracy,
color = rescale)) +
geom_line(linewidth = 1)
knn_mat |>
pivot_longer(cols = norm:stan,
values_to = "accuracy",
names_to = "rescale") |>
slice_max(accuracy)
## # A tibble: 1 × 3
## k rescale accuracy
## <int> <chr> <dbl>
## 1 5 stan 0.870
Which k and rescaling should we use to predict if an email is spam? The search above points to k = 5 with the standardized data, which achieves the highest leave-one-out accuracy (about 87%).
Form a confusion matrix for the best “model”. How much more accurate is the model compared to just guessing that none of the emails are spam?
best_knn <-
knn.cv(train = email_stan |> select(-spam),
cl = email$spam,
k = 5)
confusionMatrix(data = best_knn,
reference = email$spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction not spam spam
## not spam 2559 372
## spam 229 1441
##
## Accuracy : 0.8694
## 95% CI : (0.8593, 0.879)
## No Information Rate : 0.606
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7226
##
## Mcnemar's Test P-Value : 6.943e-09
##
## Sensitivity : 0.9179
## Specificity : 0.7948
## Pos Pred Value : 0.8731
## Neg Pred Value : 0.8629
## Prevalence : 0.6060
## Detection Rate : 0.5562
## Detection Prevalence : 0.6370
## Balanced Accuracy : 0.8563
##
## 'Positive' Class : not spam
##
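Guessing that none of the emails are spam means always predicting the majority class, whose accuracy is the no-information rate reported above. A quick check using the data:
# baseline accuracy: always predict "not spam"
mean(email$spam == "not spam")

This baseline is about 60.6% (2788/4601), so the kNN model's 86.9% accuracy is roughly 26 percentage points better.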
Using the kNN results, can you determine which variables are the most important when deciding if an email is spam or not? No: kNN is an instance-based method that classifies purely by distances to stored observations, so it fits no model and produces no coefficients or importance scores. We cannot tell from kNN which variables matter most.
The ‘titanic8.csv’ file contains the survival outcomes of over 1,000 Titanic passengers along with eight different variables.
Start by reading in the csv and save it as titanic, saving all the character columns as factors:
# Keep this at the top of the code chunk
rm(list = ls())
# Read in the data
titanic <- read.csv("titanic8.csv",
                    stringsAsFactors = TRUE)
Using all the other columns of the data, grow a full classification tree to predict survived. Display the complexity parameter table of the full tree.
# Leave this at the top
RNGversion("4.1.0")
set.seed(187)
# Create the full classification tree below
titanic_full <-
rpart(formula = survived ~ .,
data = titanic,
method = "class",
parms = list(split = "information"),
minsplit = 0,  # allow nodes of any size to be split
minbucket = 0, # allow leaves containing any number of cases
cp = -1)       # a negative cp forces the tree to grow fully
# Display the complexity parameter table of the full tree:
titanic_full$cptable
## CP nsplit rel error xerror xstd
## 1 0.4564705882 0 1.00000000 1.0000000 0.03733856
## 2 0.0247058824 1 0.54352941 0.5435294 0.03155389
## 3 0.0200000000 3 0.49411765 0.5529412 0.03174743
## 4 0.0117647059 5 0.45411765 0.5200000 0.03105281
## 5 0.0070588235 7 0.43058824 0.4847059 0.03025278
## 6 0.0035294118 8 0.42352941 0.4823529 0.03019728
## 7 0.0023529412 11 0.41176471 0.4870588 0.03030800
## 8 0.0017647059 77 0.22117647 0.4917647 0.03041761
## 9 0.0016806723 89 0.20000000 0.5129412 0.03089756
## 10 0.0015686275 96 0.18823529 0.5152941 0.03094957
## 11 0.0014117647 99 0.18352941 0.5270588 0.03120577
## 12 0.0011764706 104 0.17647059 0.5600000 0.03189006
## 13 0.0010084034 165 0.10117647 0.5623529 0.03193713
## 14 0.0008823529 174 0.09176471 0.5552941 0.03179521
## 15 0.0007843137 185 0.08000000 0.5623529 0.03193713
## 16 0.0005882353 206 0.06117647 0.5623529 0.03193713
## 17 0.0004705882 216 0.05411765 0.5623529 0.03193713
## 18 0.0000000000 221 0.05176471 0.5835294 0.03235030
## 19 -1.0000000000 250 0.05176471 0.5835294 0.03235030
Find the xerror cutoff and the cp value to prune the full tree. We use the one-standard-error rule: the cutoff is the smallest cross-validation error (xerror) plus its standard error (xstd), and we keep the simplest tree whose xerror falls below that cutoff.
# xerror cutoff: minimum xerror plus one standard error
xcutoff <-
  titanic_full$cptable |>
  data.frame() |>
  slice_min(xerror) |>
  mutate(xcutoff = xerror + xstd) |>
  slice(1) |>
  pull(xcutoff)
# rows of the CP table whose xerror is below the cutoff
titanic_full$cptable |>
  data.frame() |>
  filter(xerror < xcutoff)
## CP nsplit rel.error xerror xstd
## 5 0.007058824 7 0.4305882 0.4847059 0.03025278
## 6 0.003529412 8 0.4235294 0.4823529 0.03019728
## 7 0.002352941 11 0.4117647 0.4870588 0.03030800
## 8 0.001764706 77 0.2211765 0.4917647 0.03041761
The xerror cutoff is 0.4824 + 0.0302 ≈ 0.513. The simplest tree with an xerror below that cutoff is on row 5, with 7 splits and therefore 8 leaf nodes. To prune to that tree, we can use any cp value between row 5’s CP (about 0.00706) and row 4’s CP (about 0.0118).
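A common way to pick a cp safely inside that interval is the geometric mean of the two neighboring CP values; a quick sketch using the values from the table above (the cp_prune name is ours):
# geometric mean of the CP values on rows 4 and 5
cp_prune <- sqrt(0.0117647059 * 0.0070588235)
cp_prune # about 0.0091, inside the pruning interval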
Prune the full tree from part a) and plot it.
titanic_pruned <-
  prune(tree = titanic_full,
        cp = 0.008) # any cp between 0.00706 and 0.0118 yields the 7-split tree
rpart.plot(x = titanic_pruned,
type = 5,
extra = 101)
Interpret the right-most and left-most leaf nodes of the plot in context.
Create the confusion matrix for the pruned tree. What is the model’s accuracy?
titanic_cm <-
confusionMatrix(data = predict(object = titanic_pruned,
type = "class"),
reference = titanic$survived,
positive = "yes",
dnn = c("predicted", "actual"))
titanic_cm
## Confusion Matrix and Statistics
##
## actual
## predicted no yes
## no 560 125
## yes 58 300
##
## Accuracy : 0.8245
## 95% CI : (0.8001, 0.8472)
## No Information Rate : 0.5925
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6275
##
## Mcnemar's Test P-Value : 1.067e-06
##
## Sensitivity : 0.7059
## Specificity : 0.9061
## Pos Pred Value : 0.8380
## Neg Pred Value : 0.8175
## Prevalence : 0.4075
## Detection Rate : 0.2876
## Detection Prevalence : 0.3432
## Balanced Accuracy : 0.8060
##
## 'Positive' Class : yes
##

The pruned tree correctly classifies about 82.5% of the passengers (accuracy = 0.8245).
Which 3 variables are the most important when predicting if someone survived or died aboard the Titanic?
caret::varImp(titanic_pruned) |>
arrange(desc(Overall))
## Overall
## sex 154.78836
## passenger 117.54217
## fare 96.38869
## embarked 59.49988
## parent_children 33.56088
## siblings 24.81237
## age 23.47021

The three most important variables for predicting survival are sex, passenger, and fare.
Create a new pruned tree using the 3 most important variables from the pruned tree in question 2.
# Leave this at the top
RNGversion("4.1.0")
set.seed(1234)
# Grow a full classification tree using only the top 3 variables
titanic_full3 <-
rpart(formula = survived ~ sex + passenger + fare,
data = titanic,
method = "class",
parms = list(split = "information"),
minsplit = 0,
minbucket = 0,
cp = -1)
# Finding the cp cutoff (one-standard-error rule, as before)
xcutoff <-
  titanic_full3$cptable |>
  data.frame() |>
  slice_min(xerror) |>
  mutate(xcutoff = xerror + xstd) |>
  slice(1) |>
  pull(xcutoff)
# rows of the CP table whose xerror is below the cutoff
titanic_full3$cptable |>
  data.frame() |>
  filter(xerror < xcutoff)
## CP nsplit rel.error xerror xstd
## 4 0.0054901961 5 0.4776471 0.5129412 0.03089756
## 5 0.0031372549 8 0.4611765 0.4894118 0.03036294
## 6 0.0023529412 14 0.4352941 0.5011765 0.03063358
## 7 0.0015686275 46 0.3576471 0.4988235 0.03057999
## 8 0.0011764706 61 0.3317647 0.4941176 0.03047201
## 9 0.0007843137 73 0.3176471 0.5035294 0.03068690
## 10 0.0000000000 117 0.2611765 0.5129412 0.03089756
## 11 -1.0000000000 187 0.2611765 0.5129412 0.03089756
# Pruning to the simplest tree under the cutoff (row 4: 5 splits)
titanic_pruned3 <-
  prune(tree = titanic_full3,
        cp = 0.0055) # just above row 4's CP of 0.00549
rpart.plot(titanic_pruned3,
type = 5,
extra = 101)
Compare the accuracy of the two pruned trees from question 2 and question 3. Which model do you prefer? Briefly explain why.
titanic_cm3 <-
predict(object = titanic_pruned3,
type = "class") |>
confusionMatrix(reference = titanic$survived,
positive = "yes",
dnn = c('predicted', "actual"))
c("full" = titanic_cm$overall["Accuracy"],
"smaller" = titanic_cm3$overall["Accuracy"])
## full.Accuracy smaller.Accuracy
## 0.8245446 0.8053691
Using the model with fewer predictors, we lose about 2 percentage points of accuracy (82.5% vs. 80.5%), but we only need to know 3 things about each passenger instead of the 6 different variables used by the larger pruned tree. That trade-off favors the simpler, more interpretable model.