knitr::opts_chunk$set(echo = TRUE,
message = F,
warning = F,
fig.align = "center")
# load packages: typical - tidyverse and skimr
# Classification - class, caret, rpart, rpart.plot
pacman::p_load(tidyverse, skimr, class, caret, rpart, rpart.plot)
theme_set(theme_bw())
# Reading in the data
cancer <-
read.csv(
'cancer.csv',
stringsAsFactors = T
) |>
mutate(
diagnosis = factor(diagnosis,
levels = c("B", "M"),
labels = c("Benign", "Malignant"))
)
For this homework assignment, you’ll be using ten (10) different features to try to predict if a tumor is malignant (cancerous) or benign (harmless).
The 10 different measurements (features) about each tumor are: radius, texture, perimeter, area, smoothness, compactness, concavity, points, symmetry, and dimension.
Calculate the mean for each of the ten explanatory variables (features) for malignant and benign tumors. Hint: To get the data in the same format as what is in the solutions, you’ll need to use both pivot_longer() and then (later) pivot_wider().
cancer |>
# stacking all the numeric columns into one column
pivot_longer(
cols = -diagnosis,
names_to = "Feature"
) |>
# Calculating the mean for each variable by diagnosis
summarize(
.by = c(Feature, diagnosis),
average = mean(value),
#standard_deviation = sd(value)
) |>
# Turning the diagnosis and value columns into two columns
pivot_wider(
id_cols = Feature,
names_from = diagnosis,
values_from = average
)
## # A tibble: 10 × 3
## Feature Benign Malignant
## <chr> <dbl> <dbl>
## 1 radius 12.1 17.5
## 2 texture 17.9 21.6
## 3 perimeter 78.1 115.
## 4 area 463. 978.
## 5 smoothness 0.0925 0.103
## 6 compactness 0.0801 0.145
## 7 concavity 0.0461 0.161
## 8 points 0.0257 0.0880
## 9 symmetry 0.174 0.193
## 10 dimension 0.0629 0.0627
Create a pair of boxplots for each feature to compare the malignant and benign tumors
cancer |>
# Stacking the columns again
pivot_longer(
cols = -diagnosis,
names_to = "feature"
) |>
# Creating small multiples for the boxplots
ggplot(
mapping = aes(x = value,
y = diagnosis,
fill = fct_rev(diagnosis))
) +
geom_boxplot(show.legend = F) +
facet_wrap(
facets = ~ feature,
scales = "free_x",
nrow = 5
) +
labs(
y = NULL,
x = NULL
) +
# Removing the tick marks on the x-axis
scale_x_continuous(breaks = NULL)
Which feature seems to be the most useful at determining if a tumor is malignant? Points seems to be the most helpful, while area, concavity, perimeter, radius, texture, and compactness appear to be at least somewhat helpful.
Which feature seems to be the least useful at determining if a tumor is malignant? The boxplots for dimension have the most overlap, followed by smoothness and symmetry.
Create two data sets named cancer_norm, and cancer_stan that have the normalized and standardized features, respectively.
# create min-max normalization function
normalize <- function(x) {
return( ( x - min(x) ) / ( max(x) - min(x) ))
}
# Now let's normalize the cancer data:
cancer_norm <-
cancer |>
mutate(
across(
.cols = where(is.numeric),
.fns = normalize
)
)
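As a quick sanity check (a made-up vector, not part of the assignment data), normalize() maps the smallest value of a vector to 0 and the largest to 1:
# Hypothetical example: min = 2 and max = 10, so the rescaled values are
# (2 - 2)/8 = 0, (5 - 2)/8 = 0.375, and (10 - 2)/8 = 1
normalize(c(2, 5, 10))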
The code chunk below should verify that you’ve normalized the data correctly.
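The verification chunk itself is not displayed in this copy; below is a minimal sketch of one way it could be written, assuming a pivot_longer() and summarize() approach (the original chunk may have used skimr instead):
# Sketch of a verification chunk: summarize each normalized feature
cancer_norm |>
  pivot_longer(cols = -diagnosis, names_to = "feature") |>
  summarize(
    .by = feature,
    average            = mean(value),
    standard_deviation = sd(value),
    p0                 = min(value),   # should be 0 after min-max normalizing
    p100               = max(value)    # should be 1 after min-max normalizing
  )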
## # A tibble: 10 × 5
## feature average standard_deviation p0 p100
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 radius 0.34 0.17 0 1
## 2 texture 0.32 0.15 0 1
## 3 perimeter 0.33 0.17 0 1
## 4 area 0.22 0.15 0 1
## 5 smoothness 0.39 0.13 0 1
## 6 compactness 0.26 0.16 0 1
## 7 concavity 0.21 0.19 0 1
## 8 points 0.24 0.19 0 1
## 9 symmetry 0.38 0.14 0 1
## 10 dimension 0.27 0.15 0 1
Briefly explain why the table above shows that you’ve normalized the data correctly.
Min-max normalization rescales every feature to the 0 to 1 range, so each feature’s minimum (p0) is 0 and its maximum (p100) is 1, which is exactly what the table shows.
# create standardization (z-score) function
standardize <- function(x) {
return( ( x - mean(x) ) / ( sd(x) ))
}
# Now let's standardize the cancer data:
cancer_stan <-
cancer |>
mutate(
across(
.cols = where(is.numeric),
.fns = standardize
)
)
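As another quick sanity check (again a made-up vector), standardize() converts values into z-scores. For c(1, 2, 3), the mean is 2 and the standard deviation is 1, so the standardized values are -1, 0, and 1:
# Hypothetical example: (1 - 2)/1 = -1, (2 - 2)/1 = 0, (3 - 2)/1 = 1
standardize(c(1, 2, 3))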
The code chunk below should verify that you’ve standardized the data correctly.
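Again, the verification chunk is not shown here; the same sketch used for the normalized data applies, just with cancer_stan in place of cancer_norm:
# Sketch of a verification chunk: summarize each standardized feature
cancer_stan |>
  pivot_longer(cols = -diagnosis, names_to = "feature") |>
  summarize(
    .by = feature,
    average            = mean(value),   # should be 0 after standardizing
    standard_deviation = sd(value),     # should be 1 after standardizing
    p0                 = min(value),
    p100               = max(value)
  )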
## # A tibble: 10 × 5
## feature average standard_deviation p0 p100
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 radius 0 1 -2.03 3.97
## 2 texture 0 1 -2.23 4.65
## 3 perimeter 0 1 -1.98 3.97
## 4 area 0 1 -1.45 5.25
## 5 smoothness 0 1 -3.11 4.77
## 6 compactness 0 1 -1.61 4.56
## 7 concavity 0 1 -1.11 4.24
## 8 points 0 1 -1.26 3.92
## 9 symmetry 0 1 -2.74 4.48
## 10 dimension 0 1 -1.82 4.91
Briefly explain why the table above shows that you’ve standardized the data correctly.
Standardizing converts each feature to z-scores, so every feature should have a mean of 0 and a standard deviation of 1, which the average and standard_deviation columns confirm. The minimums and maximums now differ across features because they are measured in standard deviations from the mean.
Create a tibble() named knn_results to store:
k: the value of k from 1 to 100
norm_acc: the accuracy for that choice of k with normalized data
stan_acc: the accuracy for that choice of k with standardized data
knn_results <-
tibble(
k = 5:100,
norm_acc = rep(-1, length(k)),
stan_acc = rep(-1, length(k))
)
knn_results
## # A tibble: 96 × 3
## k norm_acc stan_acc
## <int> <dbl> <dbl>
## 1 5 -1 -1
## 2 6 -1 -1
## 3 7 -1 -1
## 4 8 -1 -1
## 5 9 -1 -1
## 6 10 -1 -1
## 7 11 -1 -1
## 8 12 -1 -1
## 9 13 -1 -1
## 10 14 -1 -1
## # ℹ 86 more rows
Now use a single loop to find the accuracy for each choice of k for the normalized and standardized data
RNGversion("4.1.0");set.seed(1234)
# Writing the for loop
for (i in 1:nrow(knn_results)){
# performing knn with normalized data
norm_loop <-
knn.cv(
train = cancer_norm |> dplyr::select(-diagnosis),
cl = cancer_norm$diagnosis,
k = knn_results$k[i]
)
# Saving the normalized results
knn_results[i, "norm_acc"] <- mean(cancer_norm$diagnosis == norm_loop)
## Repeating above, but with the standardized data
# performing knn with standardized data
stan_loop <-
knn.cv(
train = cancer_stan |> dplyr::select(-diagnosis),
cl = cancer_stan$diagnosis,
k = knn_results$k[i]
)
# Saving the standardized accuracy
knn_results[i, "stan_acc"] <- mean(cancer_stan$diagnosis == stan_loop)
}
# Displaying the first 10 rows
tibble(knn_results)
## # A tibble: 96 × 3
## k norm_acc stan_acc
## <int> <dbl> <dbl>
## 1 5 0.942 0.944
## 2 6 0.933 0.937
## 3 7 0.938 0.942
## 4 8 0.938 0.944
## 5 9 0.944 0.947
## 6 10 0.938 0.942
## 7 11 0.949 0.947
## 8 12 0.947 0.935
## 9 13 0.942 0.944
## 10 14 0.940 0.940
## # ℹ 86 more rows
Use a line graph to display the accuracy for each choice of k and rescaling method
knn_results |>
# Stacking the two accuracy columns together
pivot_longer(
cols = -k,
names_to = "rescale",
values_to = "accuracy"
) |>
# Creating the line graph
ggplot(
mapping = aes(x = k,
y = accuracy,
color = rescale)
) +
geom_line(
linewidth = 1
) +
# Changing the theme and legend location
theme(
legend.position = c(0.85, 0.85)
) +
labs(
x = "Choice of k",
color = "Rescale Method"
) +
scale_color_discrete(
labels = c("Normalized", "Standardized")
) +
scale_x_continuous(
breaks = seq(0, 100, 10),
minor_breaks = NULL
) +
scale_y_continuous(
labels = scales::label_percent()
)
What value of k and rescaling method should be used?
knn_results |>
# Stacking the two accuracy columns together
pivot_longer(
cols = -k,
names_to = "rescale",
values_to = "accuracy"
) |>
slice_max(accuracy)
## # A tibble: 1 × 3
## k rescale accuracy
## <int> <chr> <dbl>
## 1 11 norm_acc 0.949
Using your answer in part 2c), create a confusion matrix for the results.
confusionMatrix(
data = knn.cv(train = cancer_norm |> dplyr::select(-diagnosis),
cl = cancer$diagnosis,
k = 11),
reference = cancer$diagnosis,
positive = "Malignant"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 345 17
## Malignant 12 195
##
## Accuracy : 0.949
## 95% CI : (0.9276, 0.9656)
## No Information Rate : 0.6274
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8905
##
## Mcnemar's Test P-Value : 0.4576
##
## Sensitivity : 0.9198
## Specificity : 0.9664
## Pos Pred Value : 0.9420
## Neg Pred Value : 0.9530
## Prevalence : 0.3726
## Detection Rate : 0.3427
## Detection Prevalence : 0.3638
## Balanced Accuracy : 0.9431
##
## 'Positive' Class : Malignant
##
How much does KNN improve the accuracy of diagnosis compared to just diagnosing every tumor as benign?
The no information rate is 62.7%, meaning if we diagnosed every tumor as benign, we’d be right about 63% of the time.
The model diagnoses about 95% of tumors correctly (92% of malignant as malignant and 96.6% of benign as benign), which is much more accurate than using the no information prediction!
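To put a rough number on that improvement, here is a small sketch using the counts from the confusion matrix above (345 + 17 + 12 + 195 = 569 tumors in total):
# Counts taken from the confusion matrix above
total   <- 345 + 17 + 12 + 195   # 569 tumors
nir     <- (345 + 12) / total    # proportion benign (no information rate), ~0.627
knn_acc <- (345 + 195) / total   # correct KNN predictions, ~0.949
knn_acc - nir                    # improvement of roughly 0.32 (about 32 percentage points)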
Instead of using k-nearest-neighbors, you’ll use a classification (decision) tree to predict if a tumor is malignant or benign
Grow the full classification tree then display the cp table in the knitted document. DO NOT DISPLAY THE FULL CLASSIFICATION TREE
# Keep this at the top of the code chunk
RNGversion("4.1.0"); set.seed(1234)
# Grow the full tree below
full_tree <-
rpart(
formula = diagnosis ~ ., # . means all the other columns
data = cancer,
parms = list(split = "information"),
cp = -1,
minsplit = 2,
minbucket = 1
)
# Display the cp table as a data frame
data.frame(full_tree$cptable)
## CP nsplit rel.error xerror xstd
## 1 0.773584906 0 1.000000000 1.0000000 0.05440140
## 2 0.021226415 1 0.226415094 0.2405660 0.03214090
## 3 0.017295597 3 0.183962264 0.2311321 0.03156514
## 4 0.016509434 6 0.132075472 0.2122642 0.03036546
## 5 0.014150943 8 0.099056604 0.2122642 0.03036546
## 6 0.004716981 11 0.056603774 0.1886792 0.02876510
## 7 0.003930818 17 0.028301887 0.1886792 0.02876510
## 8 0.002358491 23 0.004716981 0.2264151 0.03127141
## 9 -1.000000000 25 0.000000000 0.2264151 0.03127141
Find the cross-validation error (xerror) cutoff and the cp value to prune the full tree
full_tree$cptable |>
data.frame() |>
# finding the row with the smallest xerror
slice_min(xerror,
with_ties = F) |>
# Calculating the xerror + xstd
mutate(xcutoff = xerror + xstd) |>
# Pulling out the xcutoff and saving it
pull(xcutoff) ->
xcutoff
# Finding the cp value to prune the tree
full_tree$cptable |>
data.frame() |>
# Keeping the rows with a xerror below the xerror cut off
filter(xerror < xcutoff) |>
slice(1) |>
# Pulling out the cp value and saving it
pull(CP) ->
cp_cutoff
The xerror cutoff is: 0.2174
The cp value to prune the tree is: 0.0165
Prune the tree appropriately then plot the resulting tree. Name the pruned tree tree_pruned.
tree_pruned <-
prune(
tree = full_tree,
cp = cp_cutoff
)
rpart.plot(
x = tree_pruned,
type = 5,
extra = 101
)
Left-most node: If a tumor has points below 0.051 and an area less than 696, it is expected to be benign (low points + small area = benign)
Right-most node: If a tumor has points above 0.051 and an area above 791, it is expected to be malignant (high points + large area = malignant)
Which features, if any, are important when diagnosing a tumor as benign or malignant, according to the pruned classification tree? Which two are the least useful?
varImp(object = tree_pruned) |>
arrange(-Overall)
## Overall
## points 269.193890
## area 243.159674
## perimeter 240.623872
## radius 238.276245
## concavity 193.172384
## texture 64.485358
## smoothness 3.329745
## symmetry 3.329745
## compactness 0.000000
## dimension 0.000000
Points, area, perimeter, radius, and concavity all seem to have high predictive strength.
Compactness and dimension don’t add any predictive power to the classification tree.
tree_pruned |>
pluck("cptable") |>
data.frame()
## CP nsplit rel.error xerror xstd
## 1 0.77358491 0 1.0000000 1.0000000 0.05440140
## 2 0.02122642 1 0.2264151 0.2405660 0.03214090
## 3 0.01729560 3 0.1839623 0.2311321 0.03156514
## 4 0.01650943 6 0.1320755 0.2122642 0.03036546
Use the CP table above to calculate the estimated error rate (how often the tree will predict the cancer incorrectly) using some method of cross-validation.
The estimated error using cross-validation is the no-information (root node) error rate \(\times\) the pruned tree’s xerror:
\[\textrm{Estimated error} = (1-0.627) \times 0.212 = 0.079\]
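The same calculation in R, as a small sketch that assumes the root-node error is the proportion of malignant tumors and uses the smallest xerror in the pruned tree’s cp table:
# Root-node error: the error rate from predicting every tumor as benign
root_error <- mean(cancer$diagnosis == "Malignant")    # ~0.373
# Cross-validated relative error of the pruned tree
pruned_xerror <-
  tree_pruned$cptable |>
  data.frame() |>
  slice_min(xerror, with_ties = FALSE) |>
  pull(xerror)                                          # ~0.212
# Estimated misclassification rate
root_error * pruned_xerror                              # ~0.079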