## Load libraries
library(DT)
library(ggthemes)
library(leaflet)
library(patchwork)
library(plotly)
library(tidyverse)
library(data.table)
library(ggplot2)
library(cluster)
library(NbClust)
library(gridExtra)
library(qrcode)
## Render the published report's URL as a QR code
"https://rpubs.com/rgib/DS7130-Final-Project" %>%
  qrcode::qr_code() %>%
  plot()
An Exploratory Clustering Analysis of the Edibility of Gilled Mushrooms
Introduction
A dataset retrieved from the UCI Machine Learning repository documenting various visual and environmental characteristics of gilled mushrooms is frequently used for classification: to predict, based on those documented characteristics of a hypothetical gilled mushroom, whether it is edible or poisonous/of unknown edibility. Per the relevant documentation, classification models trained on this dataset can achieve perfect or near perfect accuracy in their predictions; however, backsolving why those models make the predictions they do is less obviously clear.
This analysis will:
- Identify which visual and environmental characteristics of these gilled mushrooms may be most useful in predicting whether a given mushroom is poisonous through exploratory data analysis.
- Evaluate the relevance of the chosen predictors by assessing how closely k-means clustering, with \(k = 2\) means, matches the target classes with those predictors vs. the complete set of predictors.
By evaluating a variable selection procedure, we seek to potentially support future classification-based analyses that could yield a more carefully-considered set of predictive models that are more parsimonious and meaningful than those that leave variable selection to classification estimators.
Data Extraction and Preparation
Source Code
## Load the raw dataset, attach column names, and recode every
## single-letter category code into a human-readable factor level
# The raw file has no header row, so the column names are supplied
# here. Every column is read as character rather than left to
# readr's type guessing (which can parse the "t"/"f" codes of
# `bruises` as logical), so each recode below compares against the
# raw single-letter codes. Codes without a match become NA.
mushrooms <- readr::read_csv(
  "data/agaricus-lepiota.data",
  col_names = c(
    "poisonous",
    "cap_shape",
    "cap_surface",
    "cap_color",
    "bruises",
    "odor",
    "gill_attachment",
    "gill_spacing",
    "gill_size",
    "gill_color",
    "stalk_shape",
    "stalk_root",
    "stalk_surface_above_ring",
    "stalk_surface_below_ring",
    "stalk_color_above_ring",
    "stalk_color_below_ring",
    "veil_type",
    "veil_color",
    "ring_number",
    "ring_type",
    "spore_print_color",
    "population",
    "habitat"
  ),
  col_types = readr::cols(.default = readr::col_character())
) %>%
  dplyr::mutate(
    # Target variable: ordered factor with explicit level order
    poisonous = factor(
      dplyr::case_when(
        poisonous == "e" ~ "Edible",
        poisonous == "p" ~ "Poisonous or Unknown"
      ),
      levels = c("Poisonous or Unknown", "Edible"),
      ordered = TRUE
    ),
    cap_shape = as.factor(
      dplyr::case_when(
        cap_shape == "b" ~ "Bell",
        cap_shape == "c" ~ "Conical",
        cap_shape == "x" ~ "Convex",
        cap_shape == "f" ~ "Flat",
        cap_shape == "k" ~ "Knobbed",
        cap_shape == "s" ~ "Sunken"
      )
    ),
    cap_surface = as.factor(
      dplyr::case_when(
        cap_surface == "f" ~ "Fibrous",
        cap_surface == "g" ~ "Grooves",
        cap_surface == "y" ~ "Scaly",
        cap_surface == "s" ~ "Smooth"
      )
    ),
    cap_color = as.factor(
      dplyr::case_when(
        cap_color == "n" ~ "Brown",
        cap_color == "b" ~ "Buff",
        cap_color == "c" ~ "Cinnamon",
        cap_color == "g" ~ "Gray",
        cap_color == "r" ~ "Green",
        cap_color == "p" ~ "Pink",
        cap_color == "u" ~ "Purple",
        cap_color == "e" ~ "Red",
        cap_color == "w" ~ "White",
        cap_color == "y" ~ "Yellow"
      )
    ),
    # Compared as raw "t"/"f" strings because the column is read
    # as character
    bruises = as.factor(
      dplyr::case_when(
        bruises == "t" ~ "Bruises",
        bruises == "f" ~ "No"
      )
    ),
    odor = as.factor(
      dplyr::case_when(
        odor == "a" ~ "Almond",
        odor == "l" ~ "Anise",
        odor == "c" ~ "Creosote",
        odor == "y" ~ "Fishy",
        odor == "f" ~ "Foul",
        odor == "m" ~ "Musty",
        odor == "n" ~ "None",
        odor == "p" ~ "Pungent",
        odor == "s" ~ "Spicy"
      )
    ),
    gill_attachment = as.factor(
      dplyr::case_when(
        gill_attachment == "a" ~ "Attached",
        gill_attachment == "d" ~ "Descending",
        gill_attachment == "f" ~ "Free",
        gill_attachment == "n" ~ "Notched"
      )
    ),
    gill_spacing = as.factor(
      dplyr::case_when(
        gill_spacing == "c" ~ "Close",
        gill_spacing == "w" ~ "Crowded",
        gill_spacing == "d" ~ "Distant"
      )
    ),
    gill_size = as.factor(
      dplyr::case_when(
        gill_size == "b" ~ "Broad",
        gill_size == "n" ~ "Narrow"
      )
    ),
    gill_color = as.factor(
      dplyr::case_when(
        gill_color == "k" ~ "Black",
        gill_color == "n" ~ "Brown",
        gill_color == "b" ~ "Buff",
        gill_color == "h" ~ "Chocolate",
        gill_color == "g" ~ "Gray",
        gill_color == "r" ~ "Green",
        gill_color == "o" ~ "Orange",
        gill_color == "p" ~ "Pink",
        gill_color == "u" ~ "Purple",
        gill_color == "e" ~ "Red",
        gill_color == "w" ~ "White",
        gill_color == "y" ~ "Yellow"
      )
    ),
    stalk_shape = as.factor(
      dplyr::case_when(
        stalk_shape == "e" ~ "Enlarging",
        stalk_shape == "t" ~ "Tapering"
      )
    ),
    # "?" is mapped to NA; the typed NA_character_ keeps every
    # right-hand side of the case_when a character value
    stalk_root = as.factor(
      dplyr::case_when(
        stalk_root == "b" ~ "Bulbous",
        stalk_root == "c" ~ "Club",
        stalk_root == "u" ~ "Cup",
        stalk_root == "e" ~ "Equal",
        stalk_root == "z" ~ "Rhizomorphs",
        stalk_root == "r" ~ "Rooted",
        stalk_root == "?" ~ NA_character_
      )
    ),
    stalk_surface_above_ring = as.factor(
      dplyr::case_when(
        stalk_surface_above_ring == "f" ~ "Fibrous",
        stalk_surface_above_ring == "y" ~ "Scaly",
        stalk_surface_above_ring == "k" ~ "Silky",
        stalk_surface_above_ring == "s" ~ "Smooth"
      )
    ),
    stalk_surface_below_ring = as.factor(
      dplyr::case_when(
        stalk_surface_below_ring == "f" ~ "Fibrous",
        stalk_surface_below_ring == "y" ~ "Scaly",
        stalk_surface_below_ring == "k" ~ "Silky",
        stalk_surface_below_ring == "s" ~ "Smooth"
      )
    ),
    stalk_color_above_ring = as.factor(
      dplyr::case_when(
        stalk_color_above_ring == "n" ~ "Brown",
        stalk_color_above_ring == "b" ~ "Buff",
        stalk_color_above_ring == "c" ~ "Cinnamon",
        stalk_color_above_ring == "g" ~ "Gray",
        stalk_color_above_ring == "o" ~ "Orange",
        stalk_color_above_ring == "p" ~ "Pink",
        stalk_color_above_ring == "e" ~ "Red",
        stalk_color_above_ring == "w" ~ "White",
        stalk_color_above_ring == "y" ~ "Yellow"
      )
    ),
    stalk_color_below_ring = as.factor(
      dplyr::case_when(
        stalk_color_below_ring == "n" ~ "Brown",
        stalk_color_below_ring == "b" ~ "Buff",
        stalk_color_below_ring == "c" ~ "Cinnamon",
        stalk_color_below_ring == "g" ~ "Gray",
        stalk_color_below_ring == "o" ~ "Orange",
        stalk_color_below_ring == "p" ~ "Pink",
        stalk_color_below_ring == "e" ~ "Red",
        stalk_color_below_ring == "w" ~ "White",
        stalk_color_below_ring == "y" ~ "Yellow"
      )
    ),
    veil_type = as.factor(
      dplyr::case_when(
        veil_type == "p" ~ "Partial",
        veil_type == "u" ~ "Universal"
      )
    ),
    veil_color = as.factor(
      dplyr::case_when(
        veil_color == "n" ~ "Brown",
        veil_color == "o" ~ "Orange",
        veil_color == "w" ~ "White",
        veil_color == "y" ~ "Yellow"
      )
    ),
    ring_number = as.factor(
      dplyr::case_when(
        ring_number == "n" ~ "None",
        ring_number == "o" ~ "One",
        ring_number == "t" ~ "Two"
      )
    ),
    ring_type = as.factor(
      dplyr::case_when(
        ring_type == "c" ~ "Cobwebby",
        ring_type == "e" ~ "Evanescent",
        ring_type == "f" ~ "Flaring",
        ring_type == "l" ~ "Large",
        ring_type == "n" ~ "None",
        ring_type == "p" ~ "Pendant",
        ring_type == "s" ~ "Sheathing",
        ring_type == "z" ~ "Zone"
      )
    ),
    spore_print_color = as.factor(
      dplyr::case_when(
        spore_print_color == "k" ~ "Black",
        spore_print_color == "n" ~ "Brown",
        spore_print_color == "b" ~ "Buff",
        spore_print_color == "h" ~ "Chocolate",
        spore_print_color == "r" ~ "Green",
        spore_print_color == "o" ~ "Orange",
        spore_print_color == "u" ~ "Purple",
        spore_print_color == "w" ~ "White",
        spore_print_color == "y" ~ "Yellow"
      )
    ),
    population = as.factor(
      dplyr::case_when(
        population == "a" ~ "Abundant",
        population == "c" ~ "Clustered",
        population == "n" ~ "Numerous",
        population == "s" ~ "Scattered",
        population == "v" ~ "Several",
        population == "y" ~ "Solitary"
      )
    ),
    habitat = as.factor(
      dplyr::case_when(
        habitat == "g" ~ "Grasses",
        habitat == "l" ~ "Leaves",
        habitat == "m" ~ "Meadows",
        habitat == "p" ~ "Paths",
        habitat == "u" ~ "Urban",
        habitat == "w" ~ "Waste",
        habitat == "d" ~ "Woods"
      )
    )
  )
The code above reads in the raw data as a comma-delimited file and assigns column names upon import following the documentation referenced below. Additionally, human-readable names were assigned to each level of all 23 variables in the dataset to improve interpretability.
Interactive Table
Source Code and Output
## Render the prepared dataset as an interactive table
DT::datatable(mushrooms)
Description
Above is an interactive searchable table containing the “mushrooms” dataset with the levels of each variable mapped into interpretable, human-readable names rather than the original letter-based categories from the source dataset.
Additional Information
The “mushrooms” dataset referenced above and throughout the code and analysis that follows was retrieved from the UCI Machine Learning Repository, where each row in the dataset (see the interactive table above) represents a “hypothetical” sample of a gilled mushroom originally extracted from an Audubon Society field guide. A few notes on this dataset:
- Our understanding is that “hypothetical” (as the documentation puts it) means, in this context, a possible configuration of each gilled mushroom species covered, rather than indicating that the dataset is synthetic.
- The target variable, while named “poisonous” in the dataset above and its documentation, will be referred to as “Edibility” in most contexts within this analysis.
- Additionally, the two levels of this variable are not “Poisonous” and “Edible” but rather “Poisonous or Unknown” and “Edible” since there is a chance that any mushroom of unknown edibility is indeed poisonous and unsafe to consume.
Exploratory Data Analysis
Overview
Source Code and Output
## Summarize the structure of the prepared dataset
# Print the row and column counts, plus each column's name,
# data type, and leading values
dplyr::glimpse(mushrooms)
Rows: 8,124
Columns: 23
$ poisonous <ord> Poisonous or Unknown, Edible, Edible, Poisono…
$ cap_shape <fct> Convex, Convex, Bell, Convex, Convex, Convex,…
$ cap_surface <fct> Smooth, Smooth, Smooth, Scaly, Smooth, Scaly,…
$ cap_color <fct> Brown, Yellow, White, White, Gray, Yellow, Wh…
$ bruises <fct> Bruises, Bruises, Bruises, Bruises, No, Bruis…
$ odor <fct> Pungent, Almond, Anise, Pungent, None, Almond…
$ gill_attachment <fct> Free, Free, Free, Free, Free, Free, Free, Fre…
$ gill_spacing <fct> Close, Close, Close, Close, Crowded, Close, C…
$ gill_size <fct> Narrow, Broad, Broad, Narrow, Broad, Broad, B…
$ gill_color <fct> Black, Black, Brown, Brown, Black, Brown, Gra…
$ stalk_shape <fct> Enlarging, Enlarging, Enlarging, Enlarging, T…
$ stalk_root <fct> Equal, Club, Club, Equal, Equal, Club, Club, …
$ stalk_surface_above_ring <fct> Smooth, Smooth, Smooth, Smooth, Smooth, Smoot…
$ stalk_surface_below_ring <fct> Smooth, Smooth, Smooth, Smooth, Smooth, Smoot…
$ stalk_color_above_ring <fct> White, White, White, White, White, White, Whi…
$ stalk_color_below_ring <fct> White, White, White, White, White, White, Whi…
$ veil_type <fct> Partial, Partial, Partial, Partial, Partial, …
$ veil_color <fct> White, White, White, White, White, White, Whi…
$ ring_number <fct> One, One, One, One, One, One, One, One, One, …
$ ring_type <fct> Pendant, Pendant, Pendant, Pendant, Evanescen…
$ spore_print_color <fct> Black, Brown, Brown, Black, Brown, Black, Bla…
$ population <fct> Scattered, Numerous, Numerous, Scattered, Abu…
$ habitat <fct> Urban, Grasses, Meadows, Urban, Grasses, Gras…
# Tabulate the per-level counts of every variable, including the
# number of NAs where a variable has any
knitr::kable(summary(mushrooms))
| poisonous | cap_shape | cap_surface | cap_color | bruises | odor | gill_attachment | gill_spacing | gill_size | gill_color | stalk_shape | stalk_root | stalk_surface_above_ring | stalk_surface_below_ring | stalk_color_above_ring | stalk_color_below_ring | veil_type | veil_color | ring_number | ring_type | spore_print_color | population | habitat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Poisonous or Unknown:3916 | Bell : 452 | Fibrous:2320 | Brown :2284 | Bruises:3376 | None :3528 | Attached: 210 | Close :6812 | Broad :5612 | Buff :1728 | Enlarging:3516 | Bulbous:3776 | Fibrous: 552 | Fibrous: 600 | White :4464 | White :4384 | Partial:8124 | Brown : 96 | None: 36 | Evanescent:2776 | White :2388 | Abundant : 384 | Grasses:2148 | |
| Edible :4208 | Conical: 4 | Grooves: 4 | Gray :1840 | No :4748 | Foul :2160 | Free :7914 | Crowded:1312 | Narrow:2512 | Pink :1492 | Tapering :4608 | Club : 556 | Scaly : 24 | Scaly : 284 | Pink :1872 | Pink :1872 | NA | Orange: 96 | One :7488 | Flaring : 48 | Brown :1968 | Clustered: 340 | Leaves : 832 | |
| NA | Convex :3656 | Scaly :3244 | Red :1500 | NA | Fishy : 576 | NA | NA | NA | White :1202 | NA | Equal :1120 | Silky :2372 | Silky :2304 | Gray : 576 | Gray : 576 | NA | White :7924 | Two : 600 | Large :1296 | Black :1872 | Numerous : 400 | Meadows: 292 | |
| NA | Flat :3152 | Smooth :2556 | Yellow :1072 | NA | Spicy : 576 | NA | NA | NA | Brown :1048 | NA | Rooted : 192 | Smooth :5176 | Smooth :4936 | Brown : 448 | Brown : 512 | NA | Yellow: 8 | NA | None : 36 | Chocolate:1632 | Scattered:1248 | Paths :1144 | |
| NA | Knobbed: 828 | NA | White :1040 | NA | Almond : 400 | NA | NA | NA | Gray : 752 | NA | NA’s :2480 | NA | NA | Buff : 432 | Buff : 432 | NA | NA | NA | Pendant :3968 | Green : 72 | Several :4040 | Urban : 368 | |
| NA | Sunken : 32 | NA | Buff : 168 | NA | Anise : 400 | NA | NA | NA | Chocolate: 732 | NA | NA | NA | NA | Orange : 192 | Orange : 192 | NA | NA | NA | NA | Buff : 48 | Solitary :1712 | Waste : 192 | |
| NA | NA | NA | (Other): 220 | NA | (Other): 484 | NA | NA | NA | (Other) :1170 | NA | NA | NA | NA | (Other): 140 | (Other): 156 | NA | NA | NA | NA | (Other) : 144 | NA | Woods :3148 |
# Plot the number of observations in each class of the target
# variable as a horizontal bar chart
ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = poisonous)
) +
  ggplot2::stat_count() +
  ggplot2::labs(
    title = "Count Frequency by Edibility",
    x = "Count Frequency",
    y = "Edibility"
  ) +
  ggthemes::theme_pander()
Description
- The structural output above reports the number of observations and variables in the dataset, variable names and data types, and the first few observations for each variable.
- The summary table shows the counts for the six most frequent levels of each variable, with the remaining levels pooled into “(Other)”, if the variable has more than seven levels, or the counts for all of its levels if it has seven or fewer.
- This table also reports the number of NA’s for each variable if relevant.
- The horizontal bar chart above displays the frequency of mushrooms in the sample by edibility.
Relevant Analysis
Per the summary frequency table, dataset structure, and horizontal bar chart above:
- There are 8,124 observations across 23 variables in this dataset. Only one variable has NAs: Stalk Root (“stalk_root” in the dataset) with 2,480 such observations.
- The target variable, Edibility (“poisonous” in the dataset), is approximately balanced, with a roughly similar distribution of values across the two classes, “Edible” and “Poisonous or Unknown.”
- Most predictor variables in this dataset show imbalanced level distributions, with a single or small subset of levels accounting for most of the sampled observations.
- Many predictors appear to have a mixture of:
  - A few dominant levels that contain nearly all observations
  - Several low-frequency levels that contain relatively few observations (approx. < 500)
- In future classification modeling, these sparse predictor levels may need to be addressed through recoding, collapsing categories, or other pre-processing methods, but that is beyond the scope of this course.
Cap Characteristics
Source Code and Output
## Build and display the patchwork of cap-characteristic charts
# Each predictor gets two horizontal bar charts:
#   *_freq: observation count per level (bars colored by level,
#           legend hidden)
#   *_100:  share of each edibility class within each level
#           (bars stacked to 100%)

# Cap shape: count per level
cap_shape_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = cap_shape, fill = cap_shape)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::labs(x = "Frequency", y = "Cap Shape")

# Cap shape: edibility share per level
cap_shape_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = cap_shape, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(x = "Relative Frequency", y = NULL, fill = "Edibility")

# Cap surface: count per level
cap_surface_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = cap_surface, fill = cap_surface)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::labs(x = "Frequency", y = "Cap Surface Type")

# Cap surface: edibility share per level
cap_surface_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = cap_surface, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(x = "Relative Frequency", y = NULL, fill = "Edibility")

# Cap color: count per level
cap_color_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = cap_color, fill = cap_color)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::labs(x = "Frequency", y = "Cap Color")

# Cap color: edibility share per level
cap_color_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = cap_color, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(x = "Relative Frequency", y = NULL, fill = "Edibility")

# Stack one row per predictor (3 rows), pairing its count chart
# with its 100% stacked chart
cap_viz_patchwork <- (cap_shape_freq + cap_shape_100) /
  (cap_surface_freq + cap_surface_100) /
  (cap_color_freq + cap_color_100)

# Collect the legends, add the title, apply the shared theme to
# every panel, and display the layout
cap_viz_patchwork +
  patchwork::plot_layout(guides = "collect") +
  patchwork::plot_annotation(
    title = "Relative Frequency of Edibility and Count Frequency by Level of\nVarious Cap Characteristics"
  ) &
  ggthemes::theme_pander() &
  ggplot2::theme(legend.position = "top")
Relevant Analysis
- Consistent with the summary statistics, all three cap-characteristic variables above—Cap Shape, Cap Surface Type, and Cap Color—have uneven level distributions, with some levels occurring much more frequently than others. Several levels are also relatively sparse, which may limit their usefulness in later modeling unless categories are collapsed or re-coded.
- Although there are some level-to-level differences in the relative frequency of Edible versus Poisonous mushrooms for all three variables, none of these differences meet or exceed our screening threshold of a 2:1 ratio between the two target levels. Therefore, these variables do not appear to be strong enough to retain for the clustering or classification-focused portions of our analysis.
Bruise Characteristics
Source Code and Output
## Build and display the patchwork of bruise charts
# Bruises: observation count per level (bars colored by level,
# legend hidden)
bruises_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = bruises, fill = bruises)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::labs(x = "Frequency", y = "Bruises?")

# Bruises: share of each edibility class within each level
# (bars stacked to 100%)
bruises_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = bruises, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(x = "Relative Frequency", y = NULL, fill = "Edibility")

# Place the count chart and the 100% stacked chart side by side
# (1 row, 2 columns)
bruises_viz_patchwork <- bruises_freq + bruises_100

# Collect the legends, add the title, apply the shared theme to
# every panel, and display the layout
bruises_viz_patchwork +
  patchwork::plot_layout(guides = "collect") +
  patchwork::plot_annotation(
    title = "Relative Frequency of Edibility and Count Frequency by \nWhether a Given Mushroom is Bruised"
  ) &
  ggthemes::theme_pander() &
  ggplot2::theme(legend.position = "top")
Relevant Analysis
- Unlike several of the cap-related variables, Bruises shows a clear and practically meaningful difference in the relative frequency of Edible versus Poisonous mushrooms across its two levels.
- Mushrooms with Bruises are predominantly Edible (about 81.5% Edible vs. 18.5% Poisonous), while mushrooms with No Bruises are predominantly Poisonous (about 30.7% Edible vs. 69.3% Poisonous).
- This difference exceeds our screening guideline of a 2:1 ratio between the two target levels, indicating that Bruises is a strong predictor of interest.
- Because the variable has only two levels, is easy to interpret, and shows a strong separation by Edibility, Bruises should be retained for the main clustering and classification-oriented portions of the analysis.
Odor Characteristics
Source Code and Output
## Build and display the patchwork of odor charts
# Odor: observation count per level (bars colored by level,
# legend hidden)
odor_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = odor, fill = odor)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::labs(x = "Frequency", y = "Odor")

# Odor: share of each edibility class within each level
# (bars stacked to 100%)
odor_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = odor, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(x = "Relative Frequency", y = NULL, fill = "Edibility")

# Place the count chart and the 100% stacked chart side by side
# (1 row, 2 columns)
odor_viz_patchwork <- odor_freq + odor_100

# Collect the legends, add the title, apply the shared theme to
# every panel, and display the layout
odor_viz_patchwork +
  patchwork::plot_layout(guides = "collect") +
  patchwork::plot_annotation(
    title = "Relative Frequency of Edibility and Count Frequency by a Mushroom's Odor"
  ) &
  ggthemes::theme_pander() &
  ggplot2::theme(legend.position = "top")
Relevant Analysis
- Odor shows the strongest separation between Edible and Poisonous mushrooms among the predictors examined so far.
- Several odor levels are perfectly separated by the target: Almond and Anise are entirely Edible, whereas Creosote, Fishy, Foul, Musty, Pungent, and Spicy are entirely Poisonous.
- The level None is also highly informative, with the vast majority of mushrooms in this category being Edible.
- These differences are far beyond our screening guideline of a 2:1 ratio between the two target levels, indicating that Odor is an extremely strong predictor of interest.
- Because Odor shows both clear separation by Edibility and practical interpretability, it should be retained as one of the most important variables for the main clustering and classification-oriented portions of the analysis.
Gill Characteristics
Source Code and Output
## Build and display the patchwork of gill-characteristic charts
# Each predictor gets two horizontal bar charts:
#   *_freq: observation count per level (bars colored by level,
#           legend hidden)
#   *_100:  share of each edibility class within each level
#           (bars stacked to 100%)

# Gill attachment: count per level
gill_attachment_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = gill_attachment, fill = gill_attachment)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::labs(x = "Frequency", y = "Gill Attachment")

# Gill attachment: edibility share per level
gill_attachment_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = gill_attachment, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(x = "Relative Frequency", y = NULL, fill = "Edibility")

# Gill spacing: count per level
gill_spacing_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = gill_spacing, fill = gill_spacing)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::labs(x = "Frequency", y = "Gill Spacing")

# Gill spacing: edibility share per level
gill_spacing_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = gill_spacing, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(x = "Relative Frequency", y = NULL, fill = "Edibility")

# Gill size: count per level
gill_size_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = gill_size, fill = gill_size)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::labs(x = "Frequency", y = "Gill Size")

# Gill size: edibility share per level
gill_size_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = gill_size, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(x = "Relative Frequency", y = NULL, fill = "Edibility")

# Gill color: count per level
gill_color_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = gill_color, fill = gill_color)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::labs(x = "Frequency", y = "Gill Color")

# Gill color: edibility share per level
gill_color_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = gill_color, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(x = "Relative Frequency", y = NULL, fill = "Edibility")

# Stack one row per predictor (4 rows, 2 columns), pairing its
# count chart with its 100% stacked chart
gill_viz_patchwork <- (gill_attachment_freq + gill_attachment_100) /
  (gill_spacing_freq + gill_spacing_100) /
  (gill_size_freq + gill_size_100) /
  (gill_color_freq + gill_color_100)

# Collect the legends, add the title, apply the shared theme to
# every panel, and display the layout
gill_viz_patchwork +
  patchwork::plot_layout(guides = "collect") +
  patchwork::plot_annotation(
    title = "Relative Frequency of Edibility and Count Frequency by Level of\nVarious Gill Characteristics"
  ) &
  ggthemes::theme_pander() &
  ggplot2::theme(legend.position = "top")
Relevant Analysis
Per the left column in the plot layout above, the gill characteristics variables in this dataset—Gill Attachment, Gill Spacing, Gill Size, and Gill Color—all demonstrate the level imbalances previously identified both (a) across the dataset as a whole and (b) among the cap characteristics.
Of the variables in this set of predictors, Gill Size and Gill Color appear to be relevant to future classification and clustering analyses.
- Gill Size, per the 100% stacked bar chart displaying the distribution of the target variable by level of the predictor, exceeds the 2:1 ratio between levels of the response, indicating that it may be useful for both clustering and classification tasks.
- Gill Color also shows meaningful separation by the target across several levels, with multiple categories meeting or exceeding the 2:1 ratio between response levels and some levels showing especially strong differences in edibility.
By contrast, Gill Attachment and Gill Spacing show weaker or less consistent separation across levels of the response and therefore appear less useful for the main clustering and classification-oriented portions of the analysis.
Therefore, among the gill-related predictors, Gill Size and Gill Color should be retained for the main analysis, while Gill Attachment and Gill Spacing do not appear sufficiently informative to include.
Stalk Characteristics
Source Code and Output
## Creating patchwork for stalk characteristics
# Create a horizontal bar chart for stalk
# shape by level of the response/target variable
stalk_shape_freq <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_shape,
fill = stalk_shape
)
) +
ggplot2::stat_count(show.legend = FALSE) +
ggplot2::labs(
x = "Frequency",
y = "Stalk Shape"
)
# Create a 100% stacked horizontal bar chart for stalk
# shape by level of the response/target variable
stalk_shape_100 <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_shape,
fill = poisonous
)
) +
ggplot2::geom_bar(
position = "fill"
) +
ggplot2::scale_x_continuous(
breaks = c(0.00, 0.33, 0.5, 0.66, 1)
) +
ggplot2::labs(
x = "Relative Frequency",
y = NULL,
fill = "Edibility",
)
# Create a horizontal bar chart for stalk
# root by level of the response/target variable
stalk_root_freq <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_root,
fill = stalk_root
)
) +
ggplot2::stat_count(show.legend = FALSE) +
ggplot2::labs(
x = "Frequency",
y = "Stalk Root"
)
# Create a 100% stacked horizontal bar chart for stalk
# root by level of the response/target variable
stalk_root_100 <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_root,
fill = poisonous
)
) +
ggplot2::geom_bar(
position = "fill"
) +
ggplot2::scale_x_continuous(
breaks = c(0.00, 0.33, 0.5, 0.66, 1)
) +
ggplot2::labs(
x = "Relative Frequency",
y = NULL,
fill = "Edibility",
)
# Create a horizontal bar chart for stalk
# surface above ring by level of the response/target
# variable
stalk_surface_above_ring_freq <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_surface_above_ring,
fill = stalk_surface_above_ring
)
) +
ggplot2::stat_count(show.legend = FALSE) +
ggplot2::labs(
x = "Frequency",
y = "Stalk Surface Above Ring"
)
# Create a 100% stacked horizontal bar chart for stalk
# surface above ring by level of the response/target
# variable
stalk_surface_above_ring_100 <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_surface_above_ring,
fill = poisonous
)
) +
ggplot2::geom_bar(
position = "fill"
) +
ggplot2::scale_x_continuous(
breaks = c(0.00, 0.33, 0.5, 0.66, 1)
) +
ggplot2::labs(
x = "Relative Frequency",
y = NULL,
fill = "Edibility",
)
# Create a horizontal bar chart for stalk
# surface below ring by level of the response/target
# variable
stalk_surface_below_ring_freq <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_surface_below_ring,
fill = stalk_surface_below_ring
)
) +
ggplot2::stat_count(show.legend = FALSE) +
ggplot2::labs(
x = "Frequency",
y = "Stalk Surface Below Ring"
)
# Create a 100% stacked horizontal bar chart for stalk
# surface below ring by level of the response/target
# variable
stalk_surface_below_ring_100 <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_surface_below_ring,
fill = poisonous
)
) +
ggplot2::geom_bar(
position = "fill"
) +
ggplot2::scale_x_continuous(
breaks = c(0.00, 0.33, 0.5, 0.66, 1)
) +
ggplot2::labs(
x = "Relative Frequency",
y = NULL,
fill = "Edibility",
)
# Create a horizontal bar chart for stalk
# color above ring by level of the response/target
# variable
stalk_color_above_ring_freq <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_color_above_ring,
fill = stalk_color_above_ring
)
) +
ggplot2::stat_count(show.legend = FALSE) +
ggplot2::labs(
x = "Frequency",
y = "Stalk Color Above Ring"
)
# Create a 100% stacked horizontal bar chart for stalk
# shape by level of the response/target variable
stalk_color_above_ring_100 <- mushrooms %>%
ggplot2::ggplot(
mapping = ggplot2::aes(
y = stalk_color_above_ring,
fill = poisonous
)
) +
ggplot2::geom_bar(
position = "fill"
) +
ggplot2::scale_x_continuous(
breaks = c(0.00, 0.33, 0.5, 0.66, 1)
) +
ggplot2::labs(
x = "Relative Frequency",
y = NULL,
fill = "Edibility",
)
# Count of mushrooms at each level of stalk color below ring
# (horizontal bars, one fill color per level, legend suppressed)
stalk_color_below_ring_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(
    y = stalk_color_below_ring,
    fill = stalk_color_below_ring
  )
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::xlab("Frequency") +
  ggplot2::ylab("Stalk Color Below Ring")
# Share of each level of the response/target variable within every
# level of stalk color below ring (100% stacked horizontal bars)
stalk_color_below_ring_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = stalk_color_below_ring, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(fill = "Edibility", x = "Relative Frequency", y = NULL)
# Defines a patchwork plot layout with 6 rows (one per stalk
# characteristic) and 2 columns: each row places a predictor's
# horizontal bar chart next to its 100% stacked bar chart
stalk_viz_patchwork <- (
  (stalk_shape_freq + stalk_shape_100) /
    (stalk_root_freq + stalk_root_100) /
    (stalk_surface_above_ring_freq + stalk_surface_above_ring_100) /
    (stalk_surface_below_ring_freq + stalk_surface_below_ring_100) /
    (stalk_color_above_ring_freq + stalk_color_above_ring_100) /
    (stalk_color_below_ring_freq + stalk_color_below_ring_100)
)
# Customizes and displays the patchwork plot layout: `+` attaches the
# layout and title to the patchwork first, then `&` adds the theme
# settings to the plots in the patchwork
(
  stalk_viz_patchwork +
    patchwork::plot_layout(guides = "collect") +
    patchwork::plot_annotation(
      title = "Relative Frequency of Edibility and Count Frequency by Level of\nVarious Stalk Characteristics"
    )
) &
  ggthemes::theme_pander() &
  ggplot2::theme(legend.position = "top")
Relevant Analysis
Per the left column in the plot layout above, the stalk-related predictors—Stalk Shape, Stalk Root, Stalk Surface Above Ring, Stalk Surface Below Ring, Stalk Color Above Ring, and Stalk Color Below Ring—show the same previously noted pattern of uneven level distributions, with a small number of dominant levels accounting for most observations and several lower-frequency levels appearing much less often.
Among these variables, Stalk Surface Above Ring appears to be the most appropriate stalk-related predictor to retain for the main analysis.
- Both Stalk Surface Above Ring and Stalk Surface Below Ring show strong separation by the target, particularly for the Silky level, which is strongly associated with Poisonous or Unknown mushrooms.
- However, because these two variables display very similar patterns across their levels, retaining both would add little new information. For that reason, only Stalk Surface Above Ring will be carried forward into the final predictor set.
By contrast, Stalk Shape shows only modest differences in the relative frequency of the target across its two levels and therefore does not appear sufficiently informative for the main clustering- and classification-oriented portions of the analysis.
Although Stalk Root shows some separation by the target, the variable contains a substantial number of missing values, so it will not be retained in the final set of predictors.
Stalk Color Above Ring and Stalk Color Below Ring do show strong separation for some individual levels, including levels that are entirely or almost entirely associated with one target level. However, these patterns are not consistent enough across the variables as a whole to justify retaining them in the final predictor set.
Therefore, among the stalk-related predictors, only Stalk Surface Above Ring will be retained for the main analysis, while Stalk Shape, Stalk Root, Stalk Surface Below Ring, Stalk Color Above Ring, and Stalk Color Below Ring will be excluded.
Veil Characteristics
Source Code and Output
## Creating patchwork for veil visual characteristics
# Count of mushrooms at each level of veil type
# (horizontal bars, one fill color per level, legend suppressed)
veil_type_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = veil_type, fill = veil_type)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::xlab("Frequency") +
  ggplot2::ylab("Veil Type")
# Share of each level of the response variable within every level of
# veil type (100% stacked horizontal bars)
veil_type_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = veil_type, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(fill = "Edibility", x = "Relative Frequency", y = NULL)
# Count of mushrooms at each level of veil color
# (horizontal bars, one fill color per level, legend suppressed)
veil_color_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = veil_color, fill = veil_color)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::xlab("Frequency") +
  ggplot2::ylab("Veil Color")
# Share of each level of the response variable within every level of
# veil color (100% stacked horizontal bars)
veil_color_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = veil_color, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(fill = "Edibility", x = "Relative Frequency", y = NULL)
# Initializes a patchwork plot layout with 2 rows (one for each veil
# characteristic) and 2 columns (bar chart next to 100% stacked chart)
veil_viz_patchwork <- (
  (veil_type_freq + veil_type_100) /
    (veil_color_freq + veil_color_100)
)
# Customizes and displays the patchwork plot layout: `+` attaches the
# layout and title to the patchwork first, then `&` adds the theme
# settings to the plots in the patchwork
(
  veil_viz_patchwork +
    patchwork::plot_layout(guides = "collect") +
    patchwork::plot_annotation(
      title = "Relative Frequency of Edibility and Count Frequency by Level of\nVarious Veil Characteristics"
    )
) &
  ggthemes::theme_pander() &
  ggplot2::theme(legend.position = "top")
Relevant Analysis
- Per the horizontal bar chart above (left), the mushrooms sampled primarily have white veils, with a smattering of mushrooms that have orange, brown, and yellow veil colors.
- While three of the four classes of this variable—“Yellow,” “Orange,” and “Brown”—are either all edible or all poisonous, the majority level—mushrooms with “white” veil colors—is nearly evenly split, indicating that this split may be (a) representative of the broader population and (b) a sign that this would be a weak variable for use in clustering and classification tasks.
Ring Characteristics
Source Code and Output
## Creating patchwork for ring visual characteristics
# Count of mushrooms at each level of ring number
# (horizontal bars, one fill color per level, legend suppressed)
ring_number_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = ring_number, fill = ring_number)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::xlab("Frequency") +
  ggplot2::ylab("Ring Number")
# Share of each level of the response variable within every level of
# ring number (100% stacked horizontal bars)
ring_number_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = ring_number, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(fill = "Edibility", x = "Relative Frequency", y = NULL)
# Count of mushrooms at each level of ring type
# (horizontal bars, one fill color per level, legend suppressed)
ring_type_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = ring_type, fill = ring_type)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::xlab("Frequency") +
  ggplot2::ylab("Ring Type")
# Share of each level of the response variable within every level of
# ring type (100% stacked horizontal bars)
ring_type_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = ring_type, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(fill = "Edibility", x = "Relative Frequency", y = NULL)
# Initialize a patchwork plot layout with 2 rows (one for each ring
# visual characteristic) and 2 columns (bar chart next to 100%
# stacked chart)
ring_viz_patchwork <- (
  (ring_number_freq + ring_number_100) /
    (ring_type_freq + ring_type_100)
)
# Customize and display the patchwork plot layout: `+` attaches the
# layout and title to the patchwork first, then `&` adds the theme
# settings to the plots in the patchwork
(
  ring_viz_patchwork +
    patchwork::plot_layout(guides = "collect") +
    patchwork::plot_annotation(
      title = "Relative Frequency of Edibility and Count Frequency by Level of\nVarious Ring Characteristics"
    )
) &
  ggthemes::theme_pander() &
  ggplot2::theme(legend.position = "top")
Relevant Analysis
- Both ring characteristics variables—Ring Number and Ring Type—have severe level imbalances consistent with the rest of the dataset.
- While two of the three levels of Ring Number exceed the previously outlined threshold (a 2:1 ratio between levels of the target variable for each level of a given predictor) at which a variable may be useful for clustering or classification, the majority level—“one”—is a roughly even split between levels of the target, indicating that this level may simply be representative of the broader population and that the variable may not be useful for classification and/or clustering.
- However, all five levels of Ring Type appear to exceed that threshold, which supports its future use in clustering and classification tasks to predict edibility.
Spore Print Characteristics
Source Code and Output
## Creating patchwork for spore print characteristics
# Count of mushrooms at each level of spore print color
# (horizontal bars, one fill color per level, legend suppressed)
spore_print_color_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = spore_print_color, fill = spore_print_color)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::xlab("Frequency") +
  ggplot2::ylab("Spore Print Color")
# Share of each level of the response variable within every level of
# spore print color (100% stacked horizontal bars)
spore_print_color_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = spore_print_color, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(fill = "Edibility", x = "Relative Frequency", y = NULL)
# Initialize a patchwork plot layout that places the two spore print
# color charts side by side (1 row, 2 columns)
spore_viz_patchwork <- (
  spore_print_color_freq + spore_print_color_100
)
# Customize and display the patchwork plot layout: `+` attaches the
# layout and title to the patchwork first, then `&` adds the theme
# settings to the plots in the patchwork
(
  spore_viz_patchwork +
    patchwork::plot_layout(guides = "collect") +
    patchwork::plot_annotation(
      title = "Relative Frequency of Edibility and Count Frequency by Level of\nVarious Spore Print Color Characteristics"
    )
) &
  ggthemes::theme_pander() &
  ggplot2::theme(legend.position = "top")
Relevant Analysis
- Level imbalances persist with Spore Print Color. While no single level of the variable accounts for a majority of observations in the dataset, a minority of the classes—“White,” “Chocolate,” “Brown,” and “Black”—account for a majority of the observations indicating corrective action may be required, as with many other variables in this dataset, to develop a robust classification model for the target, poisonous.
- All of the levels of Spore Print Color have a ratio between levels of the target variable that exceeds 2:1, indicating this variable may be useful for predictive modeling in clustering (which will be covered in this analysis) and future classification approaches to this data.
- Substantial imbalances in the response variable persist in the majority classes previously mentioned, which lends further weight to the conclusion above.
Environmental Characteristics
Source Code and Output
## Creating patchwork for environmental characteristics
# Count of mushrooms at each level of population
# (horizontal bars, one fill color per level, legend suppressed)
population_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = population, fill = population)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::xlab("Frequency") +
  ggplot2::ylab("Population Density")
# Share of each level of the response variable within every level of
# population (100% stacked horizontal bars)
population_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = population, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(fill = "Edibility", x = "Relative Frequency", y = NULL)
# Count of mushrooms at each level of habitat
# (horizontal bars, one fill color per level, legend suppressed)
habitat_freq <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = habitat, fill = habitat)
) +
  ggplot2::stat_count(show.legend = FALSE) +
  ggplot2::xlab("Frequency") +
  ggplot2::ylab("Habitat")
# Share of each level of the response variable within every level of
# habitat (100% stacked horizontal bars)
habitat_100 <- ggplot2::ggplot(
  data = mushrooms,
  mapping = ggplot2::aes(y = habitat, fill = poisonous)
) +
  ggplot2::geom_bar(position = "fill") +
  ggplot2::scale_x_continuous(breaks = c(0.00, 0.33, 0.5, 0.66, 1)) +
  ggplot2::labs(fill = "Edibility", x = "Relative Frequency", y = NULL)
# Initialize a patchwork plot layout with 2 rows (one for each
# environmental characteristic) and 2 columns (bar chart next to
# 100% stacked chart)
environment_viz_patchwork <- (
  (population_freq + population_100) /
    (habitat_freq + habitat_100)
)
# Customize and display the patchwork plot layout: `+` attaches the
# layout and title to the patchwork first, then `&` adds the theme
# settings to the plots in the patchwork
(
  environment_viz_patchwork +
    patchwork::plot_layout(guides = "collect") +
    patchwork::plot_annotation(
      title = "Relative Frequency of Edibility and Count Frequency by Level of\nVarious Environmental Characteristics"
    )
) &
  ggthemes::theme_pander() &
  ggplot2::theme(legend.position = "top")
Relevant Analysis
- The two environmental predictors shown above—Population Density and Habitat—also display uneven level distributions, with a few dominant levels accounting for most observations and several lower-frequency levels appearing much less often.
- Both variables show some differences in the relative frequency of Edible versus Poisonous or Unknown mushrooms across their levels, indicating that they contain some contextual variation with respect to the target.
- However, these patterns are not consistently strong enough across levels to meet the screening criterion used in this analysis, since the separation does not reliably satisfy the 2:1 ratio between target levels.
- Therefore, although Population Density and Habitat provide some descriptive context, neither variable will be retained in the final set of predictors for the main clustering- and classification-oriented portions of the analysis.
Data Modeling and Results
Cluster Analysis for Reduced Model
Source Code and Output
## Cluster Analysis for Reduced Model
# Keep only the seven screened predictors; the DV (poisonous) is left
# out so that it cannot influence the clustering. droplevels() then
# discards factor levels that no longer occur in the data.
cluster_input <- droplevels(
  dplyr::select(
    mushrooms,
    odor,
    bruises,
    gill_size,
    gill_color,
    stalk_surface_above_ring,
    ring_type,
    spore_print_color
  )
)
# One-hot encode the categorical predictors: model.matrix() converts
# each factor level into a 0/1 dummy column and prepends an intercept
# column, which `[, -1]` removes again
cluster_dummy <- as.data.frame(
  model.matrix(~ ., data = cluster_input)[, -1, drop = FALSE]
)
# Remove any zero-variance columns because they add no clustering
# information. vapply() (rather than sapply()) guarantees a plain
# numeric vector of variances regardless of the input, so the logical
# column mask always has one entry per column.
cluster_dummy <- cluster_dummy[
  ,
  vapply(cluster_dummy, var, numeric(1)) > 0,
  drop = FALSE
]
# Remove perfectly collinear columns so the final matrix does not
# cause singularity errors in NbClust(). The pivoted QR decomposition
# orders the columns so that the first `rank` pivots form a linearly
# independent set; only those columns are kept.
qr_x <- qr(as.matrix(cluster_dummy))
cluster_dummy <- cluster_dummy[
  ,
  qr_x$pivot[seq_len(qr_x$rank)],
  drop = FALSE
]
# Estimate the best number of clusters: evaluate k-means partitions
# with 2 to 4 clusters on Euclidean distances and collect what each
# NbClust index proposes
number_cluster_estimate <- NbClust::NbClust(
  data = cluster_dummy,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 4,
  method = "kmeans"
)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 13 proposed 2 as the best number of clusters
* 6 proposed 3 as the best number of clusters
* 5 proposed 4 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
# Extract the cluster counts proposed by the different indices
# (first row of Best.nc holds the proposed number of clusters)
k_votes <- number_cluster_estimate$Best.nc[1, ]
# The Hubert and D indices are graphical methods and propose no number
# of clusters; NbClust stores 0 for them in Best.nc (see the NbClust
# documentation). Those placeholders, and any missing entries, are not
# votes and are excluded from the tally.
valid_votes <- k_votes[!is.na(k_votes) & k_votes > 0]
stopifnot(
  "No NbClust index proposed a number of clusters" = length(valid_votes) > 0
)
# Choose the number of clusters by majority vote; which.max() returns
# the first maximum, so a tie resolves to the smallest proposed k
k_best <- as.numeric(names(which.max(table(valid_votes))))
# For cluster analysis, the selected mushroom predictors were
# categorical, so they were first converted into binary dummy
# variables. One reference level per predictor was omitted to avoid
# perfect redundancy. Zero-variance columns and perfectly collinear
# columns were then removed so that the clustering input matrix would
# be stable for computation. The number of clusters was estimated
# using NbClust() with Euclidean distance and k-means-based criteria
# over cluster sizes 2 through 4. Based on the majority rule across
# the indices, the best number of clusters was 2.
# Partition the dummy-coded predictors around k_best medoids (PAM);
# the seed is fixed immediately beforehand, as in the rest of the
# analysis
set.seed(123)
pam_mushroom <- cluster::pam(x = cluster_dummy, k = k_best)
# Put the DV (poisonous) back next to the reduced predictors and
# attach each mushroom's PAM cluster label as a factor, so that the
# clusters can be interpreted against edibility
cluster_results <- dplyr::mutate(
  dplyr::select(
    mushrooms,
    poisonous,
    odor,
    bruises,
    gill_size,
    gill_color,
    stalk_surface_above_ring,
    ring_type,
    spore_print_color
  ),
  cluster = factor(pam_mushroom$clustering)
)
# Cluster sizes: number of mushrooms per cluster and each cluster's
# share of all mushrooms (rounded to 3 decimals)
cluster_size_table <- dplyr::mutate(
  dplyr::count(cluster_results, cluster, name = "count"),
  prop = round(count / sum(count), 3)
)
# Edibility composition: count of Poisonous & Edible mushrooms within
# each cluster and their within-cluster proportion (rounded to 3
# decimals); the grouping is removed again afterwards
cluster_class_summary <- cluster_results %>%
  dplyr::count(cluster, poisonous, name = "count") %>%
  dplyr::group_by(cluster) %>%
  dplyr::mutate(prop = round(count / sum(count), 3)) %>%
  dplyr::ungroup()
# Create a summary table of the top original category in each cluster:
# for every predictor (the target, poisonous, is excluded) take the
# most frequent level within the cluster and store it as "<column>_top"
cluster_mode_summary <- cluster_results %>%
  dplyr::group_by(cluster) %>%
  dplyr::summarise(
    dplyr::across(
      .cols = -poisonous,
      .fns = ~ names(sort(table(.x), decreasing = TRUE))[1],
      .names = "{.col}_top"
    )
  )
# Preps and outputs knitr table. The caption describes predictors
# only, because the target variable is not part of the summary above.
cluster_mode_summary %>%
  knitr::kable(
    caption = "A summary table containing the most frequent category for each cluster
from each predictor for the reduced model."
  )
| cluster | odor_top | bruises_top | gill_size_top | gill_color_top | stalk_surface_above_ring_top | ring_type_top | spore_print_color_top |
|---|---|---|---|---|---|---|---|
| 1 | None | Bruises | Broad | White | Smooth | Pendant | Brown |
| 2 | Foul | No | Narrow | Buff | Silky | Evanescent | White |
# Render the per-mushroom cluster assignments as an interactive table
DT::datatable(
  data = cluster_results,
  caption = "An interactive table containing all values from the reduced set of
predictors alongside the predicted cluster."
)
Visualizations
## Interactive cluster visualization with plotly
# Left plot: horizontal bars with the number of mushrooms per cluster
# (colored by cluster, no legend entry)
cluster_size_plotly <- plotly::plot_ly(
  data = cluster_size_table,
  type = "bar",
  orientation = "h",
  x = ~count,
  y = ~cluster,
  color = ~cluster,
  showlegend = FALSE,
  hovertemplate = "Cluster: %{y}<br>Frequency: %{x}<extra></extra>"
)
# Right plot: within-cluster edibility shares, drawn as one bar trace
# per edibility class so that they can be stacked in the layout
cluster_class_plotly <- plotly::plot_ly() %>%
  plotly::add_trace(
    data = dplyr::filter(cluster_class_summary, poisonous == "Edible"),
    type = "bar",
    orientation = "h",
    x = ~prop,
    y = ~cluster,
    name = "Edible",
    marker = list(color = "#F1E11D"),
    hovertemplate = "Cluster: %{y}<br>Edibility: Edible<br>Relative Frequency: %{x:.3f}<extra></extra>"
  ) %>%
  plotly::add_trace(
    data = dplyr::filter(cluster_class_summary, poisonous == "Poisonous or Unknown"),
    type = "bar",
    orientation = "h",
    x = ~prop,
    y = ~cluster,
    name = "Poisonous or Unknown",
    marker = list(color = "#4B0055"),
    hovertemplate = "Cluster: %{y}<br>Edibility: Poisonous or Unknown<br>Relative Frequency: %{x:.3f}<extra></extra>"
  )
# Place the size chart and the composition chart side by side on a
# shared cluster axis, then set stacking, titles, axis labels, and a
# centered horizontal legend above the panels
plotly::layout(
  p = plotly::subplot(
    cluster_size_plotly, cluster_class_plotly,
    nrows = 1, shareY = TRUE, widths = c(0.45, 0.55), margin = 0.05
  ),
  barmode = "stack",
  title = list(
    text = "Cluster Sizes and Edibility Composition by Cluster<br><sup>Reduced Model</sup>",
    x = 0.5
  ),
  xaxis = list(title = "Frequency"),
  xaxis2 = list(title = "Relative Frequency", range = c(0, 1)),
  yaxis = list(title = "Cluster"),
  yaxis2 = list(title = ""),
  legend = list(
    title = list(text = "Edibility"),
    orientation = "h",
    x = 0.5, xanchor = "center",
    y = 1.08
  ),
  margin = list(t = 90)
)
An interactive combined figure with a horizontal bar chart displaying the frequencies for each cluster on the lefthand side and a horizontal 100% stacked bar chart displaying the distribution of Edibility within each cluster on the righthand side.
Relevant Analysis
For the reduced model, the selected predictors were converted into binary dummy variables, cleaned for zero-variance and collinearity issues, and clustered using PAM with \(k = 2\). The resulting solution produced:
Cluster 1: 5,000 mushrooms (61.5%)
- 4,184 Edible (83.7%)
- 816 Poisonous or Unknown (16.3%)
Cluster 2: 3,124 mushrooms (38.5%)
- 3,100 Poisonous or Unknown (99.2%)
- 24 Edible (0.8%)
The reduced-model cluster profiles were also highly interpretable: Cluster 1 was characterized by None odor, Bruises, Broad gill size, White gill color, Smooth stalk surface above ring, Pendant ring type, and Brown spore print color, while Cluster 2 was characterized by Foul odor, No bruises, Narrow gill size, Buff gill color, Silky stalk surface above ring, Evanescent ring type, and White spore print color.
Cluster Analysis for Full Model
Source Code and Output
## Cluster Analysis for Full Model
# Start from all columns, then drop the DV (poisonous) and veil_type
# before clustering.
# NOTE(review): veil_type is presumably dropped because it has a single
# level (both clusters report "Partial" in the summary table) - confirm.
cluster_input_full <- mushrooms %>%
  dplyr::mutate(
    # Turn NA in stalk_root into an explicit "Missing" category so that
    # all observations can be retained during clustering
    stalk_root = factor(
      dplyr::coalesce(as.character(stalk_root), "Missing")
    )
  ) %>%
  dplyr::select(-poisonous, -veil_type) %>%
  droplevels() # Dropping empty levels
# One-hot encode the categorical predictors: model.matrix() converts
# each factor level into a 0/1 dummy column and prepends an intercept
# column, which `[, -1]` removes again
cluster_dummy_full <- as.data.frame(
  model.matrix(~ ., data = cluster_input_full)[, -1, drop = FALSE]
)
# Remove any zero-variance columns because they add no clustering
# information. vapply() (rather than sapply()) guarantees a plain
# numeric vector of variances regardless of the input, so the logical
# column mask always has one entry per column.
cluster_dummy_full <- cluster_dummy_full[
  ,
  vapply(cluster_dummy_full, var, numeric(1)) > 0,
  drop = FALSE
]
# Remove perfectly collinear columns so the final matrix does not
# cause singularity errors in NbClust(). The pivoted QR decomposition
# orders the columns so that the first `rank` pivots form a linearly
# independent set; only those columns are kept.
qr_x_full <- qr(as.matrix(cluster_dummy_full))
cluster_dummy_full <- cluster_dummy_full[
  ,
  qr_x_full$pivot[seq_len(qr_x_full$rank)],
  drop = FALSE
]
# Methodology note: the full set of mushroom predictors is categorical,
# so it was converted into binary dummy variables, omitting one
# reference level per predictor to avoid perfect redundancy.
# Zero-variance and perfectly collinear columns were removed to keep
# the clustering input matrix stable for computation. The reduced
# model supported a 2-cluster solution, and the aim here is a
# like-for-like comparison of the reduced and full clustering results,
# so the number of clusters for the full model is fixed at 2.
k_best_full <- 2
# Partition the full dummy-coded predictor set around k_best_full
# medoids (PAM), fixing the seed immediately beforehand
set.seed(123)
pam_mushroom_full <- cluster::pam(x = cluster_dummy_full, k = k_best_full)
# Rebuild the full data set for interpretation: keep the DV, recode NA
# in stalk_root to the explicit "Missing" category used for clustering,
# and attach each mushroom's PAM cluster label as a factor
cluster_results_full <- dplyr::mutate(
  mushrooms,
  stalk_root = factor(
    dplyr::coalesce(as.character(stalk_root), "Missing")
  ),
  cluster = factor(pam_mushroom_full$clustering)
)
# Cluster sizes: number of mushrooms per cluster and each cluster's
# share of all mushrooms (rounded to 3 decimals)
cluster_size_table_full <- dplyr::mutate(
  dplyr::count(cluster_results_full, cluster, name = "count"),
  prop = round(count / sum(count), 3)
)
# Edibility composition: count of Poisonous & Edible mushrooms within
# each cluster and their within-cluster proportion (rounded to 3
# decimals); the grouping is removed again afterwards
cluster_class_summary_full <- cluster_results_full %>%
  dplyr::count(cluster, poisonous, name = "count") %>%
  dplyr::group_by(cluster) %>%
  dplyr::mutate(prop = round(count / sum(count), 3)) %>%
  dplyr::ungroup()
# Create a summary table of the top original category in each cluster:
# for every predictor (the target, poisonous, is excluded) take the
# most frequent level within the cluster and store it as "<column>_top"
cluster_mode_summary_full <- cluster_results_full %>%
  dplyr::group_by(cluster) %>%
  dplyr::summarise(
    dplyr::across(
      .cols = -poisonous,
      .fns = ~ names(sort(table(.x), decreasing = TRUE))[1],
      .names = "{.col}_top"
    )
  )
# Preps and outputs knitr table. The caption describes predictors
# only, because the target variable is not part of the summary above.
cluster_mode_summary_full %>%
  knitr::kable(
    caption = "A summary table containing the most frequent category for each cluster
from all predictors for the full model."
  )
| cluster | cap_shape_top | cap_surface_top | cap_color_top | bruises_top | odor_top | gill_attachment_top | gill_spacing_top | gill_size_top | gill_color_top | stalk_shape_top | stalk_root_top | stalk_surface_above_ring_top | stalk_surface_below_ring_top | stalk_color_above_ring_top | stalk_color_below_ring_top | veil_type_top | veil_color_top | ring_number_top | ring_type_top | spore_print_color_top | population_top | habitat_top |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Convex | Smooth | Brown | Bruises | None | Free | Close | Broad | White | Tapering | Bulbous | Smooth | Smooth | White | White | Partial | White | One | Pendant | Brown | Several | Woods |
| 2 | Flat | Scaly | Brown | No | Foul | Free | Close | Narrow | Buff | Tapering | Missing | Silky | Silky | Pink | Pink | Partial | White | One | Evanescent | White | Several | Woods |
# Render the per-mushroom cluster assignments for the full model as an
# interactive table
DT::datatable(
  data = cluster_results_full,
  caption = "An interactive table containing all values from the
full set of predictors and the target alongside the predicted cluster."
)
Visualizations
## Interactive cluster visualization for full model
# Left plot: horizontal bars with the number of mushrooms per cluster
# (colored by cluster, no legend entry)
cluster_size_plotly_full <- plotly::plot_ly(
  data = cluster_size_table_full,
  type = "bar",
  orientation = "h",
  x = ~count,
  y = ~cluster,
  color = ~cluster,
  showlegend = FALSE,
  hovertemplate = "Cluster: %{y}<br>Frequency: %{x}<extra></extra>"
)
# Right plot: within-cluster edibility shares, drawn as one bar trace
# per edibility class so that they can be stacked in the layout
cluster_class_plotly_full <- plotly::plot_ly() %>%
  plotly::add_trace(
    data = dplyr::filter(cluster_class_summary_full, poisonous == "Edible"),
    type = "bar",
    orientation = "h",
    x = ~prop,
    y = ~cluster,
    name = "Edible",
    marker = list(color = "#F1E11D"),
    hovertemplate = "Cluster: %{y}<br>Edibility: Edible<br>Relative Frequency: %{x:.3f}<extra></extra>"
  ) %>%
  plotly::add_trace(
    data = dplyr::filter(cluster_class_summary_full, poisonous == "Poisonous or Unknown"),
    type = "bar",
    orientation = "h",
    x = ~prop,
    y = ~cluster,
    name = "Poisonous or Unknown",
    marker = list(color = "#4B0055"),
    hovertemplate = "Cluster: %{y}<br>Edibility: Poisonous or Unknown<br>Relative Frequency: %{x:.3f}<extra></extra>"
  )
# Place the size chart and the composition chart side by side on a
# shared cluster axis, then set stacking, titles, axis labels, and a
# centered horizontal legend above the panels
plotly::layout(
  p = plotly::subplot(
    cluster_size_plotly_full, cluster_class_plotly_full,
    nrows = 1, shareY = TRUE, widths = c(0.45, 0.55), margin = 0.05
  ),
  barmode = "stack",
  title = list(
    text = "Cluster Sizes and Edibility Composition by Cluster<br><sup>Full Model</sup>",
    x = 0.5
  ),
  xaxis = list(title = "Frequency"),
  xaxis2 = list(title = "Relative Frequency", range = c(0, 1)),
  yaxis = list(title = "Cluster"),
  yaxis2 = list(title = ""),
  legend = list(
    title = list(text = "Edibility"),
    orientation = "h",
    x = 0.5, xanchor = "center",
    y = 1.08
  ),
  margin = list(t = 90)
)
An interactive combined figure with a horizontal bar chart displaying the frequencies for each cluster on the lefthand side and a horizontal 100% stacked bar chart displaying the distribution of Edibility within each cluster on the righthand side.
Relevant Analysis
For the full model, clustering was repeated using the complete predictor set, again with \(k = 2\) for consistency. This solution produced:
Cluster 1: 5,175 mushrooms (63.7%)
- 4,208 Edible (81.3%)
- 967 Poisonous or Unknown (18.7%)
Cluster 2: 2,949 mushrooms (36.3%)
- 2,949 Poisonous or Unknown (100.0%)
- 0 Edible (0.0%)
The full-model cluster profiles were also sensible. Cluster 1 was dominated by traits such as Convex cap shape, Smooth cap surface, Bruises, None odor, Broad gill size, White gill color, Smooth stalk surfaces, Pendant ring type, Brown spore print color, Several population, and Woods habitat, while Cluster 2 was dominated by Flat cap shape, Scaly cap surface, No bruises, Foul odor, Narrow gill size, Buff gill color, Missing stalk root, Silky stalk surfaces, Pink stalk color, Evanescent ring type, White spore print color, Several population, and Woods habitat.
Conclusion
Exploratory Data Analysis
The exploratory analysis showed that the mushroom dataset is fairly balanced overall, with 4,208 Edible mushrooms (51.8%) and 3,916 Poisonous or Unknown mushrooms (48.2%), but that the predictors themselves are not equally informative. A relatively small group of variables showed the clearest and most consistent separation by edibility. Based on the EDA screening process, the final reduced predictor set was:
- Odor
- Bruises
- Gill Size
- Gill Color
- Stalk Surface Above Ring
- Ring Type
- Spore Print Color
Across the visual analysis, these predictors repeatedly showed the strongest differences in relative edibility by level, while many others either had weak separation, heavy sparsity, substantial missingness, or redundant patterns that made them less useful for the main analysis.
Reduced vs. Full Model
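The full model performed better in terms of the purity of its second cluster, which contained only Poisonous or Unknown mushrooms (100%), whereas the reduced model’s second cluster still contained 24 Edible mushrooms (0.8%). In terms of overall agreement with the target classes, however, the reduced model was slightly closer: 840 mushrooms (816 + 24) fell outside their cluster’s majority class under the reduced model, compared with 967 under the full model. The reduced model therefore came very close to the full model while using only a much smaller and more interpretable subset of predictors.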
This result suggests that the EDA-based variable-selection procedure worked well: even after removing many predictors, the reduced model still recovered almost the same edible-versus-poisonous structure as the full model. In other words, most of the useful clustering signal in the full dataset appears to be captured by the seven selected variables.
Final Takeaway
Overall, this project shows that mushroom edibility in this dataset can be summarized effectively with a small, carefully selected set of visual traits. The full model gave the cleanest separation, but the reduced model remained highly competitive while being simpler, more interpretable, and more closely tied to the exploratory findings. As a result, the reduced model provides a strong practical summary of the main edible-versus-poisonous structure in the data, while the full model serves as a useful benchmark showing that the selected predictors retained nearly all of the important information.
Campus Map
## Add map embed for an OSM map centered on the
## KSU Marietta campus
# Build the widget inside-out: empty leaflet map -> default tile layer
# -> a single marker with a popup at the campus coordinates
leaflet::addMarkers(
  map = leaflet::addTiles(leaflet::leaflet()),
  lng = -84.52022,
  lat = 33.93799,
  popup = "KSU Marietta",
  layerId = ""
)
Acknowledgements
Team Members
- Rohan C Bhuin (rbhuin@students.kennesaw.edu)
- Rohit Gibson (rgibso50@students.kennesaw.edu)
Acknowledgement of Exclusivity
The findings presented in this project are exclusive to this course: they were not presented in any other course in this or previous semesters, and they will not be presented in any other course during this semester.