Interactive EDA: Breast Cancer Wisconsin

📦 Load Packages and Dataset

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'tidyr' was built under R version 4.4.3

## Warning: package 'readr' was built under R version 4.4.3

## Warning: package 'dplyr' was built under R version 4.4.3

## Warning: package 'forcats' was built under R version 4.4.3

## Warning: package 'lubridate' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggstatsplot)

## Warning: package 'ggstatsplot' was built under R version 4.4.3

## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167

library(plotly)

## Warning: package 'plotly' was built under R version 4.4.3

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(mlbench)

## Warning: package 'mlbench' was built under R version 4.4.3

# Load Breast Cancer Wisconsin dataset
data("BreastCancer")
df <- BreastCancer

# Clean the dataset
# The feature columns arrive as factors whose labels are the 1-10 scores.
# as.numeric() on a factor returns the internal level codes, not the labels,
# and the two differ whenever a level is absent (Mitoses has no "9" level, so
# its 10s were being read as 9). Going through as.character() recovers the
# score that the label actually spells out.
df$Bare.nuclei <- as.numeric(as.character(df$Bare.nuclei))

# Bare.nuclei is the only column with missing values; impute with the median.
df$Bare.nuclei[is.na(df$Bare.nuclei)] <- median(df$Bare.nuclei, na.rm = TRUE)

# Drop the sample identifier and turn every remaining feature into a number,
# leaving Class as the benign/malignant factor.
df_clean <- df %>%
  select(-Id) %>%
  mutate(across(-Class, \(col) as.numeric(as.character(col))))

str(df_clean)

## 'data.frame':    699 obs. of  10 variables:
##  $ Cl.thickness   : num  5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : num  1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : num  1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : num  1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : num  2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : num  1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : num  3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: num  1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : num  1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...

# Per-column descriptive statistics (and class counts) for the cleaned data
df_clean %>%
  summary()

##   Cl.thickness      Cell.size        Cell.shape     Marg.adhesion   
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.000  
##  Median : 4.000   Median : 1.000   Median : 1.000   Median : 1.000  
##  Mean   : 4.418   Mean   : 3.134   Mean   : 3.207   Mean   : 2.807  
##  3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##   Epith.c.size     Bare.nuclei      Bl.cromatin     Normal.nucleoli 
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000  
##  Median : 2.000   Median : 1.000   Median : 3.000   Median : 1.000  
##  Mean   : 3.216   Mean   : 3.486   Mean   : 3.438   Mean   : 2.867  
##  3rd Qu.: 4.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##     Mitoses            Class    
##  Min.   :1.000   benign   :458  
##  1st Qu.:1.000   malignant:241  
##  Median :1.000                  
##  Mean   :1.569                  
##  3rd Qu.:1.000                  
##  Max.   :9.000

# Compare clump thickness between the two tumour classes with a
# statistically annotated between-groups plot.
# NOTE: `messages` is not an argument of current ggbetweenstats(); it was
# being absorbed by `...` and ignored, so it is no longer passed.
ggbetweenstats(
  data = df_clean,
  x = Class,
  y = Cl.thickness,
  title = "Clump Thickness by Tumor Type",
  xlab = "Tumor Type",
  ylab = "Clump Thickness"
)

# Compare cell size between the two tumour classes with a statistically
# annotated between-groups plot.
# NOTE: `messages` is not an argument of current ggbetweenstats(); it was
# being absorbed by `...` and ignored, so it is no longer passed.
ggbetweenstats(
  data = df_clean,
  x = Class,
  y = Cell.size,
  title = "Cell Size by Tumor Type",
  xlab = "Tumor Type",
  ylab = "Cell Size"
)

# Relate clump thickness to cell shape uniformity across all samples with a
# statistically annotated scatter plot.
# NOTE: `messages` is not an argument of current ggscatterstats(); it was
# being absorbed by `...` and ignored, so it is no longer passed.
ggscatterstats(
  data = df_clean,
  x = Cl.thickness,
  y = Cell.shape,
  title = "Clump Thickness vs Cell Shape Uniformity",
  xlab = "Clump Thickness",
  ylab = "Cell Shape Uniformity"
)

## Registered S3 method overwritten by 'ggside':
##   method from   
##   +.gg   ggplot2

## `stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.

# Scatter plot of clump thickness against cell size, coloured by tumour class.
# The `text` aesthetic only carries the hover label that ggplotly() picks up
# through `tooltip = "text"` below. paste0() is used because each label
# already ends in a space; paste()'s default separator doubled it.
interactive_plot <- ggplot(df_clean, aes(
  x = Cl.thickness,
  y = Cell.size,
  color = Class,
  text = paste0(
    "Tumor Type: ", Class, "<br>",
    "Clump Thickness: ", Cl.thickness, "<br>",
    "Cell Size: ", Cell.size, "<br>",
    "Bare Nuclei: ", Bare.nuclei
  )
)) +
  geom_point(size = 2, alpha = 0.8) +
  labs(
    title = "Interactive: Clump Thickness vs Cell Size",
    x = "Clump Thickness",
    y = "Cell Size",
    color = "Tumor Type"
  ) +
  theme_minimal()

# Convert to an interactive widget and hide the modebar. Visibility is a
# config() option (`displayModeBar`); plotly's layout.modebar has no `visible`
# attribute, so the former layout(modebar = list(visible = FALSE)) call left
# the modebar showing.
ggplotly(interactive_plot, tooltip = "text") %>%
  config(displayModeBar = FALSE)

Interactive EDA: Breast Cancer Wisconsin

John Andrew Emmanuel Avelino

2025-04-15

📦 Load Packages and Dataset