# 📦 Load Packages and Dataset ----
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggstatsplot)
## Warning: package 'ggstatsplot' was built under R version 4.4.3
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.4.3
# Load Breast Cancer Wisconsin dataset
data("BreastCancer")
df <- BreastCancer

# Convert a factor whose levels are numeric labels into the numbers those
# labels spell out. Calling as.numeric() directly on a factor returns the
# internal level codes, which match the labels only when no level is skipped.
# Mitoses skips "9", so the direct conversion silently recorded every 10 as 9.
factor_to_numeric <- function(x) {
  as.numeric(as.character(x))
}

# Clean the dataset
# Bare.nuclei contains missing values; impute them with the column median.
df$Bare.nuclei <- factor_to_numeric(df$Bare.nuclei)
df$Bare.nuclei[is.na(df$Bare.nuclei)] <- median(df$Bare.nuclei, na.rm = TRUE)

# Drop the sample identifier and turn every score column into a plain numeric;
# Class stays a factor so it can be used as the grouping variable.
df_clean <- df %>%
  select(-Id) %>%
  mutate(across(-Class, factor_to_numeric))
str(df_clean)
## 'data.frame': 699 obs. of 10 variables:
## $ Cl.thickness : num 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : num 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : num 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : num 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : num 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : num 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : num 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: num 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : num 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
summary(df_clean)
## Cl.thickness Cell.size Cell.shape Marg.adhesion
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 4.000 Median : 1.000 Median : 1.000 Median : 1.000
## Mean : 4.418 Mean : 3.134 Mean : 3.207 Mean : 2.807
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.000
## Mean : 3.216 Mean : 3.486 Mean : 3.438 Mean : 2.867
## 3rd Qu.: 4.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## Mitoses Class
## Min. : 1.000 benign :458
## 1st Qu.: 1.000 malignant:241
## Median : 1.000
## Mean : 1.589
## 3rd Qu.: 1.000
## Max. :10.000
# Compare clump thickness between benign and malignant tumours; Class is the
# grouping factor on the x axis and the score is plotted on the y axis.
ggbetweenstats(
  data = df_clean,
  x = Class,
  y = Cl.thickness,
  title = "Clump Thickness by Tumor Type",
  xlab = "Tumor Type",
  ylab = "Clump Thickness"
)

# Compare cell-size uniformity between benign and malignant tumours; Class is
# the grouping factor on the x axis and the score is plotted on the y axis.
ggbetweenstats(
  data = df_clean,
  x = Class,
  y = Cell.size,
  title = "Cell Size by Tumor Type",
  xlab = "Tumor Type",
  ylab = "Cell Size"
)

# Relate clump thickness to cell-shape uniformity across all samples.
# The marginal histograms are drawn with ggside's default of 30 bins, which is
# what triggers the `stat_*sidebin()` messages when this call is printed.
ggscatterstats(
  data = df_clean,
  x = Cl.thickness,
  y = Cell.shape,
  title = "Clump Thickness vs Cell Shape Uniformity",
  xlab = "Clump Thickness",
  ylab = "Cell Shape Uniformity"
)
## Registered S3 method overwritten by 'ggside':
## method from
## +.gg ggplot2
## `stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.

# Scatter plot of clump thickness against cell size, coloured by tumour class.
# The `text` aesthetic is not drawn by ggplot2 itself; it only supplies the
# hover tooltip once the plot is converted with ggplotly().
interactive_plot <- ggplot(df_clean, aes(
  x = Cl.thickness,
  y = Cell.size,
  color = Class,
  # paste0() rather than paste(): the labels already end in a space, so the
  # default separator would double it and pad the <br> line breaks.
  text = paste0(
    "Tumor Type: ", Class, "<br>",
    "Clump Thickness: ", Cl.thickness, "<br>",
    "Cell Size: ", Cell.size, "<br>",
    "Bare Nuclei: ", Bare.nuclei
  )
)) +
  geom_point(size = 2, alpha = 0.8) +
  labs(
    title = "Interactive: Clump Thickness vs Cell Size",
    x = "Clump Thickness",
    y = "Cell Size",
    color = "Tumor Type"
  ) +
  theme_minimal()

# Show only the custom tooltip and hide the plotly mode bar. The mode bar is a
# config() option (displayModeBar); layout(modebar = ...) only styles the bar
# and has no `visible` switch.
ggplotly(interactive_plot, tooltip = "text") %>%
  config(displayModeBar = FALSE)