###This breast cancer database was obtained from Dr. William H. Wolberg at the University of Wisconsin Hospitals, Madison.
###Attributes 1 through 10 have been used to represent instances. Each instance has one of 2 possible classes: benign or malignant.
###Content
###Attribute Domain: ### Sample code number id number, Clump Thickness 1 - 10, Uniformity of Cell Size 1 - 10, Uniformity of Cell Shape 1 - 10, Marginal Adhesion 1 - 10, Single Epithelial Cell Size 1 - 10, Bare Nuclei 1 - 10, Bland Chromatin 1 - 10, Normal Nucleoli 1 - 10, Mitoses 1 - 10, Class (2 for benign, 4 for malignant)
library(tidyverse)
library(ggstatsplot)
library(plotly)
library(mlbench)
# Load the BreastCancer data set from mlbench and preview the first rows.
data("BreastCancer", package = "mlbench")
BreastCancer %>% head()
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 benign
## 2 10 3 2 1 benign
## 3 2 3 1 1 benign
## 4 4 3 7 1 benign
## 5 1 3 1 1 benign
## 6 10 9 7 1 malignant
#DATA UNDERSTANDING
# Row and column counts, then the type of every column.
c(nrow(BreastCancer), ncol(BreastCancer))
## [1] 699 11
BreastCancer %>% str()
## 'data.frame': 699 obs. of 11 variables:
## $ Id : chr "1000025" "1002945" "1015425" "1016277" ...
## $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Column names, then per-column frequency summaries.
names(BreastCancer)
## [1] "Id" "Cl.thickness" "Cell.size" "Cell.shape"
## [5] "Marg.adhesion" "Epith.c.size" "Bare.nuclei" "Bl.cromatin"
## [9] "Normal.nucleoli" "Mitoses" "Class"
BreastCancer %>% summary()
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion
## Length:699 1 :145 1 :384 1 :353 1 :407
## Class :character 5 :130 10 : 67 2 : 59 2 : 58
## Mode :character 3 :108 3 : 52 10 : 58 3 : 58
## 4 : 80 2 : 45 3 : 56 10 : 55
## 10 : 69 4 : 40 4 : 44 4 : 33
## 2 : 50 5 : 30 5 : 34 8 : 25
## (Other):117 (Other): 81 (Other): 95 (Other): 63
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## 2 :386 1 :402 2 :166 1 :443 1 :579
## 3 : 72 10 :132 3 :165 10 : 61 2 : 35
## 4 : 48 2 : 30 1 :152 3 : 44 3 : 33
## 1 : 47 5 : 30 7 : 73 2 : 36 10 : 14
## 6 : 41 3 : 28 4 : 40 8 : 24 4 : 12
## 5 : 39 (Other): 61 5 : 34 6 : 22 7 : 9
## (Other): 66 NA's : 16 (Other): 69 (Other): 69 (Other): 17
## Class
## benign :458
## malignant:241
##
##
##
##
##
# Total number of missing cells across the whole data frame.
BreastCancer %>%
  is.na() %>%
  sum()
## [1] 16
#DATA CLEANING
# Build the analysis tables:
#   bc      - BreastCancer without the Id column, features as numeric scores
#   cleanbc - bc restricted to complete cases (rows with any NA removed)
#
# Missing values in this data set are already encoded as NA (no "?" level
# exists in any factor), so no "?" -> NA replacement step is needed.

# Drop the sample identifier by name rather than by position.
bc <- BreastCancer[, setdiff(names(BreastCancer), "Id")]

# Convert every feature column (everything except Class) from factor to its
# numeric score. Going through as.character() recovers the level labels;
# as.numeric() on a factor directly would return the internal level codes.
feature_cols <- setdiff(names(bc), "Class")
bc[feature_cols] <- lapply(
  bc[feature_cols],
  function(x) as.numeric(as.character(x))
)

# Fix the level order of the outcome so "benign" is the reference level.
bc$Class <- factor(bc$Class, levels = c("benign", "malignant"))

# Keep only rows with no missing values and confirm none remain.
cleanbc <- na.omit(bc)
sum(is.na(cleanbc))
## [1] 0
#UNIVARIATE ANALYSIS
# Build a dodged, 10-bin histogram of one score column, split by tumor class.
#   data   : data frame containing `column` and a `Class` column
#   column : name of the score column, given as a string
#   title  : plot title
# Returns the ggplot object.
class_histogram <- function(data, column, title) {
  ggplot(data, aes(x = .data[[column]], fill = Class)) +
    geom_histogram(bins = 10, position = "dodge") +
    # Set the x label explicitly so the axis shows the column name.
    labs(title = title, x = column)
}

# Convert a ggplot into a plotly widget with the mode bar hidden.
# The mode bar is controlled through config(displayModeBar = FALSE);
# layout(modebar = list(visible = FALSE)) is not a layout attribute and
# left the toolbar visible.
# NOTE(review): no `text` aesthetic is mapped in these plots, so
# tooltip = "text" presumably yields empty hover labels - confirm intended.
as_plain_widget <- function(p) {
  ggplotly(p, tooltip = "text") %>%
    plotly::config(displayModeBar = FALSE)
}

p_ct <- class_histogram(
  cleanbc, "Cl.thickness", "Distribution of Clump Thickness by Class"
)
fig_bc <- as_plain_widget(p_ct)
# Show plot
fig_bc

p_cs <- class_histogram(
  cleanbc, "Cell.size", "Distribution of Cell Size by Class"
)
fig_bc <- as_plain_widget(p_cs)
# Show plot
fig_bc

p_csh <- class_histogram(
  cleanbc, "Cell.shape", "Distribution of Cell Shape by Class"
)
fig_bc <- as_plain_widget(p_csh)
# Show plot
fig_bc

p_ma <- class_histogram(
  cleanbc, "Marg.adhesion", "Distribution of Marginal Adhesion by Class"
)
fig_bc <- as_plain_widget(p_ma)
# Show plot
fig_bc

p_ecs <- class_histogram(
  cleanbc, "Epith.c.size",
  "Distribution of Single Epithelial Cell Size by Class"
)
fig_bc <- as_plain_widget(p_ecs)
# Show plot
fig_bc

p_bn <- class_histogram(
  cleanbc, "Bare.nuclei", "Distribution of Bare Nuclei by Class"
)
fig_bc <- as_plain_widget(p_bn)
# Show plot
fig_bc

p_bc <- class_histogram(
  cleanbc, "Bl.cromatin", "Distribution of Bland Chromatin by Class"
)
fig_bc <- as_plain_widget(p_bc)
# Show plot
fig_bc

p_nn <- class_histogram(
  cleanbc, "Normal.nucleoli", "Distribution of Normal Nucleoli by Class"
)
fig_bc <- as_plain_widget(p_nn)
# Show plot
fig_bc

p_mito <- class_histogram(
  cleanbc, "Mitoses", "Distribution of Mitoses by Class"
)
fig_bc <- as_plain_widget(p_mito)
# Show plot
fig_bc
##BIVARIATE ANALYSIS ## 1. Do malignant tumors have significantly higher clump thickness than benign ones? 2. Is there a difference in the number of mitoses between benign and malignant tumors? 3. Is there a correlation between clump thickness and cell size? 4. Is there a relationship between marginal adhesion and bare nuclei? 5. How strongly are cell shape and cell size related?
# Two-sample t-test of clump thickness between the two tumor classes.
cleanbc %>%
  t.test(Cl.thickness ~ Class, data = .)
##
## Welch Two Sample t-test
##
## data: Cl.thickness by Class
## t = -23.927, df = 361.43, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group benign and group malignant is not equal to 0
## 95 percent confidence interval:
## -4.571510 -3.877131
## sample estimates:
## mean in group benign mean in group malignant
## 2.963964 7.188285
# correlation
# Correlation test between the Cell.size and Cell.shape scores, computed on
# the complete-case table `cleanbc`. The columns are passed as `cleanbc$...`
# expressions, which is what appears on the "data:" line of the printed result.
cor.test(cleanbc$Cell.size, cleanbc$Cell.shape)
##
## Pearson's product-moment correlation
##
## data: cleanbc$Cell.size and cleanbc$Cell.shape
## t = 56.283, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8929850 0.9196562
## sample estimates:
## cor
## 0.9072282
# boxplot
# Clump thickness per tumor class, rendered as an interactive plotly widget.
library(ggplot2)
p_clumpthi <- ggplot(
  cleanbc,
  aes(x = Class, y = Cl.thickness, fill = Class)
) +
  geom_boxplot() +
  labs(title = "Clump Thickness by Tumor Type")
# Hide the plotly mode bar via config(); layout(modebar = list(visible = ...))
# is not a layout attribute and left the toolbar visible.
fig_bc <- ggplotly(p_clumpthi, tooltip = "text") %>%
  plotly::config(displayModeBar = FALSE)
# Show plot
fig_bc
# Two-sample t-test of the mitoses score between the two tumor classes.
cleanbc %>%
  t.test(Mitoses ~ Class, data = .)
##
## Welch Two Sample t-test
##
## data: Mitoses by Class
## t = -9.1697, df = 248.17, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group benign and group malignant is not equal to 0
## 95 percent confidence interval:
## -1.867370 -1.207021
## sample estimates:
## mean in group benign mean in group malignant
## 1.065315 2.602510
# Mitoses score per tumor class, rendered as an interactive plotly widget.
p_mitosis <- ggplot(cleanbc, aes(x = Class, y = Mitoses, fill = Class)) +
  geom_boxplot() +
  labs(title = "Mitoses by Class")
# Hide the plotly mode bar via config(); layout(modebar = list(visible = ...))
# is not a layout attribute and left the toolbar visible.
fig_bc <- ggplotly(p_mitosis, tooltip = "text") %>%
  plotly::config(displayModeBar = FALSE)
# Show plot
fig_bc
# Correlation test between the Cl.thickness and Cell.size scores on the
# complete-case table `cleanbc` (question 3 of the bivariate analysis).
cor.test(cleanbc$Cl.thickness, cleanbc$Cell.size)
##
## Pearson's product-moment correlation
##
## data: cleanbc$Cl.thickness and cleanbc$Cell.size
## t = 21.879, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5961976 0.6845089
## sample estimates:
## cor
## 0.6424815
# Scatter plot of clump thickness against cell size with a fitted linear
# trend line, rendered as an interactive plotly widget.
p_ctcs <- ggplot(cleanbc, aes(x = Cl.thickness, y = Cell.size)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Clump Thickness vs Cell Size")
# Hide the plotly mode bar via config(); layout(modebar = list(visible = ...))
# is not a layout attribute and left the toolbar visible.
fig_bc <- ggplotly(p_ctcs, tooltip = "text") %>%
  plotly::config(displayModeBar = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Show plot
fig_bc
# Correlation test between marginal adhesion and bare nuclei (question 4).
# Bare.nuclei is already numeric in `cleanbc`: every feature column is
# converted during data cleaning, so it is not re-coerced here.
cor.test(cleanbc$Marg.adhesion, cleanbc$Bare.nuclei)
##
## Pearson's product-moment correlation
##
## data: cleanbc$Marg.adhesion and cleanbc$Bare.nuclei
## t = 23.594, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6271832 0.7099493
## sample estimates:
## cor
## 0.6706483
# Scatter plot of marginal adhesion against bare nuclei with a fitted linear
# trend line, rendered as an interactive plotly widget.
p_mabn <- ggplot(cleanbc, aes(x = Marg.adhesion, y = Bare.nuclei)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Marginal Adhesion vs Bare Nuclei")
# Hide the plotly mode bar via config(); layout(modebar = list(visible = ...))
# is not a layout attribute and left the toolbar visible.
fig_bc <- ggplotly(p_mabn, tooltip = "text") %>%
  plotly::config(displayModeBar = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Show plot
fig_bc
# Mitoses by tumor class with the test result annotated on the plot.
# Uses `cleanbc` (complete cases), like the t-test on Mitoses earlier in this
# script; `bc` still contains the rows with missing values, which made the
# group means on this plot differ from the t-test output.
# NOTE(review): `messages` looks like a legacy ggstatsplot argument - confirm
# it is still honoured by the installed version.
ggbetweenstats(
  data = cleanbc,
  x = Class,
  y = Mitoses,
  title = "Mitoses: Benign vs Malignant",
  messages = FALSE
)
## Insight: Malignant tumors have significantly more mitoses than benign
## ones (mean = 2.60 vs. 1.07, p < 2.2e-16).
##3. Is there a correlation between clump thickness and cell size?
# Clump thickness vs cell size with the correlation annotated on the plot.
# Uses `cleanbc` (complete cases) so the plot is based on the same rows as
# the cor.test() call for this pair earlier in the script.
# NOTE(review): `messages` looks like a legacy ggstatsplot argument - confirm
# it is still honoured by the installed version.
ggscatterstats(
  data = cleanbc,
  x = Cl.thickness,
  y = Cell.size,
  title = "Correlation: Clump Thickness vs Cell Size",
  messages = FALSE
)
## Registered S3 method overwritten by 'ggside':
## method from
## +.gg ggplot2
## `stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.
## Insight: There is a strong positive correlation between clump
## thickness and cell size (r = 0.64, p < 2.2e-16).
# Marginal adhesion vs bare nuclei with the correlation annotated on the plot.
# Uses `cleanbc` (complete cases) so the plot is based on the same rows as
# the cor.test() call for this pair earlier in the script.
# NOTE(review): `messages` looks like a legacy ggstatsplot argument - confirm
# it is still honoured by the installed version.
ggscatterstats(
  data = cleanbc,
  x = Marg.adhesion,
  y = Bare.nuclei,
  title = "Correlation: Marginal Adhesion vs Bare Nuclei",
  messages = FALSE
)
## `stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.
## Insight: There is a strong positive relationship between marginal
## adhesion and bare nuclei. The Pearson correlation is 0.67, indicating
## that as marginal adhesion increases, bare nuclei tend to increase as
## well.