Introduction

### This breast cancer database was obtained from the University of Wisconsin Hospitals, Madison, from Dr. William H. Wolberg.

### Attributes 2 through 10 have been used to represent instances (attribute 1 is the sample ID). Each instance has one of 2 possible classes: benign or malignant.

###Content

### Attribute Domain: Sample code number (id number), Clump Thickness 1 - 10, Uniformity of Cell Size 1 - 10, Uniformity of Cell Shape 1 - 10, Marginal Adhesion 1 - 10, Single Epithelial Cell Size 1 - 10, Bare Nuclei 1 - 10, Bland Chromatin 1 - 10, Normal Nucleoli 1 - 10, Mitoses 1 - 10, Class (2 for benign, 4 for malignant)

Load the required libraries

library(tidyverse)
library(ggstatsplot)
library(plotly)
library(mlbench)

Load the dataset

# Load the BreastCancer data frame (mlbench is attached above) and preview
# its first rows.
data(BreastCancer)
head(BreastCancer)
##        Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025            5         1          1             1            2
## 2 1002945            5         4          4             5            7
## 3 1015425            3         1          1             1            2
## 4 1016277            6         8          8             1            3
## 5 1017023            4         1          1             3            2
## 6 1017122            8        10         10             8            7
##   Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
## 1           1           3               1       1    benign
## 2          10           3               2       1    benign
## 3           2           3               1       1    benign
## 4           4           3               7       1    benign
## 5           1           3               1       1    benign
## 6          10           9               7       1 malignant

#DATA UNDERSTANDING

# Number of rows (observations) and columns (variables).
dim(BreastCancer)
## [1] 699  11
# Column types: shows which columns are character, factor, or ordered factor.
str(BreastCancer)
## 'data.frame':    699 obs. of  11 variables:
##  $ Id             : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ Cl.thickness   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Column names exactly as spelled in the dataset (e.g. "Bl.cromatin").
colnames(BreastCancer)
##  [1] "Id"              "Cl.thickness"    "Cell.size"       "Cell.shape"     
##  [5] "Marg.adhesion"   "Epith.c.size"    "Bare.nuclei"     "Bl.cromatin"    
##  [9] "Normal.nucleoli" "Mitoses"         "Class"
# Per-column level counts; also reports the number of NA's in each column.
summary(BreastCancer)
##       Id             Cl.thickness   Cell.size     Cell.shape  Marg.adhesion
##  Length:699         1      :145   1      :384   1      :353   1      :407  
##  Class :character   5      :130   10     : 67   2      : 59   2      : 58  
##  Mode  :character   3      :108   3      : 52   10     : 58   3      : 58  
##                     4      : 80   2      : 45   3      : 56   10     : 55  
##                     10     : 69   4      : 40   4      : 44   4      : 33  
##                     2      : 50   5      : 30   5      : 34   8      : 25  
##                     (Other):117   (Other): 81   (Other): 95   (Other): 63  
##   Epith.c.size  Bare.nuclei   Bl.cromatin  Normal.nucleoli    Mitoses   
##  2      :386   1      :402   2      :166   1      :443     1      :579  
##  3      : 72   10     :132   3      :165   10     : 61     2      : 35  
##  4      : 48   2      : 30   1      :152   3      : 44     3      : 33  
##  1      : 47   5      : 30   7      : 73   2      : 36     10     : 14  
##  6      : 41   3      : 28   4      : 40   8      : 24     4      : 12  
##  5      : 39   (Other): 61   5      : 34   6      : 22     7      :  9  
##  (Other): 66   NA's   : 16   (Other): 69   (Other): 69     (Other): 17  
##        Class    
##  benign   :458  
##  malignant:241  
##                 
##                 
##                 
##                 
## 
# Total number of missing cells across the whole data frame.
sum(is.na(BreastCancer))
## [1] 16

#DATA CLEANING

# Drop the sample identifier (`Id`); it is a label, not a feature.
# Selecting by name keeps this correct even if the column order changes.
bc <- BreastCancer[, names(BreastCancer) != "Id"]

# Replace "?" with NA
# (a guard for raw-file encodings; the summary above already reports NA's)
bc[bc == "?"] <- NA

# Convert every feature column from factor to its numeric score, choosing the
# columns by name rather than by position. Class is already a two-level factor
# (benign, malignant) and is left untouched.
feature_cols <- setdiff(names(bc), "Class")
bc[feature_cols] <- lapply(
  bc[feature_cols],
  function(x) as.numeric(as.character(x))
)

# Keep only complete rows and confirm that no missing values remain.
cleanbc <- na.omit(bc)
sum(is.na(cleanbc))
## [1] 0

#UNIVARIATE ANALYSIS

# Build a dodged histogram of one feature, split by tumor class.
#
# data    : data frame holding the feature column and a `Class` column.
# feature : name of the column to plot, given as a string.
# label   : human-readable feature name used in the plot title.
# Returns a ggplot object.
class_histogram <- function(data, feature, label) {
  ggplot(data, aes(x = .data[[feature]], fill = Class)) +
    geom_histogram(bins = 10, position = "dodge") +
    labs(
      title = paste("Distribution of", label, "by Class"),
      x = feature
    )
}

# Convert a ggplot into an interactive plotly widget with the mode bar hidden.
#
# The default tooltip is used because these plots map no `text` aesthetic, so
# tooltip = "text" would have nothing to display. The mode bar is hidden via
# config(displayModeBar = FALSE); it is a config option, not a layout attribute.
as_widget <- function(p) {
  ggplotly(p) %>%
    config(displayModeBar = FALSE)
}

p_ct <- class_histogram(cleanbc, "Cl.thickness", "Clump Thickness")
fig_bc <- as_widget(p_ct)
# Show plot
fig_bc

p_cs <- class_histogram(cleanbc, "Cell.size", "Cell Size")
fig_bc <- as_widget(p_cs)
# Show plot
fig_bc

p_csh <- class_histogram(cleanbc, "Cell.shape", "Cell Shape")
fig_bc <- as_widget(p_csh)
# Show plot
fig_bc

p_ma <- class_histogram(cleanbc, "Marg.adhesion", "Marginal Adhesion")
fig_bc <- as_widget(p_ma)
# Show plot
fig_bc

p_ecs <- class_histogram(cleanbc, "Epith.c.size", "Single Epithelial Cell Size")
fig_bc <- as_widget(p_ecs)
# Show plot
fig_bc

p_bn <- class_histogram(cleanbc, "Bare.nuclei", "Bare Nuclei")
fig_bc <- as_widget(p_bn)
# Show plot
fig_bc

p_bc <- class_histogram(cleanbc, "Bl.cromatin", "Bland Chromatin")
fig_bc <- as_widget(p_bc)
# Show plot
fig_bc

p_nn <- class_histogram(cleanbc, "Normal.nucleoli", "Normal Nucleoli")
fig_bc <- as_widget(p_nn)
# Show plot
fig_bc

p_mito <- class_histogram(cleanbc, "Mitoses", "Mitoses")
fig_bc <- as_widget(p_mito)
# Show plot
fig_bc

## BIVARIATE ANALYSIS

1. Do malignant tumors have significantly higher clump thickness than benign ones?
2. Is there a difference in the number of mitoses between benign and malignant tumors?
3. Is there a correlation between clump thickness and cell size?
4. Is there a relationship between marginal adhesion and bare nuclei?
5. How strongly are cell shape and cell size related?

# Question 1: two-sample t-test comparing mean clump thickness between the
# benign and malignant groups (t.test() defaults to the Welch variant).
t.test(Cl.thickness ~ Class, data = cleanbc)
## 
##  Welch Two Sample t-test
## 
## data:  Cl.thickness by Class
## t = -23.927, df = 361.43, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group benign and group malignant is not equal to 0
## 95 percent confidence interval:
##  -4.571510 -3.877131
## sample estimates:
##    mean in group benign mean in group malignant 
##                2.963964                7.188285
# Question 5: Pearson correlation between cell size and cell shape.
cor.test(cleanbc$Cell.size, cleanbc$Cell.shape)
## 
##  Pearson's product-moment correlation
## 
## data:  cleanbc$Cell.size and cleanbc$Cell.shape
## t = 56.283, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8929850 0.9196562
## sample estimates:
##       cor 
## 0.9072282
# boxplot
library(ggplot2)
# Question 1 (visual): box plot of clump thickness for each tumor class.
p_clumpthi <- ggplot(cleanbc, aes(x = Class, y = Cl.thickness, fill = Class)) +
  geom_boxplot() +
  labs(title = "Clump Thickness by Tumor Type")

# Use the default tooltip (no `text` aesthetic is mapped, so tooltip = "text"
# would have nothing to show) and hide the mode bar through config(), where
# plotly exposes the displayModeBar option.
fig_bc <- ggplotly(p_clumpthi) %>%
  config(displayModeBar = FALSE)

# Show plot
fig_bc

2. Is there a difference in the number of mitoses between benign and malignant tumors?

# Two-sample t-test comparing mean mitoses between the benign and malignant
# groups (t.test() defaults to the Welch variant).
t.test(Mitoses ~ Class, data = cleanbc)
## 
##  Welch Two Sample t-test
## 
## data:  Mitoses by Class
## t = -9.1697, df = 248.17, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group benign and group malignant is not equal to 0
## 95 percent confidence interval:
##  -1.867370 -1.207021
## sample estimates:
##    mean in group benign mean in group malignant 
##                1.065315                2.602510
# Box plot of mitoses for each tumor class.
p_mitosis <- ggplot(cleanbc, aes(x = Class, y = Mitoses, fill = Class)) +
  geom_boxplot() +
  labs(title = "Mitoses by Class")

# Use the default tooltip (no `text` aesthetic is mapped, so tooltip = "text"
# would have nothing to show) and hide the mode bar through config(), where
# plotly exposes the displayModeBar option.
fig_bc <- ggplotly(p_mitosis) %>%
  config(displayModeBar = FALSE)

# Show plot
fig_bc

3. Is there a correlation between clump thickness and cell size?

# Pearson correlation between clump thickness and cell size.
cor.test(cleanbc$Cl.thickness, cleanbc$Cell.size)
## 
##  Pearson's product-moment correlation
## 
## data:  cleanbc$Cl.thickness and cleanbc$Cell.size
## t = 21.879, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5961976 0.6845089
## sample estimates:
##       cor 
## 0.6424815
# Scatter plot of clump thickness against cell size with a fitted linear trend.
p_ctcs <- ggplot(cleanbc, aes(x = Cl.thickness, y = Cell.size)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Clump Thickness vs Cell Size")

# Use the default tooltip (no `text` aesthetic is mapped, so tooltip = "text"
# would have nothing to show) and hide the mode bar through config(), where
# plotly exposes the displayModeBar option.
fig_bc <- ggplotly(p_ctcs) %>%
  config(displayModeBar = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Show plot
fig_bc

4. Is there a relationship between marginal adhesion and bare nuclei?

# Bare.nuclei was already converted to numeric in the data-cleaning step, so
# this coercion is a defensive no-op kept as a safeguard.
cleanbc$Bare.nuclei <- as.numeric(as.character(cleanbc$Bare.nuclei)) # Ensure numeric
# Pearson correlation between marginal adhesion and bare nuclei.
cor.test(cleanbc$Marg.adhesion, cleanbc$Bare.nuclei)
## 
##  Pearson's product-moment correlation
## 
## data:  cleanbc$Marg.adhesion and cleanbc$Bare.nuclei
## t = 23.594, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6271832 0.7099493
## sample estimates:
##       cor 
## 0.6706483
# Scatter plot of marginal adhesion against bare nuclei with a fitted linear
# trend.
p_mabn <- ggplot(cleanbc, aes(x = Marg.adhesion, y = Bare.nuclei)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Marginal Adhesion vs Bare Nuclei")

# Use the default tooltip (no `text` aesthetic is mapped, so tooltip = "text"
# would have nothing to show) and hide the mode bar through config(), where
# plotly exposes the displayModeBar option.
fig_bc <- ggplotly(p_mabn) %>%
  config(displayModeBar = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Show plot
fig_bc

2. Is there a difference in the number of mitoses between benign and malignant tumors?

# Between-group comparison of mitoses by class, drawn with ggstatsplot.
# Note: this uses `bc` (all rows), whereas the t-test above uses `cleanbc`
# (complete rows only), so the reported means can differ slightly.
# `messages` is not an argument of current ggstatsplot functions (it was
# silently absorbed by `...`), so it has been dropped.
ggbetweenstats(
  data = bc,
  x = Class,
  y = Mitoses,
  title = "Mitoses: Benign vs Malignant"
)

##Insight: Malignant tumors have significantly more mitoses than benign ones (mean = 2.59 vs. 1.06, p = 1.84e-17).

##3. Is there a correlation between clump thickness and cell size?

# Correlation plot of clump thickness against cell size, drawn with
# ggstatsplot. Note: this uses `bc` (all rows) rather than `cleanbc`.
# `messages` is not an argument of current ggstatsplot functions (it was
# silently absorbed by `...`), so it has been dropped.
ggscatterstats(
  data = bc,
  x = Cl.thickness,
  y = Cell.size,
  title = "Correlation: Clump Thickness vs Cell Size"
)
## Registered S3 method overwritten by 'ggside':
##   method from   
##   +.gg   ggplot2
## `stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.

##Insight: There is a strong positive correlation between clump thickness and cell size (r = 0.64, p = 1.94e-83).

4. Is there a relationship between marginal adhesion and bare nuclei?

# Correlation plot of marginal adhesion against bare nuclei, drawn with
# ggstatsplot. Note: this uses `bc`, which still contains the rows with a
# missing Bare.nuclei value that na.omit() removed from `cleanbc`.
# `messages` is not an argument of current ggstatsplot functions (it was
# silently absorbed by `...`), so it has been dropped.
ggscatterstats(
  data = bc,
  x = Marg.adhesion,
  y = Bare.nuclei,
  title = "Correlation: Marginal Adhesion vs Bare Nuclei"
)
## `stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.

##Insight: There is a strong positive relationship between marginal adhesion and bare nuclei. The Pearson correlation is 0.67, indicating that as marginal adhesion increases, bare nuclei tend to increase as well.