# Packages this analysis depends on. Only the ones that are not already
# available are installed, so re-running the script does not download and
# reinstall every package each time.
required_pkgs <- c("ggstatsplot", "plotly", "tidyverse", "mlbench")
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, repos = "https://cloud.r-project.org")
}
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggstatsplot)  
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library(plotly)  
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(mlbench)
# Load the BreastCancer data set (from the mlbench package attached above)
# into the workspace. One call is enough; loading it twice has no effect.
data("BreastCancer")

# The spreadsheet-style viewer only makes sense with someone at the console,
# so it is skipped when the script runs non-interactively.
if (interactive()) {
  view(BreastCancer)
}

# Print the first rows as a quick check of the columns and their values.
head(BreastCancer)
##        Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025            5         1          1             1            2
## 2 1002945            5         4          4             5            7
## 3 1015425            3         1          1             1            2
## 4 1016277            6         8          8             1            3
## 5 1017023            4         1          1             3            2
## 6 1017122            8        10         10             8            7
##   Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
## 1           1           3               1       1    benign
## 2          10           3               2       1    benign
## 3           2           3               1       1    benign
## 4           4           3               7       1    benign
## 5           1           3               1       1    benign
## 6          10           9               7       1 malignant
# Per-column summary of the raw data. In the output below, the only column
# reporting missing values is Bare.nuclei (16 NA's).
summary(BreastCancer)
##       Id             Cl.thickness   Cell.size     Cell.shape  Marg.adhesion
##  Length:699         1      :145   1      :384   1      :353   1      :407  
##  Class :character   5      :130   10     : 67   2      : 59   2      : 58  
##  Mode  :character   3      :108   3      : 52   10     : 58   3      : 58  
##                     4      : 80   2      : 45   3      : 56   10     : 55  
##                     10     : 69   4      : 40   4      : 44   4      : 33  
##                     2      : 50   5      : 30   5      : 34   8      : 25  
##                     (Other):117   (Other): 81   (Other): 95   (Other): 63  
##   Epith.c.size  Bare.nuclei   Bl.cromatin  Normal.nucleoli    Mitoses   
##  2      :386   1      :402   2      :166   1      :443     1      :579  
##  3      : 72   10     :132   3      :165   10     : 61     2      : 35  
##  4      : 48   2      : 30   1      :152   3      : 44     3      : 33  
##  1      : 47   5      : 30   7      : 73   2      : 36     10     : 14  
##  6      : 41   3      : 28   4      : 40   8      : 24     4      : 12  
##  5      : 39   (Other): 61   5      : 34   6      : 22     7      :  9  
##  (Other): 66   NA's   : 16   (Other): 69   (Other): 69     (Other): 17  
##        Class    
##  benign   :458  
##  malignant:241  
##                 
##                 
##                 
##                 
## 

Part 1: Bivariate Analysis

Formulate Statistical Questions

Q1. Is there a significant difference in Clump Thickness between benign and malignant tumors?

Q2. Do malignant tumors have significantly larger epithelial cell size than benign tumors?

Q3. Is there a relationship between Clump Thickness and Uniformity of Cell Size?

Q1. Is there a significant difference in Clump Thickness between benign and malignant tumors?

# Build the analysis table from the raw data:
#   1. drop every row that contains a missing value;
#   2. convert the nine cytology score columns to numbers, going through
#      as.character() so that the level labels (not the internal factor
#      codes) become the numeric values;
#   3. make sure Class is a factor for the group comparisons.
clean_data <- BreastCancer %>%
  drop_na() %>%
  mutate(
    across(
      .cols = c(
        Cl.thickness, Cell.size, Cell.shape, Marg.adhesion, Epith.c.size,
        Bare.nuclei, Bl.cromatin, Normal.nucleoli, Mitoses
      ),
      .fns = \(score) as.numeric(as.character(score))
    ),
    Class = as.factor(Class)
  )
# Q1 plot: clump thickness compared between the two tumor classes.
clean_data %>%
  ggbetweenstats(
    x = Class,
    y = Cl.thickness,
    title = "Comparison of Clump Thickness by Tumor Class",
    xlab = "Tumor Type",
    ylab = "Clump Thickness"
  )

Interpretation:

The plot shows that malignant tumors have a much higher average clump thickness (mean = 7.19) compared to benign tumors (mean = 2.96).

A statistical test (Welch’s t-test) confirms that this difference is highly significant (p < 0.001), meaning a difference this large would be very unlikely to appear by chance if the two groups really had the same average clump thickness.

In simple terms, tumors that are cancerous tend to have thicker cell clumps than those that are not cancerous. This suggests that clump thickness may be an important feature when distinguishing between benign and malignant breast tumors.

Q2: Do malignant tumors have significantly larger epithelial cell sizes than benign tumors?

# Q2 plot: epithelial cell size compared between the two tumor classes.
clean_data %>%
  ggbetweenstats(
    x = Class,
    y = Epith.c.size,
    title = "Epithelial Cell Size by Tumor Class",
    xlab = "Tumor Type",
    ylab = "Epithelial Cell Size"
  )

Interpretation:

The plot compares epithelial cell size between benign and malignant tumors. On average:

Benign tumors have a smaller cell size (mean = 2.11)

Malignant tumors have a larger cell size (mean = 5.33)

The statistical test shows a very strong difference between the two groups (p < 0.001), meaning the result is statistically significant and not likely due to random chance.

This suggests that larger epithelial cell size is associated with malignant (cancerous) tumors. Therefore, this feature could be helpful in identifying whether a tumor is cancerous.

Q3: Is there a relationship between Clump Thickness and Uniformity of Cell Size?

# Q3 plot: scatter plot of clump thickness against the cell-size score, with
# marginal histograms on both axes.
# Both variables only take whole-number scores (1-10), so the marginal
# histograms use one bin per score (binwidth = 1) instead of falling back to
# 30 bins. Supplying these argument lists replaces the package defaults, so
# the default fill/colour/na.rm settings are restated to keep the same look.
ggscatterstats(
  data = clean_data,
  x = Cl.thickness,
  y = Cell.size,
  title = "Relationship Between Clump Thickness and Cell Size",
  xlab = "Clump Thickness",
  ylab = "Uniformity of Cell Size",
  xsidehistogram.args = list(
    binwidth = 1, fill = "#009E73", color = "black", na.rm = TRUE
  ),
  ysidehistogram.args = list(
    binwidth = 1, fill = "#D55E00", color = "black", na.rm = TRUE
  )
)
## Registered S3 method overwritten by 'ggside':
##   method from   
##   +.gg   ggplot2
## `stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.

Interpretation:

The plot shows a strong positive relationship between clump thickness and uniformity of cell size.

The Pearson correlation coefficient is 0.64, which means that as clump thickness increases, the cell-size uniformity score tends to increase as well.

The p-value is very small (p < 0.001), so the result is statistically significant.

The blue trend line shows this upward trend clearly.

In simple terms, tumors with thicker cell clumps also tend to have less uniform (larger) cell sizes, which could be a sign of abnormal or cancerous growth.

Part 2: Interactive Plots with plotly (Breast Cancer Dataset)

library(ggplot2)
library(plotly)

# Jittered points of clump thickness for each tumor class. The plot is built
# from clean_data, so the raw BreastCancer data set is not reloaded here.
# The `text` aesthetic is not drawn by any layer; it carries the label that
# ggplotly(..., tooltip = "text") shows on hover.
p_clump <- ggplot(
  clean_data,
  aes(
    x = Class,
    y = Cl.thickness,
    color = Class,
    text = paste(
      "Tumor Type: ", Class, "<br>",
      "Clump Thickness: ", Cl.thickness
    )
  )
) +
  # Scores are whole numbers, so jitter spreads out points that would
  # otherwise sit exactly on top of each other.
  geom_jitter(width = 0.3, height = 0.1, alpha = 0.7) +
  labs(
    title = "Clump Thickness by Tumor Type",
    x = "Tumor Type",
    y = "Clump Thickness"
  ) +
  theme_minimal()

# Convert the ggplot into an interactive plotly widget whose hover tooltip
# shows only the custom `text` aesthetic.
# The mode bar (toolbar) is hidden with config(displayModeBar = FALSE):
# visibility is a plot configuration option, while layout's `modebar` entry
# only covers styling and button selection and has no `visible` attribute.
fig_clump <- ggplotly(p_clump, tooltip = "text") %>%
  config(displayModeBar = FALSE)

# Print the widget so it is rendered.
fig_clump

Interpretation:

Malignant tumors tend to have higher clump thickness compared to benign tumors. This suggests that clump thickness can help differentiate between the two tumor types.

As the scatter plot for Q3 in Part 1 showed, there is also a positive relationship between clump thickness and cell size. As clump thickness increases, cell size also tends to increase—especially in malignant tumors.

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.