# Install tidyverse only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into 'C:/Users/Rimalyn Magallamento/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Rimalyn Magallamento\AppData\Local\Temp\RtmpEPM13M\downloaded_packages
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.4.3
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the BreastCancer data set into the workspace and preview the first
# six rows.
data(list = "BreastCancer")
head(BreastCancer, n = 6L)
##        Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025            5         1          1             1            2
## 2 1002945            5         4          4             5            7
## 3 1015425            3         1          1             1            2
## 4 1016277            6         8          8             1            3
## 5 1017023            4         1          1             3            2
## 6 1017122            8        10         10             8            7
##   Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses     Class
## 1           1           3               1       1    benign
## 2          10           3               2       1    benign
## 3           2           3               1       1    benign
## 4           4           3               7       1    benign
## 5           1           3               1       1    benign
## 6          10           9               7       1 malignant
# Size of the raw data as (rows, columns).
c(nrow(BreastCancer), ncol(BreastCancer))
## [1] 699  11
# Compact per-column overview: storage type, levels and leading values.
utils::str(BreastCancer)
## 'data.frame':    699 obs. of  11 variables:
##  $ Id             : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ Cl.thickness   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
##  $ Cell.size      : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
##  $ Cell.shape     : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
##  $ Marg.adhesion  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
##  $ Epith.c.size   : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
##  $ Bare.nuclei    : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
##  $ Bl.cromatin    : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
##  $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
##  $ Mitoses        : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
##  $ Class          : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Column names of the raw data frame.
names(BreastCancer)
##  [1] "Id"              "Cl.thickness"    "Cell.size"       "Cell.shape"     
##  [5] "Marg.adhesion"   "Epith.c.size"    "Bare.nuclei"     "Bl.cromatin"    
##  [9] "Normal.nucleoli" "Mitoses"         "Class"
# Per-column summary: level frequencies for the factor columns, plus the
# count of missing values where a column has any (Bare.nuclei below).
summary(BreastCancer)
##       Id             Cl.thickness   Cell.size     Cell.shape  Marg.adhesion
##  Length:699         1      :145   1      :384   1      :353   1      :407  
##  Class :character   5      :130   10     : 67   2      : 59   2      : 58  
##  Mode  :character   3      :108   3      : 52   10     : 58   3      : 58  
##                     4      : 80   2      : 45   3      : 56   10     : 55  
##                     10     : 69   4      : 40   4      : 44   4      : 33  
##                     2      : 50   5      : 30   5      : 34   8      : 25  
##                     (Other):117   (Other): 81   (Other): 95   (Other): 63  
##   Epith.c.size  Bare.nuclei   Bl.cromatin  Normal.nucleoli    Mitoses   
##  2      :386   1      :402   2      :166   1      :443     1      :579  
##  3      : 72   10     :132   3      :165   10     : 61     2      : 35  
##  4      : 48   2      : 30   1      :152   3      : 44     3      : 33  
##  1      : 47   5      : 30   7      : 73   2      : 36     10     : 14  
##  6      : 41   3      : 28   4      : 40   8      : 24     4      : 12  
##  5      : 39   (Other): 61   5      : 34   6      : 22     7      :  9  
##  (Other): 66   NA's   : 16   (Other): 69   (Other): 69     (Other): 17  
##        Class    
##  benign   :458  
##  malignant:241  
##                 
##                 
##                 
##                 
## 
# Total number of missing cells, accumulated column by column.
sum(vapply(BreastCancer, function(column) sum(is.na(column)), integer(1)))
## [1] 16
# ---- Data preparation ----
# Drop the sample identifier by name rather than by position.
bc <- BreastCancer[, setdiff(names(BreastCancer), "Id")]

# Missing values are already encoded as NA in this data set (the raw data
# reports 16 of them), so no "?" placeholder replacement is required.

# Convert every predictor from (ordered) factor to numeric. Going through
# as.character() uses the level labels ("1".."10") instead of the internal
# integer codes, which matters for Mitoses (its levels skip "9").
predictor_cols <- setdiff(names(bc), "Class")
bc[predictor_cols] <- lapply(
  bc[predictor_cols],
  function(x) as.numeric(as.character(x))
)

# Keep Class as a two-level factor with "benign" as the first level.
bc$Class <- factor(bc$Class, levels = c("benign", "malignant"))

# Complete-case version used by the statistical questions below; the check
# confirms that no missing values remain.
cleanbc <- na.omit(bc)
sum(is.na(cleanbc))
## [1] 0

## Statistical Questions
## 1. Do malignant tumors have significantly higher clump thickness than benign ones?

# Two-sample t-test of clump thickness between the benign and malignant
# groups. As the printed result shows, this is the Welch (unequal-variance)
# form with a two-sided alternative.
# NOTE(review): the question is directional ("higher"); if a one-sided test
# is intended, consider alternative = "less" (benign minus malignant).
t.test(Cl.thickness ~ Class, data = cleanbc)
## 
##  Welch Two Sample t-test
## 
## data:  Cl.thickness by Class
## t = -23.927, df = 361.43, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group benign and group malignant is not equal to 0
## 95 percent confidence interval:
##  -4.571510 -3.877131
## sample estimates:
##    mean in group benign mean in group malignant 
##                2.963964                7.188285
# Pearson correlation test between the Cell.size and Cell.shape scores.
# NOTE(review): this pair is not part of question 1 (clump thickness) and is
# tested again on `bc` near the end of the script - confirm it belongs here.
cor.test(cleanbc$Cell.size, cleanbc$Cell.shape)
## 
##  Pearson's product-moment correlation
## 
## data:  cleanbc$Cell.size and cleanbc$Cell.shape
## t = 56.283, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8929850 0.9196562
## sample estimates:
##       cor 
## 0.9072282
# Boxplots of clump thickness for each tumor class, filled by class.
library(ggplot2)
ggplot(data = cleanbc) +
  aes(x = Class, y = Cl.thickness, fill = Class) +
  geom_boxplot() +
  ggtitle("Clump Thickness by Tumor Type")

## 2. Is there a difference in the number of mitoses between benign and malignant tumors?

# Welch two-sample t-test (two-sided) comparing the mean mitoses score of the
# benign and malignant groups.
t.test(Mitoses ~ Class, data = cleanbc)
## 
##  Welch Two Sample t-test
## 
## data:  Mitoses by Class
## t = -9.1697, df = 248.17, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group benign and group malignant is not equal to 0
## 95 percent confidence interval:
##  -1.867370 -1.207021
## sample estimates:
##    mean in group benign mean in group malignant 
##                1.065315                2.602510
# Distribution of the mitoses score within each tumor class.
ggplot(
  data = cleanbc,
  mapping = aes(x = Class, y = Mitoses, fill = Class)
) +
  geom_boxplot() +
  ggtitle(label = "Mitoses by Class")

## 3. Is there a correlation between clump thickness and cell size?

# Pearson correlation test between the clump thickness and cell size scores.
cor.test(cleanbc$Cl.thickness, cleanbc$Cell.size)
## 
##  Pearson's product-moment correlation
## 
## data:  cleanbc$Cl.thickness and cleanbc$Cell.size
## t = 21.879, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5961976 0.6845089
## sample estimates:
##       cor 
## 0.6424815
# Clump thickness against cell size with a fitted linear trend.
# Both scores take only whole-number values, so many observations share the
# same coordinates. geom_count() sizes each point by the number of
# observations at that location instead of drawing them on top of each other.
ggplot(cleanbc, aes(x = Cl.thickness, y = Cell.size)) +
  geom_count(alpha = 0.6) +
  geom_smooth(method = "lm") +
  labs(title = "Clump Thickness vs Cell Size", size = "Observations")
## `geom_smooth()` using formula = 'y ~ x'

## 4. Is there a relationship between marginal adhesion and bare nuclei?

# Bare.nuclei was already converted to numeric during data preparation, so
# check that invariant explicitly instead of silently re-coercing the column.
stopifnot(is.numeric(cleanbc$Bare.nuclei))
# Pearson correlation test between marginal adhesion and bare nuclei scores.
cor.test(cleanbc$Marg.adhesion, cleanbc$Bare.nuclei)
## 
##  Pearson's product-moment correlation
## 
## data:  cleanbc$Marg.adhesion and cleanbc$Bare.nuclei
## t = 23.594, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6271832 0.7099493
## sample estimates:
##       cor 
## 0.6706483
# Marginal adhesion against bare nuclei with a fitted linear trend.
# Both scores take only whole-number values, so many observations share the
# same coordinates. geom_count() sizes each point by the number of
# observations at that location instead of drawing them on top of each other.
ggplot(cleanbc, aes(x = Marg.adhesion, y = Bare.nuclei)) +
  geom_count(alpha = 0.6) +
  geom_smooth(method = "lm") +
  labs(title = "Marginal Adhesion vs Bare Nuclei", size = "Observations")
## `geom_smooth()` using formula = 'y ~ x'

# Correlation test between Cell Shape and Cell Size
# NOTE(review): this runs on `bc` (all rows; df = 697 in the result below)
# whereas the other tests use `cleanbc`, and the same pair of columns was
# already tested earlier on `cleanbc` - confirm which data set is intended.
cor.test(bc$Cell.shape, bc$Cell.size)
## 
##  Pearson's product-moment correlation
## 
## data:  bc$Cell.shape and bc$Cell.size
## t = 56.818, df = 697, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8927650 0.9192197
## sample estimates:
##       cor 
## 0.9068819
# Interactive scatter plot of cell shape against cell size with a fitted
# linear trend. It is built from `bc` (all rows), the same data used by the
# correlation test directly above.
library(ggplot2)
p_cleanbc <- ggplot(bc, aes(x = Cell.shape, y = Cell.size)) +
  # Semi-transparent points: observations stacked on the same whole-number
  # coordinates show up as darker spots.
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred") +
  labs(
    title = "Relationship Between Cell Shape and Cell Size",
    x = "Uniformity of Cell Shape",
    y = "Uniformity of Cell Size"
  )

# Convert to an interactive plotly figure.
# - The plot maps no `text` aesthetic, so tooltip = "text" would leave the
#   hover labels empty; show the x and y values instead.
# - The mode bar is hidden through config(displayModeBar = FALSE); the layout
#   `modebar` attribute has no `visible` setting, so it cannot be hidden there.
fig_bc <- ggplotly(p_cleanbc, tooltip = c("x", "y")) %>%
  config(displayModeBar = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Show plot
fig_bc