# Install the packages this script attaches below, but only the ones that are
# not already available. An unconditional install.packages() call re-downloads
# the package (and needs network access) on every run of the script.
required_pkgs <- c("tidyverse", "plotly", "mlbench")
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
## Installing package into 'C:/Users/Rimalyn Magallamento/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Rimalyn Magallamento\AppData\Local\Temp\RtmpEPM13M\downloaded_packages
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.4.3
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the BreastCancer data set into the workspace and preview its first
# six rows.
data(list = "BreastCancer")
head(BreastCancer, n = 6L)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 benign
## 2 10 3 2 1 benign
## 3 2 3 1 1 benign
## 4 4 3 7 1 benign
## 5 1 3 1 1 benign
## 6 10 9 7 1 malignant
# Structural overview of the raw data: number of rows and columns, the type of
# every column, the column names, and a per-column summary.
c(nrow(BreastCancer), ncol(BreastCancer))
## [1] 699 11
str(object = BreastCancer)
## 'data.frame': 699 obs. of 11 variables:
## $ Id : chr "1000025" "1002945" "1015425" "1016277" ...
## $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
names(BreastCancer)
## [1] "Id" "Cl.thickness" "Cell.size" "Cell.shape"
## [5] "Marg.adhesion" "Epith.c.size" "Bare.nuclei" "Bl.cromatin"
## [9] "Normal.nucleoli" "Mitoses" "Class"
summary(object = BreastCancer)
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion
## Length:699 1 :145 1 :384 1 :353 1 :407
## Class :character 5 :130 10 : 67 2 : 59 2 : 58
## Mode :character 3 :108 3 : 52 10 : 58 3 : 58
## 4 : 80 2 : 45 3 : 56 10 : 55
## 10 : 69 4 : 40 4 : 44 4 : 33
## 2 : 50 5 : 30 5 : 34 8 : 25
## (Other):117 (Other): 81 (Other): 95 (Other): 63
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses
## 2 :386 1 :402 2 :166 1 :443 1 :579
## 3 : 72 10 :132 3 :165 10 : 61 2 : 35
## 4 : 48 2 : 30 1 :152 3 : 44 3 : 33
## 1 : 47 5 : 30 7 : 73 2 : 36 10 : 14
## 6 : 41 3 : 28 4 : 40 8 : 24 4 : 12
## 5 : 39 (Other): 61 5 : 34 6 : 22 7 : 9
## (Other): 66 NA's : 16 (Other): 69 (Other): 69 (Other): 17
## Class
## benign :458
## malignant:241
##
##
##
##
##
# Total number of missing cells in the raw data.
sum(is.na(BreastCancer))
## [1] 16
# Drop the sample identifier; it is not used in the analysis. The column is
# selected by name so the step does not depend on column order.
bc <- BreastCancer[, names(BreastCancer) != "Id"]
# Missing values are already encoded as NA (the summary above reports them in
# Bare.nuclei), so no placeholder recoding is needed. Class is already a
# factor with levels "benign" and "malignant" and is left untouched.
#
# The predictors are stored as (ordered) factors whose labels are digits.
# Go through character so the label (e.g. "10") becomes the number, not the
# internal level code.
predictor_cols <- setdiff(names(bc), "Class")
bc[predictor_cols] <- lapply(
  bc[predictor_cols],
  function(x) as.numeric(as.character(x))
)
# Keep only rows without missing values; `bc` itself keeps all rows.
cleanbc <- na.omit(bc)
sum(is.na(cleanbc))
## [1] 0
## Statistical Questions
## 1. Do malignant tumors have significantly higher clump thickness than benign ones?
# Welch two-sample t-test (two-sided, 95% confidence interval) comparing mean
# clump thickness between the benign and malignant groups.
t.test(
  Cl.thickness ~ Class,
  data = cleanbc,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: Cl.thickness by Class
## t = -23.927, df = 361.43, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group benign and group malignant is not equal to 0
## 95 percent confidence interval:
## -4.571510 -3.877131
## sample estimates:
## mean in group benign mean in group malignant
## 2.963964 7.188285
# Pearson correlation test between cell size and cell shape (complete cases).
cor.test(
  cleanbc$Cell.size, cleanbc$Cell.shape,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
##
## Pearson's product-moment correlation
##
## data: cleanbc$Cell.size and cleanbc$Cell.shape
## t = 56.283, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8929850 0.9196562
## sample estimates:
## cor
## 0.9072282
# Boxplot of clump thickness for each tumor class, filled by class.
ggplot(
  data = cleanbc,
  mapping = aes(x = Class, y = Cl.thickness, fill = Class)
) +
  geom_boxplot() +
  labs(title = "Clump Thickness by Tumor Type")
# Welch two-sample t-test (two-sided, 95% confidence interval) comparing mean
# mitoses between the benign and malignant groups.
t.test(
  Mitoses ~ Class,
  data = cleanbc,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: Mitoses by Class
## t = -9.1697, df = 248.17, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group benign and group malignant is not equal to 0
## 95 percent confidence interval:
## -1.867370 -1.207021
## sample estimates:
## mean in group benign mean in group malignant
## 1.065315 2.602510
# Boxplot of mitoses for each tumor class, filled by class.
ggplot(
  data = cleanbc,
  mapping = aes(x = Class, y = Mitoses, fill = Class)
) +
  geom_boxplot() +
  labs(title = "Mitoses by Class")
# Pearson correlation test between clump thickness and cell size.
cor.test(
  cleanbc$Cl.thickness, cleanbc$Cell.size,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
##
## Pearson's product-moment correlation
##
## data: cleanbc$Cl.thickness and cleanbc$Cell.size
## t = 21.879, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5961976 0.6845089
## sample estimates:
## cor
## 0.6424815
# Scatter plot of cell size against clump thickness with a fitted linear
# trend line and its confidence band.
ggplot(
  data = cleanbc,
  mapping = aes(x = Cl.thickness, y = Cell.size)
) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "Clump Thickness vs Cell Size")
## `geom_smooth()` using formula = 'y ~ x'
## 4. Is there a relationship between marginal adhesion and bare nuclei?
# Pearson correlation test between marginal adhesion and bare nuclei.
# Bare.nuclei needs no conversion here: every predictor column, including this
# one, was already converted to numeric when the data was cleaned, so
# re-converting it would be a no-op.
cor.test(cleanbc$Marg.adhesion, cleanbc$Bare.nuclei)
##
## Pearson's product-moment correlation
##
## data: cleanbc$Marg.adhesion and cleanbc$Bare.nuclei
## t = 23.594, df = 681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6271832 0.7099493
## sample estimates:
## cor
## 0.6706483
# Scatter plot of bare nuclei against marginal adhesion with a fitted linear
# trend line and its confidence band.
ggplot(
  data = cleanbc,
  mapping = aes(x = Marg.adhesion, y = Bare.nuclei)
) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "Marginal Adhesion vs Bare Nuclei")
## `geom_smooth()` using formula = 'y ~ x'
# Pearson correlation test between cell shape and cell size.
# NOTE(review): this runs on `bc` (all rows, including those with missing
# Bare.nuclei) while the earlier tests use `cleanbc` -- confirm this is
# intended.
cor.test(
  bc$Cell.shape, bc$Cell.size,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
##
## Pearson's product-moment correlation
##
## data: bc$Cell.shape and bc$Cell.size
## t = 56.818, df = 697, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8927650 0.9192197
## sample estimates:
## cor
## 0.9068819
# Scatter plot of cell size against cell shape with a fitted linear trend line
# and its confidence band, converted to an interactive plotly widget.
# NOTE(review): the plot is built from `bc`, not `cleanbc`, despite the object
# name `p_cleanbc` -- confirm which data frame is intended.
p_cleanbc <- ggplot(bc, aes(x = Cell.shape, y = Cell.size)) +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "darkred") +
  labs(
    title = "Relationship Between Cell Shape and Cell Size",
    x = "Uniformity of Cell Shape",
    y = "Uniformity of Cell Size"
  )
# Show the x and y values on hover. The plot maps no `text` aesthetic, so
# tooltip = "text" would leave the hover labels empty.
# The mode bar is hidden with config(displayModeBar = FALSE), the documented
# plotly switch; `modebar = list(visible = FALSE)` is not a documented layout
# attribute.
fig_bc <- ggplotly(p_cleanbc, tooltip = c("x", "y")) %>%
  plotly::config(displayModeBar = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Show plot: evaluating the object at top level auto-prints the interactive
# widget stored in fig_bc.
fig_bc