This analysis explores the Haberman dataset, which contains data from patients who have undergone surgery for breast cancer. We will perform Exploratory Data Analysis (EDA) and construct a neural network model to predict survival status.
# Load necessary libraries
library(tidyverse) # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret) # For data preprocessing
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(neuralnet) # For neural network modeling
##
## Attaching package: 'neuralnet'
##
## The following object is masked from 'package:dplyr':
##
## compute
# Load the Haberman dataset from an online source
url <- "https://raw.githubusercontent.com/jbrownlee/Datasets/master/haberman.csv"
haberman <- read.csv(url, header = FALSE)
# Rename columns for clarity
colnames(haberman) <- c("Age", "Year_of_Operation", "Positive_Axillary_Nodes", "Survival_Status")
# View the structure and summary of the dataset
str(haberman)
## 'data.frame': 306 obs. of 4 variables:
## $ Age : int 30 30 30 31 31 33 33 34 34 34 ...
## $ Year_of_Operation : int 64 62 65 59 65 58 60 59 66 58 ...
## $ Positive_Axillary_Nodes: int 1 3 0 2 4 10 0 0 9 30 ...
## $ Survival_Status : int 1 1 1 1 1 1 1 2 2 1 ...
summary(haberman)
## Age Year_of_Operation Positive_Axillary_Nodes Survival_Status
## Min. :30.00 Min. :58.00 Min. : 0.000 Min. :1.000
## 1st Qu.:44.00 1st Qu.:60.00 1st Qu.: 0.000 1st Qu.:1.000
## Median :52.00 Median :63.00 Median : 1.000 Median :1.000
## Mean :52.46 Mean :62.85 Mean : 4.026 Mean :1.265
## 3rd Qu.:60.75 3rd Qu.:65.75 3rd Qu.: 4.000 3rd Qu.:2.000
## Max. :83.00 Max. :69.00 Max. :52.000 Max. :2.000
# Visualizing the distribution of Age
ggplot(haberman, aes(x = Age)) +
geom_histogram(bins = 20, fill = "blue", alpha = 0.7) +
labs(title = "Age Distribution", x = "Age", y = "Frequency")
# Visualizing the relationship between Age and Positive Axillary Nodes
ggplot(haberman, aes(x = Age, y = Positive_Axillary_Nodes, color = factor(Survival_Status))) +
geom_point(alpha = 0.7) +
labs(title = "Age vs Positive Axillary Nodes", x = "Age", y = "Positive Axillary Nodes") +
scale_color_discrete(name = "Survival Status", labels = c("Survived", "Died"))
# Bar plot of survival status
ggplot(haberman, aes(x = factor(Survival_Status))) +
geom_bar(fill = "orange", alpha = 0.7) +
labs(title = "Survival Status Count", x = "Survival Status", y = "Count") +
scale_x_discrete(labels = c("1" = "Survived", "2" = "Died"))
# Convert Survival_Status to a factor
haberman$Survival_Status <- as.factor(haberman$Survival_Status)
# Normalize the data
normalize <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
haberman_norm <- as.data.frame(lapply(haberman[, -4], normalize))
haberman_norm$Survival_Status <- haberman$Survival_Status
# Split the data into training and test sets
set.seed(123)
train_index <- createDataPartition(haberman_norm$Survival_Status, p = 0.7, list = FALSE)
train_data <- haberman_norm[train_index, ]
test_data <- haberman_norm[-train_index, ]
# Define the formula for the neural network
formula_nn <- Survival_Status ~ Age + Year_of_Operation + Positive_Axillary_Nodes
# Train the neural network
set.seed(123)
nn_model <- neuralnet(formula_nn, data = train_data, hidden = c(5, 3), linear.output = FALSE)
# Plot the neural network
plot(nn_model)
# Predict using the trained model
nn_predictions <- compute(nn_model, test_data[, -4])$net.result
# Convert predictions to binary class (1 or 2)
nn_pred_class <- ifelse(nn_predictions > 0.5, 2, 1)
# Check lengths of predictions and actuals
length(nn_pred_class)
## [1] 182
length(test_data$Survival_Status)
## [1] 91