# Load necessary libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ neuralnet::compute() masks dplyr::compute()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
# Read the Haberman survival data from a local CSV file. The file is read
# with header = FALSE, so the first line is treated as data, not column names.
haberman <- read.csv(
  file = "C://Users//shiva mishra//OneDrive//Desktop//semester 3//machine learning 2//datasets//haberman.csv",
  header = FALSE
)
# Give the four columns descriptive names
names(haberman) <- c(
  "Age",
  "Year_of_Operation",
  "Axillary_Nodes",
  "Survival_Status"
)
# Recode the outcome as a two-level factor for classification:
# raw code 2 becomes "0", every other code becomes "1"
haberman$Survival_Status <- factor(
  ifelse(haberman[["Survival_Status"]] == 2, 0, 1)
)
# Inspect column types and the first few values
str(haberman)
## 'data.frame': 306 obs. of 4 variables:
## $ Age : int 30 30 30 31 31 33 33 34 34 34 ...
## $ Year_of_Operation: int 64 62 65 59 65 58 60 59 66 58 ...
## $ Axillary_Nodes : int 1 3 0 2 4 10 0 0 9 30 ...
## $ Survival_Status : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 1 2 ...
# Per-column summary statistics (class counts for the factor column)
summary(haberman)
## Age Year_of_Operation Axillary_Nodes Survival_Status
## Min. :30.00 Min. :58.00 Min. : 0.000 0: 81
## 1st Qu.:44.00 1st Qu.:60.00 1st Qu.: 0.000 1:225
## Median :52.00 Median :63.00 Median : 1.000
## Mean :52.46 Mean :62.85 Mean : 4.026
## 3rd Qu.:60.75 3rd Qu.:65.75 3rd Qu.: 4.000
## Max. :83.00 Max. :69.00 Max. :52.000
# Histogram of patient age, using bins that are 5 units wide
ggplot(data = haberman, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(title = "Age Distribution of Patients") +
  theme_minimal()

# Histogram of the axillary node count, one bin per unit
ggplot(data = haberman, mapping = aes(x = Axillary_Nodes)) +
  geom_histogram(binwidth = 1, fill = "green", color = "black") +
  labs(title = "Axillary Nodes Distribution") +
  theme_minimal()

# Compare each numeric predictor across the two survival classes.
# Each comparison is rendered exactly once.

# Boxplot of Age grouped by Survival_Status
ggplot(haberman, aes(x = Survival_Status, y = Age)) +
  geom_boxplot(fill = "orange", color = "black") +
  labs(title = "Survival Status vs Age", x = "Survival Status", y = "Age")

# Boxplot of Axillary_Nodes grouped by Survival_Status
ggplot(haberman, aes(x = Survival_Status, y = Axillary_Nodes)) +
  geom_boxplot(fill = "purple", color = "black") +
  labs(
    title = "Survival Status vs Axillary Nodes",
    x = "Survival Status",
    y = "Axillary Nodes"
  )

# Min-max scale `x` using the minimum and maximum of `reference`.
#
# x         numeric vector to rescale.
# reference numeric vector whose range defines the scaling (defaults to `x`).
#
# Returns a numeric vector the same length as `x`. Values of `x` outside the
# range of `reference` map outside [0, 1]. A constant `reference` yields all
# zeros instead of dividing by zero.
min_max_scale <- function(x, reference = x) {
  bounds <- range(reference, na.rm = TRUE)
  span <- bounds[2] - bounds[1]
  if (span == 0) {
    return(rep(0, length(x)))
  }
  (x - bounds[1]) / span
}

# Split data into training (70%) and test (30%). The row indices are drawn
# before scaling so the scaling bounds can be taken from training rows only.
set.seed(123)
n_obs <- nrow(haberman)
# floor() makes the truncation of the 70% sample size explicit
train_index <- sample(seq_len(n_obs), floor(0.7 * n_obs))
stopifnot(length(train_index) > 0)

# Normalize the numeric columns. Bounds come from the training rows only, so
# no information from the test rows leaks into preprocessing.
feature_cols <- c("Age", "Year_of_Operation", "Axillary_Nodes")
haberman_scaled <- haberman
haberman_scaled[feature_cols] <- lapply(
  haberman[feature_cols],
  function(x) min_max_scale(x, reference = x[train_index])
)

train_data <- haberman_scaled[train_index, ]
test_data <- haberman_scaled[-train_index, ]
# Fit a neural network with one hidden layer of three units, using the
# training rows only. With linear.output = FALSE the activation function is
# also applied to the output units.
nn_model <- neuralnet(
  formula = Survival_Status ~ Age + Year_of_Operation + Axillary_Nodes,
  data = train_data,
  hidden = 3,
  linear.output = FALSE
)
# Draw the fitted network
plot(nn_model)
# Predict on the held-out test rows only
test_inputs <- test_data[, c("Age", "Year_of_Operation", "Axillary_Nodes")]
# predict() returns a matrix with one row per test observation. Because the
# response is a factor, the network has one output unit per class, so the
# matrix has one COLUMN PER CLASS rather than a single probability column.
# Thresholding the whole matrix would give nrow * n_classes values.
predictions <- predict(nn_model, test_inputs)

class_levels <- levels(train_data$Survival_Status)
# Fail loudly if the output shape is not what the decoding below expects
stopifnot(
  is.matrix(predictions),
  nrow(predictions) == nrow(test_data),
  ncol(predictions) == length(class_levels)
)

# Convert the per-class outputs to one label per row by taking the class whose
# output unit is largest.
# NOTE(review): assumes the output columns follow the factor level order
# ("0", "1") - confirm against the installed neuralnet version.
predicted_class <- factor(
  class_levels[max.col(predictions, ties.method = "first")],
  levels = class_levels
)

# Both lengths now equal the number of rows in test_data
length(predicted_class)
length(test_data$Survival_Status)

# Create confusion matrix: Predicted vs Actual. Both inputs are factors with
# the same levels, so the table is square even if one class is never
# predicted and diag() always holds the correctly classified counts.
confusion_matrix <- table(
  Predicted = predicted_class,
  Actual = test_data$Survival_Status
)
print(confusion_matrix)
# Calculate accuracy as the share of test rows on the diagonal
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy: ", round(accuracy * 100, 2), "%"))