# Load necessary libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ neuralnet::compute() masks dplyr::compute()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::lag()         masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
# Read the Haberman survival data from a local CSV file.
# The file has no header row, so columns arrive as V1..V4 and are named below.
haberman <- read.csv(
  "C://Users//shiva mishra//OneDrive//Desktop//semester 3//machine learning 2//datasets//haberman.csv",
  header = FALSE
)

# Give the four columns descriptive names
names(haberman) <- c(
  "Age",
  "Year_of_Operation",
  "Axillary_Nodes",
  "Survival_Status"
)

# Recode the label for classification: original code 2 becomes 0, every other
# code becomes 1; the result is stored as a factor with levels "0" and "1".
haberman[["Survival_Status"]] <- factor(
  ifelse(haberman[["Survival_Status"]] == 2, 0, 1)
)

# Inspect column types and the first few values
str(haberman)
## 'data.frame':    306 obs. of  4 variables:
##  $ Age              : int  30 30 30 31 31 33 33 34 34 34 ...
##  $ Year_of_Operation: int  64 62 65 59 65 58 60 59 66 58 ...
##  $ Axillary_Nodes   : int  1 3 0 2 4 10 0 0 9 30 ...
##  $ Survival_Status  : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 1 2 ...
# Per-column summary statistics (class counts for the factor label)
summary(haberman)
##       Age        Year_of_Operation Axillary_Nodes   Survival_Status
##  Min.   :30.00   Min.   :58.00     Min.   : 0.000   0: 81          
##  1st Qu.:44.00   1st Qu.:60.00     1st Qu.: 0.000   1:225          
##  Median :52.00   Median :63.00     Median : 1.000                  
##  Mean   :52.46   Mean   :62.85     Mean   : 4.026                  
##  3rd Qu.:60.75   3rd Qu.:65.75     3rd Qu.: 4.000                  
##  Max.   :83.00   Max.   :69.00     Max.   :52.000
# Histogram of patient age, using 5-year-wide bins
ggplot(data = haberman, mapping = aes(x = Age)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "blue") +
  labs(title = "Age Distribution of Patients") +
  theme_minimal()

# Histogram of the axillary node count, one bin per node count
ggplot(data = haberman, mapping = aes(x = Axillary_Nodes)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "green") +
  labs(title = "Axillary Nodes Distribution") +
  theme_minimal()

# Boxplots comparing the two Survival_Status classes (factor levels "0" and
# "1"). Each plot is drawn once; the script previously repeated both plots
# verbatim, rendering every figure twice.

# Boxplot for Survival_Status vs Age
ggplot(haberman, aes(x = Survival_Status, y = Age)) +
  geom_boxplot(fill = "orange", color = "black") +
  labs(title = "Survival Status vs Age", x = "Survival Status", y = "Age")

# Boxplot for Survival_Status vs Axillary Nodes
ggplot(haberman, aes(x = Survival_Status, y = Axillary_Nodes)) +
  geom_boxplot(fill = "purple", color = "black") +
  labs(
    title = "Survival Status vs Axillary Nodes",
    x = "Survival Status",
    y = "Axillary Nodes"
  )

# Min-max rescale the three numeric predictors to the [0, 1] interval.
# The minimum and maximum are taken over the full dataset; the factor label
# Survival_Status is left untouched.
haberman_scaled <- haberman %>%
  mutate(
    across(
      c(Age, Year_of_Operation, Axillary_Nodes),
      function(v) (v - min(v)) / (max(v) - min(v))
    )
  )

# Split data into training (70%) and test (30%).
# The seed is fixed so the same rows are drawn on every run.
set.seed(123)
n_obs <- nrow(haberman_scaled)

# seq_len() is safe when the data frame is empty (1:0 would yield c(1, 0)),
# and floor() makes the truncation of the fractional sample size explicit.
train_index <- sample(seq_len(n_obs), size = floor(0.7 * n_obs))

# Use a logical mask for the complement: negative indexing with an empty
# train_index would return zero rows instead of all rows.
in_train <- seq_len(n_obs) %in% train_index
train_data <- haberman_scaled[train_index, ]
test_data <- haberman_scaled[!in_train, ]
# Fit the network on the training split only: three predictors feeding a
# single hidden layer of three units. linear.output = FALSE applies the
# activation function to the output layer as well (classification setting).
# Survival_Status is a factor, so the fitted network has one output per level.
nn_model <- neuralnet(
  formula = Survival_Status ~ Age + Year_of_Operation + Axillary_Nodes,
  data = train_data,
  hidden = 3,
  linear.output = FALSE
)

# Draw the fitted network diagram
plot(nn_model)
# Evaluate the fitted network on the held-out test rows only.
test_inputs <- test_data[, c("Age", "Year_of_Operation", "Axillary_Nodes")]
actual_class <- test_data$Survival_Status
class_levels <- levels(actual_class)

# predict() returns a matrix with one row per test observation. Because the
# model was trained on a factor response, it has one column per class (in
# level order), not a single probability column.
predictions <- predict(nn_model, test_inputs)

# Convert predictions to class labels.
# Thresholding the whole matrix at 0.5 produced nrow * ncol values (184 for
# 92 test rows), which is what caused the length mismatch. Instead, pick the
# class with the highest output for each row.
if (ncol(predictions) > 1) {
  best_column <- max.col(predictions, ties.method = "first")
  predicted_labels <- class_levels[best_column]
} else {
  # Single-output network: fall back to a 0.5 threshold on that one column
  predicted_labels <- ifelse(predictions[, 1] > 0.5, 1, 0)
}

# Pin the levels so the confusion matrix stays square even when the model
# predicts only one class; otherwise diag() would read the wrong cells.
predicted_class <- factor(predicted_labels, levels = class_levels)

# Sanity check: both lengths should equal the number of rows in test_data
length(predicted_class)
length(actual_class)

# Only proceed if lengths match
if (length(predicted_class) == length(actual_class)) {
  # Create confusion matrix: Predicted vs Actual
  confusion_matrix <- table(Predicted = predicted_class, Actual = actual_class)
  print(confusion_matrix)

  # Calculate accuracy: correctly classified rows over all test rows.
  # print() is explicit so the result is also shown when the script is
  # source()d, where the value of an if-block is not auto-printed.
  accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
  print(paste("Accuracy: ", round(accuracy * 100, 2), "%"))
} else {
  print("Error: Length mismatch between predicted and actual values")
}