# Load necessary libraries
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 4.3.3
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 4.3.3
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:neuralnet':
## 
##     compute
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
# Fetch the Haberman data file (read without a header row) and attach
# descriptive column names in a single step
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data"
data <- setNames(
  read.csv(url, header = FALSE),
  c("Age", "Year_Of_Operation", "+ve_Auxiliary_Nodes", "Survival_Status")
)
# Age extremes across all patients
cat("Oldest Patient Recorded:", max(data[["Age"]]), "\n")
## Oldest Patient Recorded: 83
cat("Youngest Patient Recorded:", min(data[["Age"]]), "\n")
## Youngest Patient Recorded: 30
# Histogram of patient ages
hist(
  data[["Age"]],
  main = "Distribution of Age",
  xlab = "Age",
  ylab = "Frequency",
  col = "red",
  border = "white"
)

# Bin patients into 10-year age groups and report, per group, the share of
# patients with Survival_Status == 1.
# The breaks are derived from the observed age range. With fixed breaks ending
# at 80 and right = FALSE, patients aged 80 or more (the oldest recorded
# patient is 83) fell outside every interval, were assigned an NA group and
# were silently dropped from the rates.
age_breaks <- seq(
  from = 10 * floor(min(data$Age) / 10),
  to = 10 * (floor(max(data$Age) / 10) + 1),
  by = 10
)
data$AgeGroups <- cut(data$Age, breaks = age_breaks, right = FALSE)
survival_rate <- tapply(data$Survival_Status == 1, data$AgeGroups, mean)
for (age_group in names(survival_rate)) {
  rate_pct <- round(survival_rate[age_group] * 100, 2)
  print(paste("Survival Rate for Age Group", age_group, ":", rate_pct, "%"))
}
# NOTE: the output below was captured with the original fixed breaks; a re-run
# additionally prints a row for the [80,90) group.
## [1] "Survival Rate for Age Group [30,40) : 90 %"
## [1] "Survival Rate for Age Group [40,50) : 67.86 %"
## [1] "Survival Rate for Age Group [50,60) : 73.74 %"
## [1] "Survival Rate for Age Group [60,70) : 70.97 %"
## [1] "Survival Rate for Age Group [70,80) : 75 %"
# Age extremes among the patients whose Survival_Status is 2
no_survived <- data[data[["Survival_Status"]] == 2, ]
cat("The Oldest Death Recorded:", max(no_survived[["Age"]]), "\n")
## The Oldest Death Recorded: 83
cat("The Youngest Death recorded:", min(no_survived[["Age"]]), "\n")
## The Youngest Death recorded: 34
# Correlation coefficient between Age and Survival_Status (cor() with its
# default method), printed scaled by 100
data[["Survival_Status"]] <- as.numeric(data[["Survival_Status"]])
cat(
  "The correlation percentage between the age and survival:",
  round(with(data, cor(Age, Survival_Status)) * 100, 2),
  "%\n"
)
## The correlation percentage between the age and survival: 6.8 %
# Extremes of the positive auxiliary node count across all patients
cat(
  "Highest No. of Nodes recorded in a patient:",
  max(data[["+ve_Auxiliary_Nodes"]]),
  "\n"
)
## Highest No. of Nodes recorded in a patient: 52
cat(
  "Lowest No. of Nodes recorded in a patient:",
  min(data[["+ve_Auxiliary_Nodes"]]),
  "\n"
)
## Lowest No. of Nodes recorded in a patient: 0
# Distribution of Positive Auxiliary Nodes
# A vertical boxplot puts the node counts on the y-axis, so that is the axis
# labelled with the variable. The previous labels ("No. of Nodes" on x,
# "Frequency" on y) described a histogram rather than this plot.
boxplot(
  data$`+ve_Auxiliary_Nodes`,
  main = "Distribution of the No. of Positive Aux. Nodes",
  ylab = "No. of Nodes",
  col = "lightgreen"
)

# Mean node count per patient
cat("The average no. of Nodes presented:", mean(data$`+ve_Auxiliary_Nodes`), "\n")
## The average no. of Nodes presented: 4.026144
# Cross-tabulate age group against positive node count, then draw the counts
# as a tile heatmap shaded from white (low) to red (high)
heatmap_data <- as.data.frame(
  table(data[["AgeGroups"]], data[["+ve_Auxiliary_Nodes"]])
)
ggplot(heatmap_data, aes(x = Var2, y = Var1, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  ggtitle("Heatmap of Positive Nodes by Age Group") +
  xlab("Number of Positive Nodes") +
  ylab("Age Group")