BRIEF DESCRIPTION OF THE DATASET

The dataset consists of the following columns. Age: the age of the patient at the time of the operation (in years). Operation Year: the year in which the operation was performed (recorded as a two-digit year). Auxillary Nodes: the number of positive axillary lymph nodes detected, an indicator of cancer spread (the column name keeps the spelling "Auxillary"). Survival Status: the target variable, indicating whether the patient survived for 5 years or more after the operation. It is a binary outcome coded as 1 = the patient survived 5 years or longer, and 2 = the patient did not survive 5 years.

#IMPORTING DATASET

library(dplyr)
library(ggplot2)
library(neuralnet)
# Load the Haberman survival data.
# header = FALSE: the column names are assigned manually below, and the output
# recorded in this document shows 305 rows whose first record is the dataset's
# second patient. With the default header = TRUE the first data record is
# consumed as column names and silently dropped.
# NOTE(review): confirm the CSV really has no header line. Console output
# recorded elsewhere in this document was captured before this fix and still
# reflects the 305-row load.
df <- read.csv("/Users/lavasai/Downloads/haberman.csv", header = FALSE)

#ASSIGNING COLUMN NAMES

# Replace the default V1..V4 names with descriptive ones, in file column order.
names(df) <- c("Age", "OperationYear", "AuxillaryNode", "SurvivalStatus")

#UNDERSTANDING THE DATASET

# Column types and a preview of each column's values.
str(df)
## 'data.frame':    305 obs. of  4 variables:
##  $ Age           : int  30 30 31 31 33 33 34 34 34 34 ...
##  $ OperationYear : int  62 65 59 65 58 60 59 66 58 60 ...
##  $ AuxillaryNode : int  3 0 2 4 10 0 0 9 30 1 ...
##  $ SurvivalStatus: int  1 1 1 1 1 1 2 2 1 1 ...
# First six rows.
head(df)
##   Age OperationYear AuxillaryNode SurvivalStatus
## 1  30            62             3              1
## 2  30            65             0              1
## 3  31            59             2              1
## 4  31            65             4              1
## 5  33            58            10              1
## 6  33            60             0              1
# Column names after renaming.
names(df)
## [1] "Age"            "OperationYear"  "AuxillaryNode"  "SurvivalStatus"
# Count missing cells. is.null(df) yields a single FALSE for any data frame,
# so summing it reports 0 regardless of the data; is.na() tests every cell.
sum(is.na(df))
## [1] 0

#VISUALISING THE DATASET

# Histogram of patient age, 5-year bins
ggplot(data = df, mapping = aes(x = Age)) +
  geom_histogram(color = "white", fill = "blue", binwidth = 5) +
  ggtitle("Age Distribution") +
  xlab("Age") +
  ylab("Frequency")

# Histogram of the positive node count, bins of width 5
ggplot(data = df, mapping = aes(x = AuxillaryNode)) +
  geom_histogram(color = "white", fill = "red", binwidth = 5) +
  ggtitle("Auxillary Nodes Distribution") +
  xlab("Number of Nodes") +
  ylab("Frequency")

# Bar chart of operations per year; the year is converted to a factor so that
# each distinct year gets its own bar
ggplot(data = df, mapping = aes(x = as.factor(OperationYear))) +
  geom_bar(color = "white", fill = "green") +
  ggtitle("Operation Year Distribution") +
  xlab("Year of Operation") +
  ylab("Frequency")

# Survival status distribution (discrete, use geom_bar).
# SurvivalStatus is an integer class label (1 or 2), so it is converted to a
# factor: this gives a discrete x axis with one tick per class instead of a
# continuous numeric scale, matching how OperationYear is plotted.
ggplot(df, aes(x = as.factor(SurvivalStatus))) +
  geom_bar(fill = "purple", color = "white") +
  labs(title = "Survival Status Distribution", x = "Survival Status", y = "Frequency")

# Scatter-plot matrix of the first three columns; points are drawn in blue
# where SurvivalStatus equals 1 and in red otherwise
status_colours <- ifelse(df[["SurvivalStatus"]] == 1, "blue", "red")
pairs(
  df[, 1:3],
  main = "Pairwise Scatter Plots",
  col = status_colours,
  pch = 19
)

#NORMALISATION

# Min-max rescale a numeric vector to the range [0, 1].
#
# x: numeric vector.
# Returns a numeric vector the same length as x. Missing values stay missing
# and are ignored when computing the range. A vector whose non-missing values
# are all equal maps to 0 (instead of NaN from a 0 / 0 division).
normalize <- function(x) {
  # Nothing to rescale: avoid min()/max() warnings on empty or all-NA input.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  # Constant column: the range is zero, so define every known value as 0.
  if (span == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }
  (x - lo) / span
}

# Keep only the numeric columns. vapply() always returns a logical vector
# (sapply() may change its return type with the input), and drop = FALSE keeps
# a data frame even if only a single column qualifies.
numeric_columns <- df[, vapply(df, is.numeric, logical(1)), drop = FALSE]
# Rescale each numeric column with normalize().
df_norm <- as.data.frame(lapply(numeric_columns, normalize))

# Recode the target from {1, 2} to {0, 1}. Per the dataset description this
# makes 0 = survived 5 years or longer and 1 = did not survive 5 years. The
# value is taken from the original df, overwriting the rescaled column.
df_norm$SurvivalStatus <- as.numeric(df$SurvivalStatus) - 1

# Ensure it's binary (should only have 0s and 1s)
table(df_norm$SurvivalStatus)
## 
##   0   1 
## 224  81
# Check structure
str(df_norm)
## 'data.frame':    305 obs. of  4 variables:
##  $ Age           : num  0 0 0.0189 0.0189 0.0566 ...
##  $ OperationYear : num  0.3636 0.6364 0.0909 0.6364 0 ...
##  $ AuxillaryNode : num  0.0577 0 0.0385 0.0769 0.1923 ...
##  $ SurvivalStatus: num  0 0 0 0 0 0 1 1 0 0 ...

#NEURAL NETWORK

# Split the data into training (80%) and test (20%) sets.
# The seed is fixed so the same rows are drawn on every run.
set.seed(123)
# seq_len() is the safe way to build the row index (1:nrow() misbehaves on an
# empty data frame).
index <- sample(seq_len(nrow(df_norm)), round(0.80 * nrow(df_norm)))
trainset <- df_norm[index, ]
testset <- df_norm[-index, ]

# Train the neural network: one hidden layer of 3 units predicting
# SurvivalStatus from the three normalised predictors.
# linear.output = FALSE applies the activation function to the output unit.
# The cross-entropy error ("ce") takes log(output) and log(1 - output), so the
# output must lie in (0, 1). "tanh" ranges over (-1, 1), which produced the
# "NaNs produced" and "'err.fct' does not fit 'data' or 'act.fct'" warnings;
# the logistic activation keeps the output in (0, 1).
nn <- neuralnet(SurvivalStatus ~ Age + OperationYear + AuxillaryNode,
                data = trainset, hidden = 3,
                linear.output = FALSE,
                act.fct = "logistic",
                err.fct = "ce",
                likelihood = TRUE)
# Plot the neural network.
# rep = "best" draws the repetition with the smallest error on the currently
# active graphics device. Without rep, plot() on a neuralnet object opens a
# separate window per repetition, so nothing would be drawn on a pdf()/png()
# device opened beforehand.
plot(nn, rep = "best")
# Save the neural network plot as PDF
pdf("NeuralNet_plot.pdf", width = 8, height = 6)
plot(nn, rep = "best")
dev.off()  # close the device so the file is written out
# Save the neural network plot as PNG
png("NeuralNet_plot.png", width = 800, height = 600)
plot(nn, rep = "best")
dev.off()  # close the device so the file is written out