The dataset consists of the following columns. Age: age of the patient at the time of the operation (in years). Operation Year: the year in which the operation was performed. Auxillary Nodes: number of positive axillary lymph nodes detected (an indicator of cancer spread). Survival Status: the target variable, indicating whether the patient survived for 5 years or more after the operation. It is a binary outcome coded as 1 (the patient survived 5 years or longer) or 2 (the patient did not survive 5 years).
#IMPORTING DATASET
library(dplyr)
library(ggplot2)
library(neuralnet)
df<-read.csv("/Users/lavasai/Downloads/haberman.csv")
#ASSIGNING COLUMN NAMES
names(df)<-c("Age","OperationYear","AuxillaryNode","SurvivalStatus")
#UNDERSTANDING THE DATASET
str(df)
## 'data.frame': 305 obs. of 4 variables:
## $ Age : int 30 30 31 31 33 33 34 34 34 34 ...
## $ OperationYear : int 62 65 59 65 58 60 59 66 58 60 ...
## $ AuxillaryNode : int 3 0 2 4 10 0 0 9 30 1 ...
## $ SurvivalStatus: int 1 1 1 1 1 1 2 2 1 1 ...
head(df)
## Age OperationYear AuxillaryNode SurvivalStatus
## 1 30 62 3 1
## 2 30 65 0 1
## 3 31 59 2 1
## 4 31 65 4 1
## 5 33 58 10 1
## 6 33 60 0 1
names(df)
## [1] "Age" "OperationYear" "AuxillaryNode" "SurvivalStatus"
sum(is.null(df))
## [1] 0
#VISUALISING THE DATASET
# Distribution of Age (continuous)
ggplot(df, aes(x = Age)) +
geom_histogram(binwidth = 5, fill = "blue", color = "white") +
labs(title = "Age Distribution", x = "Age", y = "Frequency")
# Distribution of Auxillary Nodes (continuous)
ggplot(df, aes(x = AuxillaryNode)) +
geom_histogram(binwidth = 5, fill = "red", color = "white") +
labs(title = "Auxillary Nodes Distribution", x = "Number of Nodes", y = "Frequency")
# Distribution of Operation Year (discrete, so we use geom_bar)
ggplot(df, aes(x = as.factor(OperationYear))) +
geom_bar(fill = "green", color = "white") +
labs(title = "Operation Year Distribution", x = "Year of Operation", y = "Frequency")
# Survival status distribution (discrete, use geom_bar)
ggplot(df, aes(x = SurvivalStatus)) +
geom_bar(fill = "purple", color = "white") +
labs(title = "Survival Status Distribution", x = "Survival Status", y = "Frequency")
# Pairwise scatter plots
pairs(df[,1:3], col = ifelse(df$SurvivalStatus == 1, "blue", "red"),
main = "Pairwise Scatter Plots", pch = 19)
#NORMALISATION
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
numeric_columns <- df[, sapply(df, is.numeric)]
df_norm <- as.data.frame(lapply(numeric_columns, normalize))
# Convert SurvivalStatus to numeric (binary: 0 or 1)
df_norm$SurvivalStatus <- as.numeric(df$SurvivalStatus) - 1
# Ensure it's binary (should only have 0s and 1s)
table(df_norm$SurvivalStatus)
##
## 0 1
## 224 81
# Check structure
str(df_norm)
## 'data.frame': 305 obs. of 4 variables:
## $ Age : num 0 0 0.0189 0.0189 0.0566 ...
## $ OperationYear : num 0.3636 0.6364 0.0909 0.6364 0 ...
## $ AuxillaryNode : num 0.0577 0 0.0385 0.0769 0.1923 ...
## $ SurvivalStatus: num 0 0 0 0 0 0 1 1 0 0 ...
#NEURAL NETWORK
# Split the data into training and test sets
set.seed(123)
index <- sample(1:nrow(df_norm), round(0.80 * nrow(df_norm)))
trainset <- df_norm[index,]
testset <- df_norm[-index,]
# Train the neural network
nn <- neuralnet(SurvivalStatus ~ Age + OperationYear + AuxillaryNode,
data = trainset, hidden = 3,
linear.output = FALSE,
act.fct = "tanh",
err.fct = "ce",
likelihood = TRUE)
## Warning in log(x): NaNs produced
## Warning: 'err.fct' does not fit 'data' or 'act.fct'
# Plot the neural network
plot(nn)
# Save the neural network plot as PDF
pdf("NeuralNet_plot.pdf", width = 8, height = 6)
plot(nn)
# Save the neural network plot as PNG
png("NeuralNet_plot.png", width = 800, height = 600)
plot(nn)