# Pin a CRAN mirror so non-interactive runs never prompt for one.
options(repos = c(CRAN = "https://cloud.r-project.org/"))

# Every package this script attaches with library(). Only the ones that are
# not installed yet are downloaded, so repeated runs do no network work.
required_packages <- c(
  "mlbench", "corrplot", "caret", "neuralnet", "ggplot2", "reshape2", "pROC"
)
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Location of the headerless Haberman CSV. The default keeps the original
# local path; set the HABERMAN_DATA environment variable to point at another
# file (or an http/https/ftp URL) without editing the script.
url <- Sys.getenv(
  "HABERMAN_DATA",
  unset = "C:\\Users\\Naman\\Downloads\\haberman.data"
)

# Fail early with a readable message instead of read.csv()'s connection error.
if (!grepl("^(https?|ftp)://", url) && !file.exists(url)) {
  stop(
    "Haberman data file not found: ", url,
    " (set HABERMAN_DATA to its location)",
    call. = FALSE
  )
}

# The file has no header row, so the column names are supplied here.
column_names <- c("age", "year", "nodes", "survival_status")
dsurvive <- read.csv(url, header = FALSE, col.names = column_names)
library(mlbench)
library(corrplot)
## corrplot 0.95 loaded
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Install neuralnet only when it is missing, matching the guarded installs
# used for the other packages; an unconditional install.packages() would
# re-download the package on every run.
if (!requireNamespace("neuralnet", quietly = TRUE)) {
  install.packages("neuralnet")
}
library(neuralnet)
library(ggplot2)
library(reshape2)

# EDA ----

# Results are printed explicitly so they also appear when the script is run
# through source(), which does not auto-print top-level values by default.

# 1. Basic Data Summary
print("Basic Data Summary:")
## [1] "Basic Data Summary:"
print(summary(dsurvive))
##       age             year           nodes        survival_status
##  Min.   :30.00   Min.   :58.00   Min.   : 0.000   Min.   :1.000  
##  1st Qu.:44.00   1st Qu.:60.00   1st Qu.: 0.000   1st Qu.:1.000  
##  Median :52.00   Median :63.00   Median : 1.000   Median :1.000  
##  Mean   :52.46   Mean   :62.85   Mean   : 4.026   Mean   :1.265  
##  3rd Qu.:60.75   3rd Qu.:65.75   3rd Qu.: 4.000   3rd Qu.:2.000  
##  Max.   :83.00   Max.   :69.00   Max.   :52.000   Max.   :2.000
# 2. Structure of the dataset
str(dsurvive)
## 'data.frame':    306 obs. of  4 variables:
##  $ age            : int  30 30 30 31 31 33 33 34 34 34 ...
##  $ year           : int  64 62 65 59 65 58 60 59 66 58 ...
##  $ nodes          : int  1 3 0 2 4 10 0 0 9 30 ...
##  $ survival_status: int  1 1 1 1 1 1 1 2 2 1 ...
# 3. Check class distribution
print(table(dsurvive$survival_status))
## 
##   1   2 
## 225  81
# 4. Correlation Analysis
# Pairwise correlations between the three feature columns (age, year, nodes),
# drawn as a colored upper triangle with the coefficients overlaid.
correlation_matrix <- cor(dsurvive[, 1:3])
corrplot(correlation_matrix,
         method = "color",
         type = "upper",
         addCoef.col = "black",
         tl.col = "black",
         tl.srt = 45,
         title = "Feature Correlation Matrix",
         # corrplot's default margins are all zero, which clips the title;
         # reserve space above the plot for it.
         mar = c(0, 0, 2, 0))

# 5. Visualizations

# Box plots for numerical variables
# One box plot per feature column on a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so the grid layout does not leak
# into plots drawn later in the script.
old_par <- par(mfrow = c(2, 2))
for (i in 1:3) {
  boxplot(dsurvive[, i], main = column_names[i], col = "lightblue")
}
par(old_par)

# Density plots of each numerical variable
# (computed over all rows; the curves are not split by survival status)
# Drawn on a 2 x 2 grid; the previous par() settings are restored afterwards
# so the layout does not leak into plots drawn later in the script.
old_par <- par(mfrow = c(2, 2))
for (i in 1:3) {
  plot(density(dsurvive[, i]),
       main = paste("Density Plot of", column_names[i]),
       col = "blue")
}
par(old_par)

# Scatter plot matrix of the three feature columns. Filled circles (pch 21)
# take their fill from survival_status used as an index into the palette:
# code 1 -> "red", code 2 -> "blue".
pairs(
  x = dsurvive[, 1:3],
  main = "Scatter Plot Matrix",
  pch = 21,
  bg = c("red", "blue")[dsurvive[["survival_status"]]]
)

# ANN Analysis ----

# 1. Data Preprocessing

# Convert survival_status to factor
dsurvive$survival_status <- as.factor(dsurvive$survival_status)
# Scale the features: center and scale columns 1:3 (age, year, nodes), then
# re-attach the untouched outcome column.
# NOTE(review): the centering/scaling parameters are estimated from the full
# dataset, i.e. before the train/test split that follows, so test rows
# influence them. Consider fitting preProcess() on the training rows only.
preProc <- preProcess(dsurvive[, 1:3], method = c("center", "scale"))
dsurvive_scaled <- predict(preProc, dsurvive[, 1:3])
dsurvive_scaled$survival_status <- dsurvive$survival_status

# 2. Split the data
# 70% of the rows go to training and the rest to testing; the partition is
# drawn on survival_status and the seed makes the sampled rows reproducible.
set.seed(123)
trainIndex <- createDataPartition(
  y = dsurvive_scaled[["survival_status"]],
  p = 0.7,
  list = FALSE
)
train_data <- dsurvive_scaled[trainIndex, ]
test_data <- dsurvive_scaled[-trainIndex, ]

# 3. Create and train the neural network

# Convert survival_status to numeric (0/1)
# as.numeric() on a factor returns its integer level codes (1, 2), so
# subtracting 1 maps the first level to 0 and the second level to 1.
train_data$survival_status <- as.numeric(train_data$survival_status) - 1
test_data$survival_status <- as.numeric(test_data$survival_status) - 1

# Define and train the neural network
# The seed is set immediately before training so the run is repeatable.
set.seed(123)
nn_model <- neuralnet(survival_status ~ age + year + nodes,
                      data = train_data,
                      hidden = c(5, 3),  # Two hidden layers with 5 and 3 neurons
                      linear.output = FALSE,  # apply the activation function to the output neuron
                      threshold = 0.01)  # stopping threshold passed to neuralnet()

# 4. Visualize the neural network
# rep = "best" asks plot() for the training repetition with the smallest error.
plot(nn_model, rep = "best")

# 5. Make predictions
# Feed the held-out feature columns through the trained network, then label a
# row 1 when its network output exceeds 0.5 and 0 otherwise.
# NOTE(review): neuralnet documents compute() as deprecated in favour of
# predict(); it is kept here because later code reads predictions$net.result.
predictions <- neuralnet::compute(
  nn_model,
  test_data[, c("age", "year", "nodes")]
)
predicted_classes <- ifelse(
  test = predictions$net.result > 0.5,
  yes = 1,
  no = 0
)

# 6. Evaluate the model

# Confusion Matrix
# Both sides are converted to factors with the full 0/1 level set so the table
# is always 2 x 2, even when the model predicts only one class; otherwise the
# table would lose a column and indexing [2, 2] on it would fail.
conf_matrix <- table(
  Actual = factor(test_data$survival_status, levels = c(0, 1)),
  Predicted = factor(predicted_classes, levels = c(0, 1))
)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix)
##       Predicted
## Actual  0  1
##      0 61  6
##      1 21  3

# Calculate metrics
# Computed directly from the actual and predicted 0/1 labels (class 1 is the
# positive class) so the results do not depend on the confusion table having
# a full 2 x 2 shape.
actual_positive <- test_data$survival_status == 1
predicted_positive <- as.vector(predicted_classes) == 1
true_positives <- sum(actual_positive & predicted_positive)

# Ratio that yields NA instead of NaN (or an error) when the denominator is
# zero or missing, e.g. precision when no row was predicted positive.
safe_ratio <- function(numerator, denominator) {
  if (isTRUE(denominator > 0)) numerator / denominator else NA_real_
}

accuracy <- mean(actual_positive == predicted_positive)
precision <- safe_ratio(true_positives, sum(predicted_positive))
recall <- safe_ratio(true_positives, sum(actual_positive))
f1_score <- safe_ratio(2 * precision * recall, precision + recall)

# Show the metrics; they were previously computed but never reported.
print(round(c(accuracy = accuracy,
              precision = precision,
              recall = recall,
              f1_score = f1_score), 3))

# Additional visualization: ROC Curve

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# The network output is a one-column matrix, which pROC warns is a deprecated
# predictor type, so it is flattened to a numeric vector. Levels and direction
# are stated explicitly (0 = control, 1 = case, controls score lower) rather
# than auto-detected, so the AUC is never silently flipped above 0.5.
roc_obj <- roc(
  response = test_data$survival_status,
  predictor = as.numeric(predictions$net.result),
  levels = c(0, 1),
  direction = "<"
)
plot(roc_obj, main = "ROC Curve")

# Namespace-qualified because the result is stored under the same name as the
# function, which would otherwise make the call ambiguous to a reader.
auc <- pROC::auc(roc_obj)
print(paste("AUC:", round(as.numeric(auc), 3)))
## [1] "AUC: 0.532"