NAME : DEEPIKA D

REG_NO : 2023MDTS07ALA034

ASSIGNMENT SUBMITTED TO : K A VENKATESH SIR

PROGRAM NAME : MSc DATA SCIENCE,3rd SEMESTER , ALLIANCE UNIVERSITY BANGLORE

DATE : 4TH SEP 2024 ,( 04/ 09/ 2024)

rpub link : https://rpubs.com/deepikaD/1216267

About - Haberman’s survival data

1.It Provides a valuable resource for studying survival outcomes in breast cancer patients.

2.It is mainly used for developing predictive models that can help in understanding the impact of age, year of surgery, and the number of positive lymph nodes on survival rates.

3.Characteristic information

4.Age:Patients age at the time of operation (numerical).

5.Year of Operation: recorded as the year minus 1900 (numerical).

6.Positive Axillary Nodes: Number of positive axillary lymph nodes detected during surgery (numerical).

7.Survival Status: The Survival result, where 1 indicates the patient survived 5 years or longer, and 2 indicates the patient died within 5 years.

8.In order to identify the variables affecting long-term survival,the study concentrated on the survival rates of patients who had breast cancer surgery.

9.The goal is to predict survial status based on the other variables

library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## 
## The following object is masked from 'package:dplyr':
## 
##     compute
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

Install and load required packages and dataset

if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench")
}

if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}

##EDA Exploratory Data Analysis

url <- "C:/Users/deepika.d/Downloads/haberman+s+survival/haberman.data"

dsurvive <- read.csv(url, header = FALSE)
head(dsurvive)
##   V1 V2 V3 V4
## 1 30 64  1  1
## 2 30 62  3  1
## 3 30 65  0  1
## 4 31 59  2  1
## 5 31 65  4  1
## 6 33 58 10  1
dim(dsurvive)
## [1] 306   4
column_names <- c("age", "year", "nodes", "survival_status")
colnames(dsurvive) <- column_names
nrow(dsurvive)
## [1] 306
summary(dsurvive)
##       age             year           nodes        survival_status
##  Min.   :30.00   Min.   :58.00   Min.   : 0.000   Min.   :1.000  
##  1st Qu.:44.00   1st Qu.:60.00   1st Qu.: 0.000   1st Qu.:1.000  
##  Median :52.00   Median :63.00   Median : 1.000   Median :1.000  
##  Mean   :52.46   Mean   :62.85   Mean   : 4.026   Mean   :1.265  
##  3rd Qu.:60.75   3rd Qu.:65.75   3rd Qu.: 4.000   3rd Qu.:2.000  
##  Max.   :83.00   Max.   :69.00   Max.   :52.000   Max.   :2.000
str(dsurvive)
## 'data.frame':    306 obs. of  4 variables:
##  $ age            : int  30 30 30 31 31 33 33 34 34 34 ...
##  $ year           : int  64 62 65 59 65 58 60 59 66 58 ...
##  $ nodes          : int  1 3 0 2 4 10 0 0 9 30 ...
##  $ survival_status: int  1 1 1 1 1 1 1 2 2 1 ...
is.null(dsurvive)
## [1] FALSE
colSums(is.na(dsurvive))
##             age            year           nodes survival_status 
##               0               0               0               0
unique_valuesList <- unique(dsurvive$Survival_status)
print(str(dsurvive))
## 'data.frame':    306 obs. of  4 variables:
##  $ age            : int  30 30 30 31 31 33 33 34 34 34 ...
##  $ year           : int  64 62 65 59 65 58 60 59 66 58 ...
##  $ nodes          : int  1 3 0 2 4 10 0 0 9 30 ...
##  $ survival_status: int  1 1 1 1 1 1 1 2 2 1 ...
## NULL

##Univariate and Bivariate Analysis

hist(dsurvive$age, main="Histogram of age", xlab="age")

hist(dsurvive$nodes, main="Histogram of nodes", xlab="nodes")

hist(dsurvive$year, main="Histogram of year", xlab="year")

plot(dsurvive$age, dsurvive$nodes, main="Scatter Plot", xlab="age", ylab="nodes")

barplot(table(dsurvive$nodes), 
        main = "Bar Plot for  Variable",
        xlab = "nodes",
        ylab = "Frequency",
        col = "skyblue",
        border = "black"
)

boxplot(age ~ survival_status,
        data = dsurvive,
        main = 'Survival Status by Age',
        xlab = 'Survival Status',
        ylab = 'Age', 
        col = 'yellow',
        border = 'black')

plot(dsurvive$age, dsurvive$nodes, 
     main = "Scatterplot doctors",
     xlab = "age",
     ylab = "nodes",
     col = "Green",
     pch = 19  
)

# Create a bar plot of nodes 
ggplot(dsurvive, aes(x = nodes )) + 
  geom_bar(fill = "steelblue") + 
  labs(title = "Distribution of nodes", 
       x = "nodes", y = "Count") 

pairs(dsurvive[, sapply(dsurvive, is.numeric)])

plot(density(dsurvive$year), main="Density Plot of year")

plot(density(dsurvive$survival_status), main="Density Plot of survival_status")

Preprocessing

# Checking for duplicates

sum(duplicated(dsurvive))
## [1] 17
unique(dsurvive$survival_Status)
## NULL
# Convert Survival_Status to a factor
dsurvive$survival_status <- factor(dsurvive$survival_status, levels = c(1, 2),
                                   labels = c("Survived", "Not Survived"))
dsurvive$survival_status
##   [1] Survived     Survived     Survived     Survived     Survived    
##   [6] Survived     Survived     Not Survived Not Survived Survived    
##  [11] Survived     Survived     Survived     Survived     Survived    
##  [16] Survived     Survived     Survived     Survived     Survived    
##  [21] Survived     Survived     Survived     Survived     Not Survived
##  [26] Survived     Survived     Survived     Survived     Survived    
##  [31] Survived     Survived     Survived     Survived     Not Survived
##  [36] Survived     Survived     Survived     Survived     Survived    
##  [41] Survived     Survived     Survived     Not Survived Not Survived
##  [46] Not Survived Survived     Survived     Survived     Survived    
##  [51] Survived     Survived     Survived     Not Survived Not Survived
##  [56] Survived     Survived     Survived     Survived     Survived    
##  [61] Survived     Survived     Not Survived Not Survived Not Survived
##  [66] Not Survived Survived     Survived     Survived     Survived    
##  [71] Survived     Survived     Survived     Not Survived Not Survived
##  [76] Not Survived Survived     Survived     Survived     Survived    
##  [81] Not Survived Not Survived Not Survived Survived     Survived    
##  [86] Survived     Survived     Survived     Survived     Not Survived
##  [91] Not Survived Not Survived Not Survived Survived     Survived    
##  [96] Survived     Not Survived Not Survived Not Survived Survived    
## [101] Survived     Survived     Survived     Survived     Survived    
## [106] Survived     Survived     Not Survived Not Survived Not Survived
## [111] Survived     Survived     Survived     Survived     Not Survived
## [116] Not Survived Survived     Survived     Survived     Survived    
## [121] Survived     Survived     Survived     Survived     Not Survived
## [126] Not Survived Survived     Survived     Survived     Survived    
## [131] Survived     Survived     Survived     Survived     Survived    
## [136] Survived     Not Survived Not Survived Survived     Survived    
## [141] Survived     Survived     Not Survived Not Survived Not Survived
## [146] Not Survived Survived     Survived     Survived     Survived    
## [151] Survived     Survived     Survived     Survived     Survived    
## [156] Survived     Not Survived Not Survived Not Survived Not Survived
## [161] Not Survived Not Survived Survived     Survived     Survived    
## [166] Survived     Survived     Not Survived Not Survived Not Survived
## [171] Not Survived Survived     Survived     Survived     Survived    
## [176] Survived     Survived     Survived     Survived     Survived    
## [181] Not Survived Not Survived Survived     Survived     Survived    
## [186] Survived     Survived     Survived     Survived     Survived    
## [191] Not Survived Not Survived Survived     Survived     Survived    
## [196] Survived     Survived     Not Survived Not Survived Not Survived
## [201] Survived     Survived     Survived     Survived     Survived    
## [206] Survived     Survived     Survived     Survived     Survived    
## [211] Survived     Survived     Survived     Survived     Survived    
## [216] Not Survived Survived     Survived     Survived     Survived    
## [221] Survived     Survived     Survived     Not Survived Not Survived
## [226] Survived     Survived     Survived     Survived     Not Survived
## [231] Not Survived Not Survived Survived     Survived     Survived    
## [236] Survived     Survived     Survived     Not Survived Not Survived
## [241] Not Survived Survived     Survived     Survived     Survived    
## [246] Not Survived Survived     Survived     Survived     Survived    
## [251] Survived     Survived     Survived     Survived     Survived    
## [256] Survived     Survived     Survived     Not Survived Not Survived
## [261] Not Survived Not Survived Survived     Survived     Survived    
## [266] Survived     Survived     Survived     Not Survived Not Survived
## [271] Survived     Survived     Survived     Not Survived Not Survived
## [276] Survived     Survived     Survived     Survived     Survived    
## [281] Survived     Not Survived Survived     Survived     Survived    
## [286] Not Survived Not Survived Survived     Survived     Survived    
## [291] Survived     Survived     Survived     Not Survived Survived    
## [296] Survived     Survived     Survived     Survived     Not Survived
## [301] Survived     Survived     Survived     Survived     Not Survived
## [306] Not Survived
## Levels: Survived Not Survived
str(dsurvive)
## 'data.frame':    306 obs. of  4 variables:
##  $ age            : int  30 30 30 31 31 33 33 34 34 34 ...
##  $ year           : int  64 62 65 59 65 58 60 59 66 58 ...
##  $ nodes          : int  1 3 0 2 4 10 0 0 9 30 ...
##  $ survival_status: Factor w/ 2 levels "Survived","Not Survived": 1 1 1 1 1 1 1 2 2 1 ...

Model Selection

Architecture

Problem Solving

1.Above dataset comes under binary classification problem where the goal is to predict the survial_status of patients based on certain features given

2.As the dataset is simple we can go with feedforward neural network

  1. Input layers : 3 neurons (age, year and nodes)

4.Hidden layer : one hidden layer

5.Activation functions used : Sigmoid functions

6.output layer : single neuron with sigmoid activation functions

# Convert the target variable to numeric for neuralnet
dsurvive$survival_status <- as.numeric(dsurvive$survival_status) - 1
dsurvive$survival_status
##   [1] 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
##  [38] 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1
##  [75] 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0
## [112] 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0
## [149] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0
## [186] 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## [223] 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
## [260] 1 1 1 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0
## [297] 0 0 0 1 0 0 0 0 1 1

ANN

formula <- survival_status ~ age + year + nodes
nn_model <- neuralnet(formula, data = dsurvive, hidden = c(4), linear.output = FALSE, 
                      act.fct = "logistic", learningrate = 0.01)
#plot(nn_model)
predictions <- predict(nn_model, dsurvive)
predictions_binary <- ifelse(predictions > 0.5, 1, 0)
table(Predicted = predictions_binary, Actual = dsurvive$survival_status)
##          Actual
## Predicted   0   1
##         0 225  80
##         1   0   1
# Ensure the correct case for the column name
accuracy <- mean(predictions_binary == dsurvive$survival_status)

# Print the accuracy
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 73.86 %"

##Conclusion

1.This complete procedure offers a basic neural network application workflow for the R Haberman’s Survival dataset.

2.The steps are as follows: define the model, train the network, make predictions, and use accuracy as a metric to assess the model’s performance.

3.It demonstrates how to use the neuralnet package to handle a binary classification task and display the outcomes.