Assignment submitted to K A Venkatesh

MSc. Data Science - 3rd Semester, Alliance University

Introduction

This document presents the analysis of survival data using various methods including data summarization, visualization, and neural network modeling. The dataset used is haberman_data.txt.

The dataset contains 4 columns:

  1. Age of patient at time of operation (numerical)
  2. Patient’s year of operation (year - 1900, numerical)
  3. Number of positive axillary nodes detected (numerical)
  4. Survival status (class attribute):
    • 1 = the patient survived 5 years or longer
    • 2 = the patient died within 5 years

Load dataset

survival_df <- read.table("haberman_data.txt", sep = ',')
names(survival_df) <- c("Age", "Year", "Num_Axillary", "Survival_Status")

Convert Survival_Status to a factor

survival_df$Survival_Status <- factor(survival_df$Survival_Status)

Display the structure and summary of the data

str(survival_df)
## 'data.frame':    306 obs. of  4 variables:
##  $ Age            : int  30 30 30 31 31 33 33 34 34 34 ...
##  $ Year           : int  64 62 65 59 65 58 60 59 66 58 ...
##  $ Num_Axillary   : int  1 3 0 2 4 10 0 0 9 30 ...
##  $ Survival_Status: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 2 2 1 ...
summary(survival_df)
##       Age             Year        Num_Axillary    Survival_Status
##  Min.   :30.00   Min.   :58.00   Min.   : 0.000   1:225          
##  1st Qu.:44.00   1st Qu.:60.00   1st Qu.: 0.000   2: 81          
##  Median :52.00   Median :63.00   Median : 1.000                  
##  Mean   :52.46   Mean   :62.85   Mean   : 4.026                  
##  3rd Qu.:60.75   3rd Qu.:65.75   3rd Qu.: 4.000                  
##  Max.   :83.00   Max.   :69.00   Max.   :52.000

Group by Year and summarize survival and death counts

yearwise <- survival_df %>%
  group_by(Year) %>%
  summarize(Total_survived = sum(Survival_Status == 1),
            Total_deaths = sum(Survival_Status == 2))

Convert to long format for plotting

yearwise_long <- pivot_longer(yearwise, cols = c(Total_survived, Total_deaths),
                              names_to = "Category", values_to = "Count")

Plot Yearly Survival and Death Counts

ggplot(yearwise_long, aes(x = Year, y = Count, color = Category)) +
  geom_path(size = 1.2) +
  labs(title = "Yearly Survival and Death Counts",
       subtitle = "Comparison of Total Survived vs. Total Deaths",
       x = "Year",
       y = "Count",
       color = "Category") +
  scale_color_manual(values = c("Total_survived" = "blue", "Total_deaths" = "red")) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Bar Plot for Yearly Survival and Death Counts

ggplot(yearwise_long, aes(x = Year, y = Count, fill = Category)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Yearly Survival and Death Counts",
       subtitle = "Comparison of Total Survived vs. Total Deaths",
       x = "Year",
       y = "Count",
       fill = "Category") +
  scale_fill_manual(values = c("Total_survived" = "blue", "Total_deaths" = "red")) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))

Group by Age and summarize survival and death counts

agewise <- survival_df %>%
  group_by(Age) %>%
  summarize(Total_survived = sum(Survival_Status == 1),
            Total_deaths = sum(Survival_Status == 2))

Convert to long format for plotting

agewise_long <- pivot_longer(agewise, cols = c(Total_survived, Total_deaths),
                             names_to = "Category", values_to = "Count")

Bar Plot for Age-Wise Survival and Death Counts

ggplot(agewise_long, aes(x = Age, y = Count, fill = Category)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Age Survival and Death Counts",
       subtitle = "Comparison of Total Survived vs. Total Deaths",
       x = "Age",
       y = "Count",
       fill = "Category") +
  scale_fill_manual(values = c("Total_survived" = "blue", "Total_deaths" = "red")) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))

Histogram of Yearly Distribution

ggplot(survival_df, aes(Year)) +
  geom_histogram(binwidth = 0.5, col = 'blue', fill = "blue") +
  geom_text(stat = 'bin', aes(label = ..count..), vjust = -0.5, binwidth = 0.5, color = "black") +
  labs(title = "Yearly Distribution of Survival Data",
       x = "Year",
       y = "Count") +
  theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Histogram of Age Distribution

ggplot(survival_df, aes(Age)) +
  geom_histogram(binwidth = 0.5, col = 'blue', fill = "blue") +
  geom_text(stat = 'bin', aes(label = ..count..), vjust = -0.5, binwidth = 0.5, color = "black") +
  labs(title = "Age Distribution of Survival Data",
       x = "Age",
       y = "Count") +
  theme_minimal()

Convert data to numeric and compute correlation

sr <- survival_df
sr <- sapply(sr, as.numeric)
cor_matrix <- cor(sr)

# Plot correlation matrix
corrplot(cor_matrix, method = "number")

# Split the data into training and test sets

set.seed(33)  
index <- sample(1:nrow(survival_df), round(0.9 * nrow(survival_df)))
trainset <- survival_df[index, ]
testset <- survival_df[-index, ]

Define and train the neural network model

network <- c(6)
mod.sur <- neuralnet(Survival_Status ~ ., data = trainset, hidden = network,
                     threshold = 0.01, stepmax = 1e8)
plot(mod.sur)

Predict on the test set

actual <- testset$Survival_Status
levels(actual) <- c("yes", "no")

thres <- function(x) {
  if (x[1] > x[2]) {
    return('yes')
  } else if (x[2] > x[1]) {
    return('no')
}}

Predict and convert to matrix

pred <- predict(mod.sur, testset[,-length(testset)])
pred <- matrix(pred, nrow = nrow(testset), ncol = 2)
# threshold function
predicted <- apply(pred, 1, thres)

# predicted values
print(predicted)
##  [1] "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "no"  "no" 
## [13] "yes" "no"  "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes"
## [25] "yes" "yes" "yes" "yes" "yes" "yes" "no"
# Confusion matrix
cm <- confusionMatrix(factor(predicted), factor(actual))
## Warning in confusionMatrix.default(factor(predicted), factor(actual)): Levels
## are not in the same order for reference and data. Refactoring data to match.
print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction yes no
##        yes  20  7
##        no    1  3
##                                           
##                Accuracy : 0.7419          
##                  95% CI : (0.5539, 0.8814)
##     No Information Rate : 0.6774          
##     P-Value [Acc > NIR] : 0.2879          
##                                           
##                   Kappa : 0.2994          
##                                           
##  Mcnemar's Test P-Value : 0.0771          
##                                           
##             Sensitivity : 0.9524          
##             Specificity : 0.3000          
##          Pos Pred Value : 0.7407          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.6774          
##          Detection Rate : 0.6452          
##    Detection Prevalence : 0.8710          
##       Balanced Accuracy : 0.6262          
##                                           
##        'Positive' Class : yes             
## 

Conclusion

This analysis of the haberman_data.txt dataset provided valuable insights into survival patterns over time and across different ages. By examining year-wise and age-wise survival and death counts, as well as distribution histograms, we identified key trends and anomalies. The correlation analysis highlighted relationships between variables, while the neural network model demonstrated how machine learning can be applied to predict survival outcomes. Overall, the analysis revealed useful patterns and model performance insights, suggesting potential for further refinement and exploration to enhance predictive accuracy and data understanding.