This document presents the analysis of survival data using various
methods including data summarization, visualization, and neural network
modeling. The dataset used is haberman_data.txt.
The dataset contains 4 columns:
survival_df <- read.table("haberman_data.txt", sep = ',')
names(survival_df) <- c("Age", "Year", "Num_Axillary", "Survival_Status")
survival_df$Survival_Status <- factor(survival_df$Survival_Status)
str(survival_df)
## 'data.frame': 306 obs. of 4 variables:
## $ Age : int 30 30 30 31 31 33 33 34 34 34 ...
## $ Year : int 64 62 65 59 65 58 60 59 66 58 ...
## $ Num_Axillary : int 1 3 0 2 4 10 0 0 9 30 ...
## $ Survival_Status: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 2 2 1 ...
summary(survival_df)
## Age Year Num_Axillary Survival_Status
## Min. :30.00 Min. :58.00 Min. : 0.000 1:225
## 1st Qu.:44.00 1st Qu.:60.00 1st Qu.: 0.000 2: 81
## Median :52.00 Median :63.00 Median : 1.000
## Mean :52.46 Mean :62.85 Mean : 4.026
## 3rd Qu.:60.75 3rd Qu.:65.75 3rd Qu.: 4.000
## Max. :83.00 Max. :69.00 Max. :52.000
yearwise <- survival_df %>%
group_by(Year) %>%
summarize(Total_survived = sum(Survival_Status == 1),
Total_deaths = sum(Survival_Status == 2))
yearwise_long <- pivot_longer(yearwise, cols = c(Total_survived, Total_deaths),
names_to = "Category", values_to = "Count")
ggplot(yearwise_long, aes(x = Year, y = Count, color = Category)) +
geom_path(size = 1.2) +
labs(title = "Yearly Survival and Death Counts",
subtitle = "Comparison of Total Survived vs. Total Deaths",
x = "Year",
y = "Count",
color = "Category") +
scale_color_manual(values = c("Total_survived" = "blue", "Total_deaths" = "red")) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(yearwise_long, aes(x = Year, y = Count, fill = Category)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Yearly Survival and Death Counts",
subtitle = "Comparison of Total Survived vs. Total Deaths",
x = "Year",
y = "Count",
fill = "Category") +
scale_fill_manual(values = c("Total_survived" = "blue", "Total_deaths" = "red")) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
agewise <- survival_df %>%
group_by(Age) %>%
summarize(Total_survived = sum(Survival_Status == 1),
Total_deaths = sum(Survival_Status == 2))
agewise_long <- pivot_longer(agewise, cols = c(Total_survived, Total_deaths),
names_to = "Category", values_to = "Count")
ggplot(agewise_long, aes(x = Age, y = Count, fill = Category)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Age Survival and Death Counts",
subtitle = "Comparison of Total Survived vs. Total Deaths",
x = "Age",
y = "Count",
fill = "Category") +
scale_fill_manual(values = c("Total_survived" = "blue", "Total_deaths" = "red")) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
ggplot(survival_df, aes(Year)) +
geom_histogram(binwidth = 0.5, col = 'blue', fill = "blue") +
geom_text(stat = 'bin', aes(label = ..count..), vjust = -0.5, binwidth = 0.5, color = "black") +
labs(title = "Yearly Distribution of Survival Data",
x = "Year",
y = "Count") +
theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(survival_df, aes(Age)) +
geom_histogram(binwidth = 0.5, col = 'blue', fill = "blue") +
geom_text(stat = 'bin', aes(label = ..count..), vjust = -0.5, binwidth = 0.5, color = "black") +
labs(title = "Age Distribution of Survival Data",
x = "Age",
y = "Count") +
theme_minimal()
sr <- survival_df
sr <- sapply(sr, as.numeric)
cor_matrix <- cor(sr)
# Plot correlation matrix
corrplot(cor_matrix, method = "number")
# Split the data into training and test sets
set.seed(33)
index <- sample(1:nrow(survival_df), round(0.9 * nrow(survival_df)))
trainset <- survival_df[index, ]
testset <- survival_df[-index, ]
network <- c(6)
mod.sur <- neuralnet(Survival_Status ~ ., data = trainset, hidden = network,
threshold = 0.01, stepmax = 1e8)
plot(mod.sur)
actual <- testset$Survival_Status
levels(actual) <- c("yes", "no")
thres <- function(x) {
if (x[1] > x[2]) {
return('yes')
} else if (x[2] > x[1]) {
return('no')
}}
pred <- predict(mod.sur, testset[,-length(testset)])
pred <- matrix(pred, nrow = nrow(testset), ncol = 2)
# threshold function
predicted <- apply(pred, 1, thres)
# predicted values
print(predicted)
## [1] "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "no" "no"
## [13] "yes" "no" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes" "yes"
## [25] "yes" "yes" "yes" "yes" "yes" "yes" "no"
# Confusion matrix
cm <- confusionMatrix(factor(predicted), factor(actual))
## Warning in confusionMatrix.default(factor(predicted), factor(actual)): Levels
## are not in the same order for reference and data. Refactoring data to match.
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 20 7
## no 1 3
##
## Accuracy : 0.7419
## 95% CI : (0.5539, 0.8814)
## No Information Rate : 0.6774
## P-Value [Acc > NIR] : 0.2879
##
## Kappa : 0.2994
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 0.9524
## Specificity : 0.3000
## Pos Pred Value : 0.7407
## Neg Pred Value : 0.7500
## Prevalence : 0.6774
## Detection Rate : 0.6452
## Detection Prevalence : 0.8710
## Balanced Accuracy : 0.6262
##
## 'Positive' Class : yes
##
This analysis of the haberman_data.txt dataset provided
valuable insights into survival patterns over time and across different
ages. By examining year-wise and age-wise survival and death counts, as
well as distribution histograms, we identified key trends and anomalies.
The correlation analysis highlighted relationships between variables,
while the neural network model demonstrated how machine learning can be
applied to predict survival outcomes. Overall, the analysis revealed
useful patterns and model performance insights, suggesting potential for
further refinement and exploration to enhance predictive accuracy and
data understanding.