This document presents a complete end-to-end Eye Cancer Diagnosis analysis including:
Explanation: The dataset used in this analysis contains detailed medical records of eye cancer patients, collected from multiple countries. It includes demographic information (age, gender, country), clinical features (cancer type, laterality, stage at diagnosis), and treatment details such as surgery, radiation therapy, and chemotherapy. The dataset also provides patient outcomes and survival time, enabling both diagnostic and prognostic analysis. This rich combination of variables allows comprehensive exploration of patterns, risk factors, and treatment effectiveness in eye cancer cases.
library("readr")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("ggplot2")
# Load the raw patient records and normalise the column names.
# `data_path` holds the CSV location; the parsed table is stored in `raw`,
# which is the object the clean_names() step reads. (Previously the table was
# assigned to `data_path` and `raw` was never defined, so the script stopped
# with "object 'raw' not found".)
data_path <- "~/Downloads/eye_cancer_patients.csv"
raw <- read.csv(data_path)
data <- raw %>% janitor::clean_names()
# Coerce every column to the type the rest of the analysis works with.
data <- data %>%
  mutate(
    patient_id = as.character(patient_id),
    # Continuous measurements.
    across(
      c(age, radiation_therapy, chemotherapy, survival_time_months),
      as.numeric
    ),
    # Categorical variables.
    across(
      c(gender, cancer_type, laterality, stage_at_diagnosis,
        treatment_type, outcome_status, country),
      as.factor
    ),
    date_of_diagnosis = lubridate::ymd(date_of_diagnosis),
    # Yes/no flags: TRUE only for "true", "1" or "yes" (case-insensitive);
    # anything else, including NA, becomes FALSE.
    across(
      c(surgery_status, family_history),
      function(flag) tolower(as.character(flag)) %in% c("true","1","yes")
    ),
    # Missing genetic markers get an explicit "Unknown" level.
    genetic_markers = as.factor(
      ifelse(is.na(genetic_markers), "Unknown", genetic_markers)
    )
  )
# Derived columns, appended to `data` in this order.
# age_group: three bands from cut(); with the default right-closed intervals
# the bins are (-Inf, 18], (18, 65] and (65, Inf).
data$age_group <- cut(
  data$age,
  breaks = c(-Inf, 18, 65, Inf),
  labels = c("Children (0-18)", "Adults (19-65)", "Seniors (65+)")
)
# deceased_flag: 1 when the outcome is "Deceased", otherwise 0
# (NA outcomes stay NA).
data$deceased_flag <- as.numeric(data$outcome_status == "Deceased")
glimpse(data)
## Rows: 5,000
## Columns: 18
## $ patient_id <chr> "PID00001", "PID00002", "PID00003", "PID00004", "…
## $ age <dbl> 58, 15, 64, 33, 8, 41, 67, 26, 5, 23, 63, 22, 62,…
## $ gender <fct> F, Other, M, M, Other, Other, Other, Other, Other…
## $ cancer_type <fct> Retinoblastoma, Retinoblastoma, Retinoblastoma, M…
## $ laterality <fct> Left, Right, Bilateral, Right, Left, Bilateral, R…
## $ date_of_diagnosis <date> 2019-01-25, 2021-10-21, 2021-03-12, 2021-05-10, …
## $ stage_at_diagnosis <fct> Stage IV, Stage III, Stage IV, Stage II, Stage I,…
## $ treatment_type <fct> Radiation, Chemotherapy, Surgery, Radiation, Chem…
## $ surgery_status <lgl> FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FAL…
## $ radiation_therapy <dbl> 15, 69, 47, 36, 14, 11, 50, 46, 48, 42, 45, 65, 4…
## $ chemotherapy <dbl> 3, 6, 6, 6, 14, 17, 2, 13, 7, 2, 17, 3, 7, 15, 5,…
## $ outcome_status <fct> Deceased, In Remission, In Remission, Active, In …
## $ survival_time_months <dbl> 85, 10, 3, 40, 26, 15, 93, 9, 12, 116, 99, 25, 30…
## $ genetic_markers <fct> None, None, BRAF Mutation, None, BRAF Mutation, B…
## $ family_history <lgl> TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE…
## $ country <fct> UK, Japan, UK, Canada, USA, UK, Australia, German…
## $ age_group <fct> Adults (19-65), Children (0-18), Adults (19-65), …
## $ deceased_flag <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1…
# Modelling table: the predictor columns plus the 0/1 death indicator.
model_df <- select(
  data,
  age, gender, cancer_type, laterality, stage_at_diagnosis,
  treatment_type, surgery_status, radiation_therapy, chemotherapy,
  survival_time_months, genetic_markers, family_history,
  country, deceased_flag
)

# Numeric variables that get standardised.
num_vars <- c("age","radiation_therapy","chemotherapy","survival_time_months")

# model_scaled: model_df with each numeric variable centred and scaled by
# scale(); as.numeric() drops the matrix wrapper so the columns stay plain
# doubles.
model_scaled <- model_df %>%
  mutate(across(all_of(num_vars), function(col) as.numeric(scale(col))))
glimpse(model_scaled)
## Rows: 5,000
## Columns: 14
## $ age <dbl> 0.50095650, -1.15707379, 0.73230956, -0.46301460,…
## $ gender <fct> F, Other, M, M, Other, Other, Other, Other, Other…
## $ cancer_type <fct> Retinoblastoma, Retinoblastoma, Retinoblastoma, M…
## $ laterality <fct> Left, Right, Bilateral, Right, Left, Bilateral, R…
## $ stage_at_diagnosis <fct> Stage IV, Stage III, Stage IV, Stage II, Stage I,…
## $ treatment_type <fct> Radiation, Chemotherapy, Surgery, Radiation, Chem…
## $ surgery_status <lgl> FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FAL…
## $ radiation_therapy <dbl> -0.99425481, 1.63146947, 0.56172995, 0.02686019, …
## $ chemotherapy <dbl> -1.1701935, -0.6727609, -0.6727609, -0.6727609, 0…
## $ survival_time_months <dbl> 0.70287001, -1.46791120, -1.67051744, -0.59959871…
## $ genetic_markers <fct> None, None, BRAF Mutation, None, BRAF Mutation, B…
## $ family_history <lgl> TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE…
## $ country <fct> UK, Japan, UK, Canada, USA, UK, Australia, German…
## $ deceased_flag <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1…
# 1. Age Distribution (Stacked by Cancer Type)
# Patients per age group / cancer type, plus each cancer type's percentage
# share within its age group (used for the bar labels).
age_plot <- data %>%
  count(age_group, cancer_type, name = "count") %>%
  group_by(age_group) %>%
  mutate(pct = count / sum(count) * 100)

# Stacked bars of raw counts; each segment is labelled with its within-group
# percentage, centred vertically in the segment.
ggplot(age_plot, aes(x = age_group, y = count, fill = cancer_type)) +
  geom_col() +
  geom_text(
    aes(label = paste0(sprintf("%.1f", pct), "%")),
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  theme_minimal() +
  labs(
    title = "Age Distribution by Cancer Type",
    x = "Age Group",
    y = "Patient Count"
  )
# 2. Survival by Stage
# One box per stage at diagnosis, filled from the "Reds" Brewer palette.
ggplot(
  data,
  aes(
    x = stage_at_diagnosis,
    y = survival_time_months,
    fill = stage_at_diagnosis
  )
) +
  geom_boxplot() +
  scale_fill_brewer(palette = "Reds") +
  theme_minimal() +
  labs(title = "Survival Time by Stage", y = "Survival (months)")
Patients in earlier stages (Stage I–III) show similar survival patterns, with median survival around 55–60 months. Stage IV shows a slightly higher median, most likely because this dataset contains a number of long-surviving Stage IV patients; it does NOT mean that Stage IV patients live longer. Clinically, survival is expected to decrease as the cancer stage becomes more advanced, but this synthetic dataset shows only small variations between stages.
# 3. Treatment vs Outcome
# Patients per treatment type / outcome, plus each outcome's percentage share
# within its treatment type (used for the bar labels).
df <- data %>%
  count(treatment_type, outcome_status, name = "n") %>%
  group_by(treatment_type) %>%
  mutate(pct = n / sum(n) * 100)

# Stacked bars of raw counts; each segment is labelled with its
# within-treatment percentage.
ggplot(df, aes(x = treatment_type, y = n, fill = outcome_status)) +
  geom_col() +
  geom_text(
    aes(label = paste0(sprintf("%.1f", pct), "%")),
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  labs(title = "Treatment vs Outcome", x = "Treatment Type", y = "Count")
# K-means clustering on the four standardised numeric variables.
# Rows with a missing value are dropped because kmeans() does not accept NA.
km_data <- model_scaled %>%
  select(age, radiation_therapy, chemotherapy, survival_time_months) %>%
  na.omit()

# kmeans() starts from randomly chosen centres; fixing the seed makes the
# selected k and the cluster labels reproducible between runs.
set.seed(123)

# Find optimal k (silhouette).
# The pairwise distance matrix does not depend on k, so it is computed once
# instead of once per candidate.
km_dist <- dist(km_data)
k_candidates <- 2:6
sil <- vapply(k_candidates, function(k) {
  km <- kmeans(km_data, centers = k, nstart = 10)
  # Column 3 of the silhouette matrix is the per-observation silhouette width.
  mean(cluster::silhouette(km$cluster, km_dist)[, 3])
}, numeric(1))
best_k <- k_candidates[which.max(sil)]

# Final fit with more random starts, plotted by fviz_cluster().
km_fit <- kmeans(km_data, centers = best_k, nstart = 25)
factoextra::fviz_cluster(list(data = km_data, cluster = km_fit$cluster)) +
  ggtitle(paste0("K-means Clustering (k = ", best_k, ")"))
The K-means clustering algorithm divided the patients into five distinct groups using four numerical variables: age, radiation therapy, chemotherapy, and survival time. Each colored cluster represents patients with similar characteristics. The X and Y axes (Dim1 and Dim2) are PCA-derived dimensions that summarize the main patterns in the data.
Cluster 1 (Red) - Younger patients, Low treatment, Medium survival
Cluster 2 (Yellow/Orange) - Moderate age, Moderate treatment, Wide survival range
Cluster 3 (Green) - Older patients, High radiation & chemo, Lower survival
Cluster 4 (Blue) - Low radiation, Long survival, Possibly early-stage patients
Cluster 5 (Pink/Purple) - Mixed pattern, Medium treatment, Medium survival
# One-way ANOVA of survival time (months) across cancer types; summary()
# prints the F test table.
aov_fit <- aov(
  formula = survival_time_months ~ cancer_type,
  data = data
)
summary(aov_fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## cancer_type 2 330 165.2 0.138 0.871
## Residuals 4997 5966910 1194.1
# Boxplot of survival time for each cancer type; x-axis labels are rotated
# 30 degrees.
ggplot(
  data,
  aes(x = cancer_type, y = survival_time_months, fill = cancer_type)
) +
  geom_boxplot() +
  labs(title = "Survival by Cancer Type", y = "Survival (months)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
There is no statistically significant difference in survival time between the cancer types (Lymphoma, Melanoma, Retinoblastoma): the one-way ANOVA gives F = 0.138, p = 0.871.
The average survival time of all three cancer types is very similar: their boxplots overlap almost entirely, and there is no clear separation between the groups.
Dataset Link - https://www.kaggle.com/datasets/ak0212/eye-cancer-patient-records