This RMarkdown file contains the report of the data analysis done for the project on building and deploying a stroke prediction model in R. It contains analysis such as data exploration, summary statistics and building the prediction models. The final report was completed on Tue Sep 23 19:11:58 2025.
Data Description:
According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This data set is used to predict whether a patient is likely to get stroke based on the input parameters like gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
stroke=read_csv('C:/Users/Muthumeena/Downloads/healthcare-dataset-stroke-data.csv')
## Rows: 5110 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): gender, ever_married, work_type, Residence_type, bmi, smoking_status
## dbl (6): id, age, hypertension, heart_disease, avg_glucose_level, stroke
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(stroke)
## Rows: 5,110
## Columns: 12
## $ id <dbl> 9046, 51676, 31112, 60182, 1665, 56669, 53882, 10434…
## $ gender <chr> "Male", "Female", "Male", "Female", "Female", "Male"…
## $ age <dbl> 67, 61, 80, 49, 79, 81, 74, 69, 59, 78, 81, 61, 54, …
## $ hypertension <dbl> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1…
## $ heart_disease <dbl> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0…
## $ ever_married <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No…
## $ work_type <chr> "Private", "Self-employed", "Private", "Private", "S…
## $ Residence_type <chr> "Urban", "Rural", "Rural", "Urban", "Rural", "Urban"…
## $ avg_glucose_level <dbl> 228.69, 202.21, 105.92, 171.23, 174.12, 186.21, 70.0…
## $ bmi <chr> "36.6", "N/A", "32.5", "34.4", "24", "29", "27.4", "…
## $ smoking_status <chr> "formerly smoked", "never smoked", "never smoked", "…
## $ stroke <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
stroke=stroke%>%select(-id)
stroke=stroke%>%
mutate(
bmi=as.numeric(bmi),
gender=as.factor(gender),
hypertension=as.factor(hypertension),
heart_disease=as.factor(heart_disease),
ever_married=as.factor(ever_married),
work_type=as.factor(work_type),
Residence_type=as.factor(Residence_type),
smoking_status=as.factor(smoking_status),
stroke=factor(stroke,levels=c(0,1),labels=c('No','Yes'))
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `bmi = as.numeric(bmi)`.
## Caused by warning:
## ! NAs introduced by coercion
stroke$bmi[is.na(stroke$bmi)]=median(stroke$bmi,na.rm=TRUE)
##Describe and explore the data
glimpse(stroke)
## Rows: 5,110
## Columns: 11
## $ gender <fct> Male, Female, Male, Female, Female, Male, Male, Fema…
## $ age <dbl> 67, 61, 80, 49, 79, 81, 74, 69, 59, 78, 81, 61, 54, …
## $ hypertension <fct> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1…
## $ heart_disease <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0…
## $ ever_married <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes…
## $ work_type <fct> Private, Self-employed, Private, Private, Self-emplo…
## $ Residence_type <fct> Urban, Rural, Rural, Urban, Rural, Urban, Rural, Urb…
## $ avg_glucose_level <dbl> 228.69, 202.21, 105.92, 171.23, 174.12, 186.21, 70.0…
## $ bmi <dbl> 36.6, 28.1, 32.5, 34.4, 24.0, 29.0, 27.4, 22.8, 28.1…
## $ smoking_status <fct> formerly smoked, never smoked, never smoked, smokes,…
## $ stroke <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye…
summary(stroke)
## gender age hypertension heart_disease ever_married
## Female:2994 Min. : 0.08 0:4612 0:4834 No :1757
## Male :2115 1st Qu.:25.00 1: 498 1: 276 Yes:3353
## Other : 1 Median :45.00
## Mean :43.23
## 3rd Qu.:61.00
## Max. :82.00
## work_type Residence_type avg_glucose_level bmi
## children : 687 Rural:2514 Min. : 55.12 Min. :10.30
## Govt_job : 657 Urban:2596 1st Qu.: 77.25 1st Qu.:23.80
## Never_worked : 22 Median : 91.89 Median :28.10
## Private :2925 Mean :106.15 Mean :28.86
## Self-employed: 819 3rd Qu.:114.09 3rd Qu.:32.80
## Max. :271.74 Max. :97.60
## smoking_status stroke
## formerly smoked: 885 No :4861
## never smoked :1892 Yes: 249
## smokes : 789
## Unknown :1544
##
##
table(stroke$stroke)
##
## No Yes
## 4861 249
prop.table(table(stroke$stroke))
##
## No Yes
## 0.95127202 0.04872798
ggplot(stroke,aes(x=age))+
geom_histogram(binwidth=5,fill='skyblue',color='black')+labs(title = 'age distribution')
ggplot(stroke,aes(x=stroke,fill=stroke))+
geom_bar()+labs(title ='stroke vs no stroke')
#task two: build prediction models
library(caTools)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ROSE)
## Loaded ROSE 0.0-4
set.seed(123)
split=sample.split(stroke$stroke,SplitRatio=0.75)
train_set=subset(stroke,split==TRUE)
test_set=subset(stroke,split==FALSE)
set.seed(456)
train_balanced=ROSE(stroke~.,data = train_set)$data
table(train_balanced$stroke)
##
## No Yes
## 1889 1944
log_model=glm(stroke~.,data = train_balanced,family = 'binomial')
rf_model=randomForest(stroke~.,data=train_balanced)
#task three: evaluate and select prediction models
library(caret)
## Loading required package: lattice
log_predictions=predict(log_model,newdata = test_set,type = 'response')
log_predicted_labels=ifelse(log_predictions>0.5,'Yes','No')
log_cm=confusionMatrix(data = as.factor(log_predicted_labels),reference = test_set$stroke)
print("logistic regression results")
## [1] "logistic regression results"
print(log_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 867 12
## Yes 348 50
##
## Accuracy : 0.7181
## 95% CI : (0.6925, 0.7426)
## No Information Rate : 0.9514
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1456
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7136
## Specificity : 0.8065
## Pos Pred Value : 0.9863
## Neg Pred Value : 0.1256
## Prevalence : 0.9514
## Detection Rate : 0.6789
## Detection Prevalence : 0.6883
## Balanced Accuracy : 0.7600
##
## 'Positive' Class : No
##
rf_predictions=predict(rf_model,newdata = test_set)
rf_cm=confusionMatrix(data = rf_predictions,reference = test_set$stroke)
print("random forest results")
## [1] "random forest results"
print(rf_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 918 15
## Yes 297 47
##
## Accuracy : 0.7557
## 95% CI : (0.7311, 0.779)
## No Information Rate : 0.9514
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1626
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7556
## Specificity : 0.7581
## Pos Pred Value : 0.9839
## Neg Pred Value : 0.1366
## Prevalence : 0.9514
## Detection Rate : 0.7189
## Detection Prevalence : 0.7306
## Balanced Accuracy : 0.7568
##
## 'Positive' Class : No
##
#task four:deploy the prediction model
set.seed(789)
final_balanced_data=ROSE(stroke~.,data = stroke)$data
final_rf_model=randomForest(stroke~.,data = final_balanced_data)
print("Final model trained on entire data")
## [1] "Final model trained on entire data"
print(final_rf_model)
##
## Call:
## randomForest(formula = stroke ~ ., data = final_balanced_data)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 16.97%
## Confusion matrix:
## No Yes class.error
## No 1968 573 0.2255018
## Yes 294 2275 0.1144414
saveRDS(final_rf_model,'stroke_model.rds')
loaded_model=readRDS('stroke_model.rds')
newpatient=data.frame(
gender=factor('Female',levels=levels(stroke$gender)),
age=67,
hypertension=factor(0,levels=levels(stroke$hypertension)),
heart_disease=factor(1,levels=levels(stroke$heart_disease)),
ever_married=factor('Yes',levels=levels(stroke$ever_married)),
work_type=factor('Private',levels=levels(stroke$work_type)),
Residence_type=factor("Urban",levels=levels(stroke$Residence_type)),
avg_glucose_level=228.69,
bmi=3664,
smoking_status=factor('formerly smoked',levels=levels(stroke$smoking_status))
)
prediction=predict(loaded_model,newdata = newpatient)
prediction
## 1
## Yes
## Levels: No Yes
#task five: findings and conclusions EDA reveals several predictors for stroke.patient age and hypertension shown strong associations with stroke incidence. the random forest model demonstrated a better performance than logistic regression model.it shown accuracy of 75.6% compared to logistic regression models 71.8%. but logistic regression model shown higher sensitivity of 80.6% for ‘yes’ stroke cases than random forest which shown only 75.8%. so logistic model is selected as final model due to its higher sensitivity. so the finalised logistic regression model was trained on complete balanced dataset to improve its predictive power.then its saved as RDS for deployment.then we used the same for a new patient whose data is hypothetical which gave ‘yes’ as prediction.