This file contains the report of the data analysis done for the project on building and deploying a stroke prediction model in R. It contains analysis such as data exploration, summary statistics and building the prediction models. The final report was completed on Tue Nov 5 03:20:38 2024.
Data Description:
According to the World Health Organization (WHO), stroke is the second leading cause of death globally, responsible for approximately 11% of total deaths.
This data set is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about one patient.
# Install necessary packages if not already installed
if (!require(caret)) install.packages('caret', dependencies=TRUE)
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
if (!require(randomForest)) install.packages('randomForest')
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
if (!require(e1071)) install.packages('e1071')
## Loading required package: e1071
if (!require(plumber)) install.packages('plumber')
## Loading required package: plumber
# Load required libraries
library(caret)
library(randomForest)
library(e1071)
library(plumber)
# Simulate stroke dataset
# The seed is fixed so that every run of the report draws the same rows.
set.seed(123)
n <- 1000 # Number of observations

# Simulated data
# Each column is drawn on its own, in the order listed below, so the random
# number stream is consumed column by column.
data <- local({
  # Draw `n` values with replacement from `choices`
  pick <- function(choices, ...) {
    sample(choices, size = n, replace = TRUE, ...)
  }
  # Draw `n` uniform values in [low, high], rounded to one decimal place
  measure <- function(low, high) {
    round(runif(n, min = low, max = high), digits = 1)
  }

  data.frame(
    gender = pick(c("Male", "Female")),
    age = pick(25:85),
    hypertension = pick(c(0, 1)),
    heart_disease = pick(c(0, 1)),
    ever_married = pick(c("Yes", "No")),
    work_type = pick(
      c("Private", "Self-employed", "Govt_job", "Children", "Never_worked")
    ),
    Residence_type = pick(c("Urban", "Rural")),
    avg_glucose_level = measure(50, 300),
    bmi = measure(10, 50),
    smoking_status = pick(
      c("formerly smoked", "never smoked", "smokes", "Unknown")
    ),
    # 90% non-stroke, 10% stroke
    stroke = pick(c(0, 1), prob = c(0.9, 0.1))
  )
})

# View first few rows of the dataset
head(data)
## gender age hypertension heart_disease ever_married work_type
## 1 Male 37 1 0 No Govt_job
## 2 Male 32 1 0 No Never_worked
## 3 Male 26 1 0 No Never_worked
## 4 Female 83 1 1 No Govt_job
## 5 Male 30 0 1 Yes Self-employed
## 6 Female 47 1 0 Yes Govt_job
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Rural 204.5 33.6 formerly smoked 0
## 2 Urban 213.4 13.4 formerly smoked 1
## 3 Rural 283.5 40.1 smokes 0
## 4 Rural 202.2 16.2 Unknown 0
## 5 Urban 232.8 12.7 never smoked 0
## 6 Rural 91.4 14.8 never smoked 0
# Convert categorical variables to factors
# The text columns and the outcome become factors; the numeric 0/1 indicator
# columns (hypertension, heart_disease) are left untouched.
data <- local({
  converted <- data
  categorical <- c(
    "gender", "ever_married", "work_type",
    "Residence_type", "smoking_status", "stroke"
  )
  for (column in categorical) {
    converted[[column]] <- as.factor(converted[[column]])
  }
  converted
})

# Summary of the dataset
summary(data)
## gender age hypertension heart_disease ever_married
## Female:494 Min. :25.00 Min. :0.000 Min. :0.000 No :540
## Male :506 1st Qu.:39.00 1st Qu.:0.000 1st Qu.:0.000 Yes:460
## Median :53.00 Median :0.000 Median :1.000
## Mean :53.46 Mean :0.482 Mean :0.505
## 3rd Qu.:68.00 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :85.00 Max. :1.000 Max. :1.000
## work_type Residence_type avg_glucose_level bmi
## Children :200 Rural:480 Min. : 50.5 Min. :10.00
## Govt_job :177 Urban:520 1st Qu.:112.8 1st Qu.:19.80
## Never_worked :196 Median :171.9 Median :29.60
## Private :204 Mean :172.9 Mean :29.68
## Self-employed:223 3rd Qu.:235.5 3rd Qu.:39.00
## Max. :299.6 Max. :50.00
## smoking_status stroke
## formerly smoked:258 0:899
## never smoked :249 1:101
## smokes :244
## Unknown :249
##
##
# Check for missing values
# Counts NA cells across all columns of `data`; 0 means no value is missing.
sum(is.na(data))
## [1] 0
## Describe and explore the data
# Split the data into training and testing sets
# The seed is reset so the partition is reproducible. `p = 0.7` requests about
# 70% of the rows for training and `list = FALSE` returns the selected row
# indices in a form that can index `data` directly.
set.seed(123)
trainIndex <- createDataPartition(data$stroke, p = 0.7, list = FALSE)
# Rows picked by the partition form the training set; all remaining rows are
# held out for testing.
train_data <- data[trainIndex,]
test_data <- data[-trainIndex,]
# Train a Random Forest model
# The seed is reset again so the forest itself is reproducible. `stroke ~ .`
# uses every other column of `train_data` as a predictor, and
# `importance = TRUE` requests the importance measures that are inspected
# later with importance() and varImpPlot().
set.seed(123)
rf_model <- randomForest(stroke ~ ., data = train_data, importance = TRUE)
# Printing the fit shows the out-of-bag error estimate and confusion matrix.
print(rf_model)
##
## Call:
## randomForest(formula = stroke ~ ., data = train_data, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 10.27%
## Confusion matrix:
## 0 1 class.error
## 0 629 1 0.001587302
## 1 71 0 1.000000000
# Task Three: Evaluate and select prediction models
# Predict on the test data
# Returns one predicted class per row of the held-out set.
rf_predictions <- predict(rf_model, test_data)
# Confusion matrix to evaluate the model
# First argument: predicted classes; second argument: reference classes.
# NOTE(review): the recorded output reports 'Positive' Class : 0, so the
# Sensitivity / Pos Pred Value figures describe the non-stroke class. The
# recorded table also shows no test row predicted as 1, with Accuracy equal to
# the No Information Rate. Consider passing positive = "1" and re-knitting so
# the metrics describe stroke detection.
confusionMatrix(rf_predictions, test_data$stroke)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 269 30
## 1 0 0
##
## Accuracy : 0.8997
## 95% CI : (0.8599, 0.9313)
## No Information Rate : 0.8997
## P-Value [Acc > NIR] : 0.5484
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.192e-07
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8997
## Neg Pred Value : NaN
## Prevalence : 0.8997
## Detection Rate : 0.8997
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
# Variable importance
# Prints one row per predictor. In the recorded output the columns are the
# per-class measures (0 and 1), MeanDecreaseAccuracy and MeanDecreaseGini.
importance(rf_model)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## gender -0.8969943 0.3657567 -0.7292315 3.612849
## age 1.7085166 -2.5981278 0.8340336 25.489796
## hypertension 0.1844302 -2.4982162 -0.5920865 4.547966
## heart_disease 1.0357498 1.1367636 1.4407012 4.280662
## ever_married 2.1435691 3.3105960 2.8879147 4.342828
## work_type 1.0682102 -0.6410287 0.8051711 12.293969
## Residence_type -0.9240259 -2.6602549 -1.7478691 3.708690
## avg_glucose_level 0.7972631 1.0714482 1.0651290 30.404742
## bmi 0.8993115 -0.9651927 0.5422147 27.415022
## smoking_status -1.3267743 -2.5081543 -2.1403343 9.547092
# Plot the importance measures of the fitted forest.
varImpPlot(rf_model)
# Task Four: Deploy the prediction model
# Save the model
# The fitted forest is serialised to "stroke_model.rds", relative to the
# current working directory, so it can be read back later with readRDS().
saveRDS(rf_model, "stroke_model.rds")
# Create a Plumber API to deploy the model
# Create a new R script named "plumber.R" and add the following:
# plumber.R

# Factor levels in the order the model was trained with. The training script
# converts these columns with as.factor(), which sorts the levels, so the API
# must build its factors with the same ordering; otherwise categories can be
# mis-coded when the forest is scored.
stroke_factor_levels <- list(
  gender = c("Female", "Male"),
  ever_married = c("No", "Yes"),
  work_type = c(
    "Children", "Govt_job", "Never_worked", "Private", "Self-employed"
  ),
  Residence_type = c("Rural", "Urban"),
  smoking_status = c("formerly smoked", "never smoked", "smokes", "Unknown")
)

# Convert a request value to numeric; fail loudly instead of passing NA on.
as_stroke_number <- function(value, name) {
  # Only the coercion warning is silenced; bad input is reported just below.
  number <- suppressWarnings(as.numeric(value))
  if (length(number) == 0L || anyNA(number)) {
    stop(sprintf("`%s` must be numeric.", name), call. = FALSE)
  }
  number
}

# Convert a request value to a factor with the training levels for `name`;
# unknown or missing categories are rejected instead of silently becoming NA.
as_stroke_factor <- function(value, name) {
  allowed <- stroke_factor_levels[[name]]
  value <- as.character(value)
  if (length(value) == 0L || !all(value %in% allowed)) {
    stop(
      sprintf(
        "`%s` must be one of: %s.", name, paste(allowed, collapse = ", ")
      ),
      call. = FALSE
    )
  }
  factor(value, levels = allowed)
}

# Build the data frame passed to the model from the raw request values.
# Columns appear in the same order as the training predictors.
build_stroke_input <- function(gender, age, hypertension, heart_disease,
                               ever_married, work_type, Residence_type,
                               avg_glucose_level, bmi, smoking_status) {
  data.frame(
    gender = as_stroke_factor(gender, "gender"),
    age = as_stroke_number(age, "age"),
    hypertension = as_stroke_number(hypertension, "hypertension"),
    heart_disease = as_stroke_number(heart_disease, "heart_disease"),
    ever_married = as_stroke_factor(ever_married, "ever_married"),
    work_type = as_stroke_factor(work_type, "work_type"),
    Residence_type = as_stroke_factor(Residence_type, "Residence_type"),
    avg_glucose_level = as_stroke_number(
      avg_glucose_level, "avg_glucose_level"
    ),
    bmi = as_stroke_number(bmi, "bmi"),
    smoking_status = as_stroke_factor(smoking_status, "smoking_status")
  )
}

#' Predict the stroke class for the supplied patient attributes
#' @post /predict
#' @param gender "Male" or "Female"
#' @param age Age (numeric)
#' @param hypertension 0 or 1
#' @param heart_disease 0 or 1
#' @param ever_married "Yes" or "No"
#' @param work_type "Private", "Self-employed", "Govt_job", "Children" or "Never_worked"
#' @param Residence_type "Urban" or "Rural"
#' @param avg_glucose_level Average glucose level (numeric)
#' @param bmi Body mass index (numeric)
#' @param smoking_status "formerly smoked", "never smoked", "smokes" or "Unknown"
#' @response 200 Predicted class as a string ("0" or "1")
function(gender, age, hypertension, heart_disease, ever_married, work_type, Residence_type, avg_glucose_level, bmi, smoking_status) {
  # readRDS() does not load the package that created the object, so make sure
  # the randomForest namespace (and its predict method) is available.
  if (!requireNamespace("randomForest", quietly = TRUE)) {
    stop("Package 'randomForest' is required to score the model.",
         call. = FALSE)
  }
  # Load the saved model
  model <- readRDS("stroke_model.rds")
  # Create a new data frame with the validated inputs
  new_data <- build_stroke_input(
    gender, age, hypertension, heart_disease, ever_married, work_type,
    Residence_type, avg_glucose_level, bmi, smoking_status
  )
  # Predict using the model
  prediction <- predict(model, new_data)
  as.character(prediction)
}
## function(gender, age, hypertension, heart_disease, ever_married, work_type, Residence_type, avg_glucose_level, bmi, smoking_status) {
## # Load the saved model
## model <- readRDS("stroke_model.rds")
##
## # Create a new data frame with the inputs
## new_data <- data.frame(
## gender = factor(gender, levels = c("Male", "Female")),
## age = as.numeric(age),
## hypertension = as.numeric(hypertension),
## heart_disease = as.numeric(heart_disease),
## ever_married = factor(ever_married, levels = c("Yes", "No")),
## work_type = factor(work_type, levels = c("Private", "Self-employed", "Govt_job", "Children", "Never_worked")),
## Residence_type = factor(Residence_type, levels = c("Urban", "Rural")),
## avg_glucose_level = as.numeric(avg_glucose_level),
## bmi = as.numeric(bmi),
## smoking_status = factor(smoking_status, levels = c("formerly smoked", "never smoked", "smokes", "Unknown"))
## )
##
## # Predict using the model
## prediction <- predict(model, new_data)
##
## return(as.character(prediction))
## }
# Serve the API locally.
# NOTE: r$run() keeps serving until the server is stopped, so the deployment
# steps below are meant to be run separately (for example in another session).
library(plumber)
r <- plumb("plumber.R")
r$run(port = 8000)

# Deploy with rsconnect.
library(packrat)
library(rsconnect)

# SECURITY: never store the rsconnect token or secret in a script or report.
# They are read from the RSCONNECT_TOKEN and RSCONNECT_SECRET environment
# variables instead. Any token/secret pair that has been published in plain
# text should be revoked and regenerated.
rsconnect_token <- Sys.getenv("RSCONNECT_TOKEN")
rsconnect_secret <- Sys.getenv("RSCONNECT_SECRET")
if (!nzchar(rsconnect_token) || !nzchar(rsconnect_secret)) {
  stop(
    "Set the RSCONNECT_TOKEN and RSCONNECT_SECRET environment variables ",
    "before deploying.",
    call. = FALSE
  )
}
rsconnect::setAccountInfo(
  name = "joih3r-wan0mohamad0hanis0bin0wan0hassan",
  token = rsconnect_token,
  secret = rsconnect_secret
)
# Replace the placeholder with the directory that holds the app to deploy.
rsconnect::deployApp("path/to/your/app_or_project")