About Data Analysis Report

This file reports the data analysis carried out for a project on building and deploying a stroke prediction model in R. It covers data exploration, summary statistics, and the construction of the prediction models. The final report was completed on Tue Nov 5 03:20:38 2024.

Data Description:

According to the World Health Organization (WHO), stroke is the second leading cause of death globally, responsible for approximately 11% of total deaths.

This data set is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about one patient.

Task One: Import data and data preprocessing

Load data and install packages

# Install necessary packages if not already installed
if (!require(caret)) install.packages('caret', dependencies=TRUE)

## Loading required package: caret

## Loading required package: ggplot2

## Loading required package: lattice

if (!require(randomForest)) install.packages('randomForest')

## Loading required package: randomForest

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

if (!require(e1071)) install.packages('e1071')

## Loading required package: e1071

if (!require(plumber)) install.packages('plumber')

## Loading required package: plumber

# Load required libraries
library(caret)
library(randomForest)
library(e1071)
library(plumber)

# Build a reproducible synthetic stand-in for the stroke dataset
set.seed(123)

n <- 1000  # rows to generate

# Columns are drawn in the order they are listed, so the random-number
# stream is consumed in a fixed sequence and the result is reproducible.
data <- local({
  # n draws with replacement from `choices`; extra arguments go to sample()
  pick <- function(choices, ...) sample(choices, size = n, replace = TRUE, ...)
  # n uniform draws on [lo, hi], rounded to one decimal place
  uniform_1dp <- function(lo, hi) round(runif(n, min = lo, max = hi), digits = 1)

  data.frame(
    gender = pick(c("Male", "Female")),
    age = pick(25:85),
    hypertension = pick(c(0, 1)),
    heart_disease = pick(c(0, 1)),
    ever_married = pick(c("Yes", "No")),
    work_type = pick(
      c("Private", "Self-employed", "Govt_job", "Children", "Never_worked")
    ),
    Residence_type = pick(c("Urban", "Rural")),
    avg_glucose_level = uniform_1dp(50, 300),
    bmi = uniform_1dp(10, 50),
    smoking_status = pick(
      c("formerly smoked", "never smoked", "smokes", "Unknown")
    ),
    # Outcome: about 90% non-stroke (0) and 10% stroke (1)
    stroke = pick(c(0, 1), prob = c(0.9, 0.1))
  )
})

# Show the first six rows
head(data, n = 6)

##   gender age hypertension heart_disease ever_married     work_type
## 1   Male  37            1             0           No      Govt_job
## 2   Male  32            1             0           No  Never_worked
## 3   Male  26            1             0           No  Never_worked
## 4 Female  83            1             1           No      Govt_job
## 5   Male  30            0             1          Yes Self-employed
## 6 Female  47            1             0          Yes      Govt_job
##   Residence_type avg_glucose_level  bmi  smoking_status stroke
## 1          Rural             204.5 33.6 formerly smoked      0
## 2          Urban             213.4 13.4 formerly smoked      1
## 3          Rural             283.5 40.1          smokes      0
## 4          Rural             202.2 16.2         Unknown      0
## 5          Urban             232.8 12.7    never smoked      0
## 6          Rural              91.4 14.8    never smoked      0

# Turn every categorical column, and the stroke outcome, into a factor
categorical_cols <- c(
  "gender", "ever_married", "work_type",
  "Residence_type", "smoking_status", "stroke"
)
data[categorical_cols] <- lapply(data[categorical_cols], as.factor)

# Per-column overview: level counts for factors, quantiles for numerics
summary(data)

##     gender         age         hypertension   heart_disease   ever_married
##  Female:494   Min.   :25.00   Min.   :0.000   Min.   :0.000   No :540     
##  Male  :506   1st Qu.:39.00   1st Qu.:0.000   1st Qu.:0.000   Yes:460     
##               Median :53.00   Median :0.000   Median :1.000               
##               Mean   :53.46   Mean   :0.482   Mean   :0.505               
##               3rd Qu.:68.00   3rd Qu.:1.000   3rd Qu.:1.000               
##               Max.   :85.00   Max.   :1.000   Max.   :1.000               
##          work_type   Residence_type avg_glucose_level      bmi       
##  Children     :200   Rural:480      Min.   : 50.5     Min.   :10.00  
##  Govt_job     :177   Urban:520      1st Qu.:112.8     1st Qu.:19.80  
##  Never_worked :196                  Median :171.9     Median :29.60  
##  Private      :204                  Mean   :172.9     Mean   :29.68  
##  Self-employed:223                  3rd Qu.:235.5     3rd Qu.:39.00  
##                                     Max.   :299.6     Max.   :50.00  
##          smoking_status stroke 
##  formerly smoked:258    0:899  
##  never smoked   :249    1:101  
##  smokes         :244           
##  Unknown        :249           
##                                
##

# Check for missing values: total number of NA cells across all columns
sum(is.na(data))

## [1] 0

## Describe and explore the data

Task Two: Build prediction models

# Hold out 30% of the rows for testing; the partition is drawn on the outcome
set.seed(123)
trainIndex <- createDataPartition(y = data$stroke, p = 0.7, list = FALSE)
train_data <- data[trainIndex, , drop = FALSE]
test_data <- data[-trainIndex, , drop = FALSE]

# Fit a random forest on the training rows, using every other column as a
# predictor and keeping the importance measures for later inspection
set.seed(123)
rf_model <- randomForest(
  formula = stroke ~ .,
  data = train_data,
  importance = TRUE
)
print(rf_model)

## 
## Call:
##  randomForest(formula = stroke ~ ., data = train_data, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 10.27%
## Confusion matrix:
##     0 1 class.error
## 0 629 1 0.001587302
## 1  71 0 1.000000000

# Task Three: Evaluate and select prediction models
# Score the held-out rows with the fitted forest
rf_predictions <- predict(rf_model, newdata = test_data)

# Compare predictions with the true labels.
# Note: the report below names "0" (no stroke) as the 'Positive' class, so its
# Sensitivity describes non-stroke cases and its Specificity describes stroke
# cases.
confusionMatrix(data = rf_predictions, reference = test_data$stroke)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 269  30
##          1   0   0
##                                           
##                Accuracy : 0.8997          
##                  95% CI : (0.8599, 0.9313)
##     No Information Rate : 0.8997          
##     P-Value [Acc > NIR] : 0.5484          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.192e-07       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8997          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8997          
##          Detection Rate : 0.8997          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
##

# Variable importance: one row per predictor, with a column for each outcome
# class ("0", "1") plus MeanDecreaseAccuracy and MeanDecreaseGini
importance(rf_model)

##                            0          1 MeanDecreaseAccuracy MeanDecreaseGini
## gender            -0.8969943  0.3657567           -0.7292315         3.612849
## age                1.7085166 -2.5981278            0.8340336        25.489796
## hypertension       0.1844302 -2.4982162           -0.5920865         4.547966
## heart_disease      1.0357498  1.1367636            1.4407012         4.280662
## ever_married       2.1435691  3.3105960            2.8879147         4.342828
## work_type          1.0682102 -0.6410287            0.8051711        12.293969
## Residence_type    -0.9240259 -2.6602549           -1.7478691         3.708690
## avg_glucose_level  0.7972631  1.0714482            1.0651290        30.404742
## bmi                0.8993115 -0.9651927            0.5422147        27.415022
## smoking_status    -1.3267743 -2.5081543           -2.1403343         9.547092

# Plot the forest's variable importance measures for each predictor
varImpPlot(rf_model)

# Task Four: Deploy the prediction model

# Persist the fitted forest so the API can load it from disk
saveRDS(object = rf_model, file = "stroke_model.rds")

# Create a Plumber API to deploy the model
# Create a new R script named "plumber.R" and add the following:

# plumber.R

# Holds the fitted model so the .rds file is read once per API process rather
# than on every request.
.model_cache <- new.env(parent = emptyenv())

#' Predict stroke status for a patient.
#' Category labels are case-sensitive and must be one of the values listed.
#' @post /predict
#' @param gender "Male" or "Female"
#' @param age Age in years (numeric)
#' @param hypertension 0 or 1
#' @param heart_disease 0 or 1
#' @param ever_married "Yes" or "No"
#' @param work_type "Private", "Self-employed", "Govt_job", "Children" or "Never_worked"
#' @param Residence_type "Urban" or "Rural"
#' @param avg_glucose_level Average glucose level (numeric)
#' @param bmi Body mass index (numeric)
#' @param smoking_status "formerly smoked", "never smoked", "smokes" or "Unknown"
#' @response 200 Predicted class as a string: "0" (no stroke) or "1" (stroke)
#' @response 400 An input is empty, not numeric, or not a known category
function(gender, age, hypertension, heart_disease, ever_married, work_type,
         Residence_type, avg_glucose_level, bmi, smoking_status, res = NULL) {
  # Read the saved model from disk on first use only
  if (is.null(.model_cache$model)) {
    .model_cache$model <- readRDS("stroke_model.rds")
  }
  model <- .model_cache$model

  # Text that is not a number becomes NA and is rejected below
  to_number <- function(x) suppressWarnings(as.numeric(x))

  # Levels are given in the same order as the factor levels of the training
  # data, so that factor codes mean the same thing here as in the fitted model
  new_data <- data.frame(
    gender = factor(gender, levels = c("Female", "Male")),
    age = to_number(age),
    hypertension = to_number(hypertension),
    heart_disease = to_number(heart_disease),
    ever_married = factor(ever_married, levels = c("No", "Yes")),
    work_type = factor(
      work_type,
      levels = c("Children", "Govt_job", "Never_worked", "Private", "Self-employed")
    ),
    Residence_type = factor(Residence_type, levels = c("Rural", "Urban")),
    avg_glucose_level = to_number(avg_glucose_level),
    bmi = to_number(bmi),
    smoking_status = factor(
      smoking_status,
      levels = c("formerly smoked", "never smoked", "smokes", "Unknown")
    )
  )

  # Unknown labels and unparsable numbers are NA at this point; report them to
  # the client rather than passing NA on to predict()
  bad_fields <- names(new_data)[vapply(new_data, anyNA, logical(1))]
  if (nrow(new_data) == 0L || length(bad_fields) > 0L) {
    if (!is.null(res)) {
      res$status <- 400
    }
    return(list(
      error = "Missing or invalid input",
      fields = bad_fields
    ))
  }

  # Predict using the model
  as.character(predict(model, new_data))
}

## function(gender, age, hypertension, heart_disease, ever_married, work_type,
##          Residence_type, avg_glucose_level, bmi, smoking_status, res = NULL) {
##   # Read the saved model from disk on first use only
##   if (is.null(.model_cache$model)) {
##     .model_cache$model <- readRDS("stroke_model.rds")
##   }
##   model <- .model_cache$model
##
##   # Text that is not a number becomes NA and is rejected below
##   to_number <- function(x) suppressWarnings(as.numeric(x))
##
##   # Levels are given in the same order as the factor levels of the training
##   # data, so that factor codes mean the same thing here as in the fitted model
##   new_data <- data.frame(
##     gender = factor(gender, levels = c("Female", "Male")),
##     age = to_number(age),
##     hypertension = to_number(hypertension),
##     heart_disease = to_number(heart_disease),
##     ever_married = factor(ever_married, levels = c("No", "Yes")),
##     work_type = factor(
##       work_type,
##       levels = c("Children", "Govt_job", "Never_worked", "Private", "Self-employed")
##     ),
##     Residence_type = factor(Residence_type, levels = c("Rural", "Urban")),
##     avg_glucose_level = to_number(avg_glucose_level),
##     bmi = to_number(bmi),
##     smoking_status = factor(
##       smoking_status,
##       levels = c("formerly smoked", "never smoked", "smokes", "Unknown")
##     )
##   )
##
##   # Unknown labels and unparsable numbers are NA at this point; report them to
##   # the client rather than passing NA on to predict()
##   bad_fields <- names(new_data)[vapply(new_data, anyNA, logical(1))]
##   if (nrow(new_data) == 0L || length(bad_fields) > 0L) {
##     if (!is.null(res)) {
##       res$status <- 400
##     }
##     return(list(
##       error = "Missing or invalid input",
##       fields = bad_fields
##     ))
##   }
##
##   # Predict using the model
##   as.character(predict(model, new_data))
## }

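Final Task: Findings and Conclusions

On the 299 held-out patients the random forest predicted "no stroke" for every case. Its accuracy (0.8997) is therefore identical to the no-information rate, Kappa is 0, and none of the 30 actual stroke cases were detected. This is expected: the data are simulated and the stroke label is drawn independently of every predictor, so there is no signal to learn, and the 90/10 class split favours the majority class. The deployment steps below therefore demonstrate the workflow only; a usable model requires real patient data and a strategy for handling the class imbalance.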

# Run the API locally on port 8000
library(plumber)
r <- plumb("plumber.R")
r$run(port = 8000)

# Deploy with rsconnect
library(packrat)
library(rsconnect)

# The token and secret are read from environment variables so that they never
# appear in the report or in version control. The values that were previously
# written here in plain text should be treated as leaked and revoked.
rsconnect::setAccountInfo(
  name = "joih3r-wan0mohamad0hanis0bin0wan0hassan",
  token = Sys.getenv("RSCONNECT_TOKEN"),
  secret = Sys.getenv("RSCONNECT_SECRET")
)
rsconnect::deployApp("path/to/your/app_or_project")

Build and deploy a stroke prediction model using R

Wan Mohamad Hanis bin Wan Hassan

2024-11-05

About Data Analysis Report

Task One: Import data and data preprocessing

Load data and install packages

Task Two: Build prediction models

Final Task: Findings and Conclusions