BISHAL DHUNGANA
05-01-2024
ABOUT DATA ANALYSIS REPORT
This RMarkdown file contains the report of the data analysis done for the project on building and deploying a stroke prediction model in R. It contains analysis such as data exploration, summary statistics, and building the prediction models.
DATA DESCRIPTION:
According to the World Health Organization (WHO), STROKE IS THE 2ND LEADING CAUSE OF DEATH GLOBALLY, RESPONSIBLE FOR APPROXIMATELY 11% OF TOTAL DEATHS.
This dataset is used to predict whether a patient is likely to get stroke based on the input parameters like gender, age, various diseases, and smoking status. Each row in the data provides relevant information about the patient.
TASK ONE: IMPORT DATA AND DATA PREPROCESSING Load data and install packages
packages <- c("tidyverse", "lubridate", "ggplot2", "dplyr", "caret", "randomForest", "skimr", "gridExtra", "caTools", "corrplot", "ggcorrplot", "naniar")
# Install and load required packages
for (pkg in packages) {
if (!require(pkg, character.only = TRUE)) {
install.packages(pkg)
library(pkg, character.only = TRUE)
}
}
Data_Stroke <- read.csv('healthcare-dataset-stroke-data.csv')
summary(Data_Stroke)
id gender age hypertension heart_disease ever_married work_type
Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000 Min. :0.00000 Length:5110 Length:5110
1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000 1st Qu.:0.00000 Class :character Class :character
Median :36932 Mode :character Median :45.00 Median :0.00000 Median :0.00000 Mode :character Mode :character
Mean :36518 Mean :43.23 Mean :0.09746 Mean :0.05401
3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :72940 Max. :82.00 Max. :1.00000 Max. :1.00000
Residence_type avg_glucose_level bmi smoking_status stroke
Length:5110 Min. : 55.12 Length:5110 Length:5110 Min. :0.00000
Class :character 1st Qu.: 77.25 Class :character Class :character 1st Qu.:0.00000
Mode :character Median : 91.89 Mode :character Mode :character Median :0.00000
Mean :106.15 Mean :0.04873
3rd Qu.:114.09 3rd Qu.:0.00000
Max. :271.74 Max. :1.00000
glimpse(Data_Stroke)
Rows: 5,110
Columns: 12
$ id <int> 9046, 51676, 31112, 60182, 1665, 56669, 53882, 10434, 27419, 60491, 12109, 12095, 12175, 8213, 5317,…
$ gender <chr> "Male", "Female", "Male", "Female", "Female", "Male", "Male", "Female", "Female", "Female", "Female"…
$ age <dbl> 67, 61, 80, 49, 79, 81, 74, 69, 59, 78, 81, 61, 54, 78, 79, 50, 64, 75, 60, 57, 71, 52, 79, 82, 71, …
$ hypertension <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0…
$ heart_disease <int> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1…
$ ever_married <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Ye…
$ work_type <chr> "Private", "Self-employed", "Private", "Private", "Self-employed", "Private", "Private", "Private", …
$ Residence_type <chr> "Urban", "Rural", "Rural", "Urban", "Rural", "Urban", "Rural", "Urban", "Rural", "Urban", "Rural", "…
$ avg_glucose_level <dbl> 228.69, 202.21, 105.92, 171.23, 174.12, 186.21, 70.09, 94.39, 76.15, 58.57, 80.43, 120.46, 104.51, 2…
$ bmi <chr> "36.6", "N/A", "32.5", "34.4", "24", "29", "27.4", "22.8", "N/A", "24.2", "29.7", "36.8", "27.3", "N…
$ smoking_status <chr> "formerly smoked", "never smoked", "never smoked", "smokes", "never smoked", "formerly smoked", "nev…
$ stroke <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
skim(Data_Stroke)
── Data Summary ────────────────────────
Values
Name Data_Stroke
Number of rows 5110
Number of columns 12
_______________________
Column type frequency:
character 6
numeric 6
________________________
Group variables None
miss_scan_count(data = Data_Stroke, search = list("Unknown","N/A","Other"))
##Convert NA to median in BMI
Data_Stroke$bmi <- as.numeric(Data_Stroke$bmi)
Warning: NAs introduced by coercion
idx <- complete.cases(Data_Stroke)
bmi_idx <- is.na(Data_Stroke$bmi)
median_bmi <- median(Data_Stroke$bmi, na.rm = TRUE)
Data_Stroke[bmi_idx,]$bmi <- median_bmi
colSums(is.na(Data_Stroke))
id gender age hypertension heart_disease ever_married work_type
0 0 0 0 0 0 0
Residence_type avg_glucose_level bmi smoking_status stroke
0 0 0 0 0
##Check duplicates
sum(duplicated(Data_Stroke))
[1] 0
colSums(Data_Stroke == 'N/A')
id gender age hypertension heart_disease ever_married work_type
0 0 0 0 0 0 0
Residence_type avg_glucose_level bmi smoking_status stroke
0 0 0 0 0
colSums(Data_Stroke == '')
id gender age hypertension heart_disease ever_married work_type
0 0 0 0 0 0 0
Residence_type avg_glucose_level bmi smoking_status stroke
0 0 0 0 0
Data_Stroke %>% count(gender)
##Remove ID and filter out 'Other' values in Gender
Data_Stroke <- Data_Stroke %>%
select(-c(id)) %>%
filter(gender != "Other")
str(Data_Stroke)
'data.frame': 5109 obs. of 11 variables:
$ gender : chr "Male" "Female" "Male" "Female" ...
$ age : num 67 61 80 49 79 81 74 69 59 78 ...
$ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
$ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
$ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
$ work_type : chr "Private" "Self-employed" "Private" "Private" ...
$ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
$ avg_glucose_level: num 229 202 106 171 174 ...
$ bmi : num 36.6 28.1 32.5 34.4 24 29 27.4 22.8 28.1 24.2 ...
$ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
$ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
##Convert non-numeric variables to factors
Data_Stroke$stroke <- factor(Data_Stroke$stroke, levels = c(0,1), labels = c("No", "Yes"))
Data_Stroke$hypertension <- factor(Data_Stroke$hypertension, levels = c(0,1), labels = c("No", "Yes"))
Data_Stroke$heart_disease <- factor(Data_Stroke$heart_disease, levels = c(0,1), labels = c("No", "Yes"))
TASK TWO: BUILD PREDICTION MODELS
d1 <- Data_Stroke %>%
ggplot(aes(x = gender, fill = gender)) +
geom_bar(fill = c("red", "blue")) +
ggtitle("Gender Distribution") +
geom_text(aes(label=..count..), stat = "Count", vjust = 1.0)
d2 <- Data_Stroke %>%
ggplot(aes(x = hypertension, fill = hypertension)) +
geom_bar(fill = c("red", "blue")) +
ggtitle("Hypertenstion Distribution") +
geom_text(aes(label=..count..), stat = "Count", vjust = 1.0)
d3 <- Data_Stroke %>%
ggplot(aes(x = heart_disease, fill = heart_disease)) +
geom_bar(fill = c("red", "blue")) +
ggtitle("Heart Disease Distribution") +
geom_text(aes(label=..count..), stat = "Count", vjust = 1.0)
d4 <- Data_Stroke %>%
ggplot(aes(x = ever_married, fill = ever_married)) +
geom_bar(fill = c("red","blue")) +
ggtitle("Married distribution") +
geom_text(aes(label=..count..), stat = "Count", vjust = 1.0)
d5 <- Data_Stroke %>%
ggplot(aes(x = work_type, fill = work_type)) +
geom_bar(fill = c("red", "blue","green","orange","aquamarine")) +
ggtitle("Work type distribution") +
geom_text(aes(label=..count..), stat = "Count", vjust = 1.0)
d6 <- Data_Stroke %>%
ggplot(aes(x = stroke, fill = stroke)) +
geom_bar(fill = c("red", "blue")) +
ggtitle("Stroke distribution") +
geom_text(aes(label=..count..), stat = "Count", vjust = 1.0)
d7 <- Data_Stroke %>%
ggplot(aes(x = Residence_type, fill = Residence_type)) +
geom_bar(fill = c("red", "blue")) +
ggtitle("Residence distribution") +
geom_text(aes(label=..count..), stat = "Count", vjust = 1.0)
grid.arrange(d1,d2,d3,d4,d5,d6,d7, ncol=2)
Data_Stroke %>%
ggplot(aes(x = gender, fill = stroke)) +
geom_bar(position = "fill") +
scale_fill_manual(values=c("aquamarine3",
"blueviolet")) +
ggtitle("Gender vs. Stroke")
Data_Stroke %>%
ggplot(aes(x = hypertension, fill = stroke)) +
geom_bar(position = "fill") +
scale_fill_manual(values=c("aquamarine3",
"blueviolet")) +
ggtitle("Hypertension vs. Stroke")
Data_Stroke %>%
ggplot(aes(x = heart_disease, fill = stroke)) +
geom_bar(position = "fill") +
scale_fill_manual(values=c("aquamarine3",
"blueviolet")) +
ggtitle("Heart disease vs. Stroke")
Data_Stroke %>%
ggplot(aes(x = Residence_type, fill = stroke)) +
geom_bar(position = "fill") +
scale_fill_manual(values=c("aquamarine3",
"blueviolet")) +
ggtitle("Residence type vs. Stroke")
Data_Stroke %>%
ggplot(aes(x = smoking_status, fill = stroke)) +
geom_bar(position = "fill") +
scale_fill_manual(values=c("aquamarine3",
"blueviolet")) +
ggtitle("Smoking status vs. Stroke")
Data_Stroke %>%
ggplot(aes(x = work_type, fill = stroke)) +
geom_bar(position = "fill") +
scale_fill_manual(values=c("aquamarine3",
"blueviolet"
)) +
ggtitle("Type of Work vs. Stroke")
Data_Stroke %>%
ggplot(aes(x = avg_glucose_level, fill = stroke)) +
geom_density(alpha = 0.7) +
scale_fill_manual(values=c("aquamarine3",
"blueviolet"
)) +
ggtitle("Average Glucose level vs. Stroke")
Data_Stroke %>% filter(between(bmi, 0, 60)) %>%
ggplot(aes(x = bmi, fill = stroke)) +
geom_density(alpha = 0.7) +
scale_fill_manual(values=c("aquamarine3",
"blueviolet"
)) +
ggtitle("Body Mass Index vs. Stroke")
TASK THREE: EVALUATE AND SELECT PREDICTION MODELS
sample.split(Data_Stroke$stroke,SplitRatio = 0.8)->split_tag
train<-subset(Data_Stroke,split_tag==TRUE)
test<-subset(Data_Stroke,split_tag==FALSE)
dim(train)
[1] 4087 11
dim(test)
[1] 1022 11
TASK FOUR: DEPLOY THE MODEL
set.seed(123)
rf <- randomForest(formula = stroke~.,data = train)
rf
Call:
randomForest(formula = stroke ~ ., data = train)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3
OOB estimate of error rate: 5.02%
Confusion matrix:
No Yes class.error
No 3881 7 0.001800412
Yes 198 1 0.994974874
plot(rf)
confusionMatrix(predict(rf,test),test$stroke)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 971 50
Yes 1 0
Accuracy : 0.9501
95% CI : (0.9349, 0.9626)
No Information Rate : 0.9511
P-Value [Acc > NIR] : 0.5942
Kappa : -0.0019
Mcnemar's Test P-Value : 1.801e-11
Sensitivity : 0.9990
Specificity : 0.0000
Pos Pred Value : 0.9510
Neg Pred Value : 0.0000
Prevalence : 0.9511
Detection Rate : 0.9501
Detection Prevalence : 0.9990
Balanced Accuracy : 0.4995
'Positive' Class : No
TASK FIVE: FINDINGS AND CONCLUSIONS As depicted above, our model boasts an accuracy rate exceeding 95%, indicating that it underwent effective training.