Jimmy Chan
Last updated: 1 November 2025
# Load the stroke data set; the relative path is resolved against the working
# directory.
dat_stroke <- read_csv("stroke_dataset.csv")
# Keep only the three variables analysed below. Columns are selected by name
# rather than by position so the subset stays correct if the column order of
# the CSV ever changes.
stroke_ds <- dat_stroke[, c("Residence_type", "avg_glucose_level", "stroke")]
Check data frame structure
## 'data.frame': 5110 obs. of 3 variables:
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ stroke : num 1 1 1 1 1 1 1 1 1 1 ...
Check missing values
## Residence_type avg_glucose_level stroke
## 0 0 0
Convert to factors
# Residence type: two-level factor with "Rural" listed first.
stroke_ds$Residence_type <- factor(
  stroke_ds$Residence_type,
  levels = c("Rural", "Urban")
)
levels(stroke_ds$Residence_type)
## [1] "Rural" "Urban"
# Stroke outcome: recode the numeric 0/1 indicator as a labelled factor.
stroke_ds$stroke <- factor(
  stroke_ds$stroke,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
levels(stroke_ds$stroke)
## [1] "No" "Yes"
# Cross-tabulate stroke status (rows) by residence type (columns) and convert
# to column proportions, so each residence-type column sums to 1.
tab1 <- prop.table(
  table(stroke_ds$stroke, stroke_ds$Residence_type),
  margin = 2
)
knitr::kable(tab1)
| | Rural | Urban |
|---|---|---|
| No | 0.9546539 | 0.9479969 |
| Yes | 0.0453461 | 0.0520031 |
# Grouped bar chart of the column proportions held in tab1: one pair of bars
# (No / Yes) per residence type.
barplot(
  tab1,
  main = "Stroke History by Residence Type",
  xlab = "Residence type",
  ylab = "Proportion within residence type",
  ylim = c(0, 1),
  beside = TRUE,
  legend.text = rownames(tab1),
  # args.legend must be a list: c() would coerce horiz = TRUE to the
  # character string "TRUE" before it reaches legend().
  args.legend = list(x = "topright", horiz = TRUE, title = "Had stroke")
)

# Descriptive statistics of average glucose level for each stroke group.
tab2 <- stroke_ds %>%
  group_by(stroke) %>%
  summarise(
    Min = min(avg_glucose_level, na.rm = TRUE),
    Q1 = quantile(avg_glucose_level, probs = .25, na.rm = TRUE),
    Median = median(avg_glucose_level, na.rm = TRUE),
    Q3 = quantile(avg_glucose_level, probs = .75, na.rm = TRUE),
    Max = max(avg_glucose_level, na.rm = TRUE),
    Mean = mean(avg_glucose_level, na.rm = TRUE),
    SD = sd(avg_glucose_level, na.rm = TRUE),
    n = n(),
    # Count of missing glucose values per group (excluded from the stats above).
    Missing = sum(is.na(avg_glucose_level))
  )
knitr::kable(tab2)
| stroke | Min | Q1 | Median | Q3 | Max | Mean | SD | n | Missing |
|---|---|---|---|---|---|---|---|---|---|
| No | 55.12 | 77.12 | 91.47 | 112.83 | 267.76 | 104.7955 | 43.84607 | 4861 | 0 |
| Yes | 56.11 | 79.79 | 105.22 | 196.71 | 271.74 | 132.5447 | 61.92106 | 249 | 0 |
# Horizontal box plots of average glucose level, one box per stroke group.
# Because horizontal = TRUE, the x-axis carries glucose and the y-axis the
# stroke groups, which is why xlab/ylab are assigned this way round.
boxplot(
  avg_glucose_level ~ stroke,
  data = stroke_ds,
  # NOTE(review): na.rm is not a documented boxplot() argument - confirm it is needed.
  na.rm = TRUE,
  horizontal = TRUE,
  col = "darkorange2",
  main = "Box Plot of glucose by stroke",
  xlab = "glucose",
  ylab = "stroke"
)
By applying two-sample t-test, we will check if mean difference of average glucose level between stroke and non-stroke patients is statistically significant
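Assumptions: 1) average glucose level is normally distributed within each group; 2) homogeneity of variances (not required by the Welch test used below, which is run with var.equal = FALSE)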
Before the test, we will check the normality and variance-homogeneity assumptions
Check normality using QQ Plot
# Normal QQ plot of average glucose level for patients who had a stroke.
# The vector is piped into qqPlot() (not passed directly) so the plot keeps
# the same default axis label as before.
stroke_yes <- filter(stroke_ds, stroke == "Yes")
stroke_yes$avg_glucose_level %>%
  qqPlot(dist = "norm")
## [1] 194 136
# Normal QQ plot of average glucose level for patients without a stroke.
stroke_no <- filter(stroke_ds, stroke == "No")
stroke_no$avg_glucose_level %>%
  qqPlot(dist = "norm")
## [1] 959 2840
# Two-sided Welch two-sample t-test (var.equal = FALSE) comparing mean average
# glucose level between the "No" and "Yes" stroke groups.
results <- t.test(avg_glucose_level ~ stroke, data = stroke_ds,
                  alternative = "two.sided", var.equal = FALSE)
results
##
## Welch Two Sample t-test
##
## data: avg_glucose_level by stroke
## t = -6.9824, df = 260.89, p-value = 2.401e-11
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -35.57474 -19.92371
## sample estimates:
## mean in group No mean in group Yes
## 104.7955 132.5447
## [1] 2.401437e-11
## [1] -35.57474 -19.92371
## attr(,"conf.level")
## [1] 0.95
Null Hypothesis H0: there is no association between residence type and stroke
Alternative Hypothesis HA: there is an association between residence type and stroke
Assumption: fewer than 25% of cells have expected counts < 5
##
## Rural Urban
## No 2400 2461
## Yes 114 135
##
## Rural Urban
## No 2391.4978 2469.5022
## Yes 122.5022 126.4978
expected counts > 5 in all cells
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(stroke_ds$stroke, stroke_ds$Residence_type)
## X-squared = 1.0816, df = 1, p-value = 0.2983
df = (r-1)(c-1) = (2-1)(2-1) = 1, X-squared = 1.0816
p-value = 0.2983, \(\alpha\)=0.05, p>\(\alpha\)
Fail to reject H0. The Chi-square test of association was not statistically significant.
The results provide no evidence of an association between residence type and stroke