MATH1324 Applied Analytics Assignment 2

Statistical Analysis Project of Stroke Data

Jimmy Chan

Last updated: 1 November 2025

Introduction

Problem Statement

Data

Data Cont.

# Load the raw stroke data set (path is relative to the working directory).
dat_stroke <- read_csv("stroke_dataset.csv")

# Keep only the three variables used in this analysis. Columns are selected
# by name rather than by position (8, 9, 12) so the subset stays correct even
# if the column order of the CSV changes.
stroke_ds <- dat_stroke[, c("Residence_type", "avg_glucose_level", "stroke")]

Check data frame structure

# Show the column types and dimensions of the analysis subset.
stroke_ds %>%
  as.data.frame() %>%
  str()
## 'data.frame':    5110 obs. of  3 variables:
##  $ Residence_type   : chr  "Urban" "Rural" "Rural" "Urban" ...
##  $ avg_glucose_level: num  229 202 106 171 174 ...
##  $ stroke           : num  1 1 1 1 1 1 1 1 1 1 ...

Data Cont.

Check missing values

# Count the missing values in every column of the analysis subset.
stroke_ds %>%
  is.na() %>%
  colSums()
##    Residence_type avg_glucose_level            stroke 
##                 0                 0                 0

Convert to factors

# Recode residence type as a two-level factor (Rural first, then Urban).
stroke_ds[["Residence_type"]] <- factor(
  stroke_ds[["Residence_type"]],
  levels = c("Rural", "Urban")
)
levels(stroke_ds[["Residence_type"]])
## [1] "Rural" "Urban"
# Recode the 0/1 stroke indicator as a labelled No/Yes factor.
stroke_ds[["stroke"]] <- factor(
  stroke_ds[["stroke"]],
  levels = c(0, 1),
  labels = c("No", "Yes")
)
levels(stroke_ds[["stroke"]])
## [1] "No"  "Yes"

Descriptive Statistics and Visualisation

# Cross-tabulate stroke by residence type and convert the counts to
# column-wise proportions (each residence type sums to 1).
tab1 <- prop.table(
  table(stroke_ds$stroke, stroke_ds$Residence_type),
  margin = 2
)
knitr::kable(tab1)
Rural Urban
No 0.9546539 0.9479969
Yes 0.0453461 0.0520031
# Grouped bar chart of the within-residence-type proportions held in tab1.
barplot(
  tab1,
  beside = TRUE,
  ylim = c(0, 1),
  main = "Stroke History by Residence Type",
  xlab = "Residence type",
  ylab = "Proportion within residence type",
  # Spell out `legend.text` instead of relying on partial matching of `legend`.
  legend.text = rownames(tab1),
  # args.legend must be a list; c() would coerce TRUE to the string "TRUE".
  args.legend = list(x = "topright", horiz = TRUE, title = "Had stroke")
)

Descriptive Statistics Cont.

# Summary statistics of average glucose level for each stroke group:
# five-number summary, mean, standard deviation, group size and missing count.
# The result is assigned with a leading `<-` so the target name is visible
# at the start of the pipeline rather than hidden after its last line.
tab2 <- stroke_ds %>%
  group_by(stroke) %>%
  summarise(
    Min = min(avg_glucose_level, na.rm = TRUE),
    Q1 = quantile(avg_glucose_level, probs = .25, na.rm = TRUE),
    Median = median(avg_glucose_level, na.rm = TRUE),
    Q3 = quantile(avg_glucose_level, probs = .75, na.rm = TRUE),
    Max = max(avg_glucose_level, na.rm = TRUE),
    Mean = mean(avg_glucose_level, na.rm = TRUE),
    SD = sd(avg_glucose_level, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(avg_glucose_level))
  )

knitr::kable(tab2)
stroke Min Q1 Median Q3 Max Mean SD n Missing
No 55.12 77.12 91.47 112.83 267.76 104.7955 43.84607 4861 0
Yes 56.11 79.79 105.22 196.71 271.74 132.5447 61.92106 249 0

Descriptive Statistics Cont.

# Horizontal box plots of average glucose level, one box per stroke group.
# The data frame is passed directly via `data =` (same call style as the
# leveneTest() and t.test() calls in this file) instead of piping into `.`.
# `na.rm` was dropped: it is not a boxplot() argument and was only being
# forwarded through `...`.
boxplot(
  avg_glucose_level ~ stroke,
  data = stroke_ds,
  main = "Box Plot of glucose by stroke",
  xlab = "glucose",
  ylab = "stroke",
  horizontal = TRUE,
  col = "darkorange2"
)

Hypothesis Testing and Confidence Interval

# QQ plot for stroke = Yes
# Checks the glucose values of the stroke group against a normal distribution.
stroke_yes <- stroke_ds %>% filter(stroke == "Yes")
qqPlot(
  stroke_yes$avg_glucose_level,
  # Full argument name; `dist` only worked through partial matching.
  distribution = "norm",
  # Explicit label so the axis title does not depend on how the call's
  # first argument happens to deparse.
  ylab = "Average glucose level (stroke = Yes)"
)

## [1] 194 136

Hypothesis Testing and CI Cont.

# QQ plot for stroke = No
# Checks the glucose values of the no-stroke group against a normal
# distribution.
stroke_no <- stroke_ds %>% filter(stroke == "No")
qqPlot(
  stroke_no$avg_glucose_level,
  # Full argument name; `dist` only worked through partial matching.
  distribution = "norm",
  # Explicit label so the axis title does not depend on how the call's
  # first argument happens to deparse.
  ylab = "Average glucose level (stroke = No)"
)

## [1]  959 2840

Hypothesis Testing and CI Cont.

# Levene's test of glucose level across the two stroke groups; its result
# informs the `var.equal` choice in the t-test that follows.
leveneTest(y = avg_glucose_level ~ stroke, data = stroke_ds)

Hypothesis Testing and CI Cont.

# Two-sided two-sample t-test of mean glucose level by stroke group, without
# assuming equal variances (Welch). The fitted test object is kept in
# `results` so individual components can be extracted afterwards.
results <- t.test(avg_glucose_level ~ stroke, data = stroke_ds,
                  alternative = "two.sided", var.equal = FALSE)
print(results)
## 
##  Welch Two Sample t-test
## 
## data:  avg_glucose_level by stroke
## t = -6.9824, df = 260.89, p-value = 2.401e-11
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -35.57474 -19.92371
## sample estimates:
##  mean in group No mean in group Yes 
##          104.7955          132.5447

Hypothesis Testing and CI Cont.

# p-value of the Welch t-test.
results[["p.value"]]
## [1] 2.401437e-11
# Confidence interval for the difference in means (No minus Yes).
results[["conf.int"]]
## [1] -35.57474 -19.92371
## attr(,"conf.level")
## [1] 0.95

Categorical association

Null Hypothesis H0: residence type and stroke are independent (there is no association between them)

Alternative Hypothesis HA: residence type and stroke are not independent (there is an association between them)

Assumption: fewer than 25% of cells have an expected count below 5

# Chi-squared test of association between stroke and residence type.
# The table() call is kept inline so the test's printed "data:" line is
# unchanged.
chi2 <- chisq.test(
  table(stroke_ds$stroke, stroke_ds$Residence_type)
)
# Observed cell counts.
chi2[["observed"]]
##      
##       Rural Urban
##   No   2400  2461
##   Yes   114   135
# Expected cell counts under independence (used to check the assumption).
chi2[["expected"]]
##      
##           Rural     Urban
##   No  2391.4978 2469.5022
##   Yes  122.5022  126.4978

Expected counts are greater than 5 in all cells, so the assumption is met.

Categorical association Cont.

# Display the chi-squared test result (statistic, degrees of freedom, p-value).
print(chi2)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(stroke_ds$stroke, stroke_ds$Residence_type)
## X-squared = 1.0816, df = 1, p-value = 0.2983

Discussion

Discussion Cont.

References