Load required packages

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(table1)
## Warning: package 'table1' was built under R version 4.5.2
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(gtsummary)
library(boot)
## Warning: package 'boot' was built under R version 4.5.2
library(lessR)
## Warning: package 'lessR' was built under R version 4.5.2
## 
## lessR 4.5                            feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is the default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including modern time series forecasting
##   and many, new Plotly interactive visualizations output. Most
##   visualization functions are now reorganized to three functions:
##      Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
##      X(): type="histogram", "density", "vbs" and more
##      XY(): type="scatter" for a scatterplot, or "contour", "smooth"
##    Most previous function calls still work, such as:
##      BarChart(), Histogram, and Plot().
##   Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
## 
## Interactive data analysis for constructing visualizations.
##   Enter: interact()
## 
## 
## Attaching package: 'lessR'
## 
## The following objects are masked from 'package:dplyr':
## 
##     order_by, recode, rename
## 
## The following object is masked from 'package:table1':
## 
##     label

Import data

# Load the stroke dataset and convert its categorical columns to factors.
# - Every text column becomes a factor.
# - hypertension, stroke and heart_disease hold 0/1 codes and id is an
#   identifier, so they are treated as categorical rather than numeric.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable;
# across(where(...)) replaces the superseded mutate_if().
th1 <- read.csv("Stroke Data.csv", header = TRUE) %>%
  mutate(
    across(where(is.character), factor),
    across(c(hypertension, stroke, heart_disease, id), as.factor)
  )

Explore data

# 4.1 How many variables and observations are there?
dim(th1)
## [1] 5110   12
## Data th1 has 5110 obs and 12 variables

# 4.2 List the first 10 observations of the data.
head(th1,10)
##       id gender age hypertension heart_disease ever_married     work_type
## 1   9046   Male  67            0             1          Yes       Private
## 2  51676 Female  61            0             0          Yes Self-employed
## 3  31112   Male  80            0             1          Yes       Private
## 4  60182 Female  49            0             0          Yes       Private
## 5   1665 Female  79            1             0          Yes Self-employed
## 6  56669   Male  81            0             0          Yes       Private
## 7  53882   Male  74            1             1          Yes       Private
## 8  10434 Female  69            0             0           No       Private
## 9  27419 Female  59            0             0          Yes       Private
## 10 60491 Female  78            0             0          Yes       Private
##    Residence_type avg_glucose_level  bmi  smoking_status stroke
## 1           Urban            228.69 36.6 formerly smoked      1
## 2           Rural            202.21   NA    never smoked      1
## 3           Rural            105.92 32.5    never smoked      1
## 4           Urban            171.23 34.4          smokes      1
## 5           Rural            174.12 24.0    never smoked      1
## 6           Urban            186.21 29.0 formerly smoked      1
## 7           Rural             70.09 27.4    never smoked      1
## 8           Urban             94.39 22.8    never smoked      1
## 9           Rural             76.15   NA         Unknown      1
## 10          Urban             58.57 24.2         Unknown      1
# 4.3 List the last 6 observations of the data
tail(th1)
##         id gender age hypertension heart_disease ever_married     work_type
## 5105 14180 Female  13            0             0           No      children
## 5106 18234 Female  80            1             0          Yes       Private
## 5107 44873 Female  81            0             0          Yes Self-employed
## 5108 19723 Female  35            0             0          Yes Self-employed
## 5109 37544   Male  51            0             0          Yes       Private
## 5110 44679 Female  44            0             0          Yes      Govt_job
##      Residence_type avg_glucose_level  bmi  smoking_status stroke
## 5105          Rural            103.08 18.6         Unknown      0
## 5106          Urban             83.75   NA    never smoked      0
## 5107          Urban            125.20 40.0    never smoked      0
## 5108          Rural             82.99 30.6    never smoked      0
## 5109          Rural            166.29 25.6 formerly smoked      0
## 5110          Urban             85.28 26.2         Unknown      0
# 4.4 Summarize the data with the summary function
summary(th1)
##        id          gender          age        hypertension heart_disease
##  67     :   1   Female:2994   Min.   : 0.08   0:4612       0:4834       
##  77     :   1   Male  :2115   1st Qu.:25.00   1: 498       1: 276       
##  84     :   1   Other :   1   Median :45.00                             
##  91     :   1                 Mean   :43.23                             
##  99     :   1                 3rd Qu.:61.00                             
##  121    :   1                 Max.   :82.00                             
##  (Other):5104                                                           
##  ever_married         work_type    Residence_type avg_glucose_level
##  No :1757     children     : 687   Rural:2514     Min.   : 55.12   
##  Yes:3353     Govt_job     : 657   Urban:2596     1st Qu.: 77.25   
##               Never_worked :  22                  Median : 91.89   
##               Private      :2925                  Mean   :106.15   
##               Self-employed: 819                  3rd Qu.:114.09   
##                                                   Max.   :271.74   
##                                                                    
##       bmi                smoking_status stroke  
##  Min.   :10.30   formerly smoked: 885   0:4861  
##  1st Qu.:23.50   never smoked   :1892   1: 249  
##  Median :28.10   smokes         : 789           
##  Mean   :28.89   Unknown        :1544           
##  3rd Qu.:33.10                                  
##  Max.   :97.60                                  
##  NA's   :201
# Compact per-column overview: column name, type, and leading values
glimpse(th1)
## Rows: 5,110
## Columns: 12
## $ id                <fct> 9046, 51676, 31112, 60182, 1665, 56669, 53882, 10434…
## $ gender            <fct> Male, Female, Male, Female, Female, Male, Male, Fema…
## $ age               <dbl> 67, 61, 80, 49, 79, 81, 74, 69, 59, 78, 81, 61, 54, …
## $ hypertension      <fct> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1…
## $ heart_disease     <fct> 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0…
## $ ever_married      <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes…
## $ work_type         <fct> Private, Self-employed, Private, Private, Self-emplo…
## $ Residence_type    <fct> Urban, Rural, Rural, Urban, Rural, Urban, Rural, Urb…
## $ avg_glucose_level <dbl> 228.69, 202.21, 105.92, 171.23, 174.12, 186.21, 70.0…
## $ bmi               <dbl> 36.6, NA, 32.5, 34.4, 24.0, 29.0, 27.4, 22.8, NA, 24…
## $ smoking_status    <fct> formerly smoked, never smoked, never smoked, smokes,…
## $ stroke            <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…

Edit data

Mã hóa biến sex từ biến gender

# Derive `sex`, a factor-coded copy of `gender`:
#   "Female" -> "0", "Male" -> "1", "Other" -> "2".
# A gender value outside these three matches no branch and becomes NA.
th2 <- mutate(
  th1,
  sex = as.factor(
    case_when(
      gender == "Female" ~ "0",
      gender == "Male" ~ "1",
      gender == "Other" ~ "2"
    )
  )
)

Mã hóa biến bmi_cut từ biến bmi

# Bin `bmi` into weight categories, stored as the character column `bmi_cut`:
#   < 18.5 underweight, [18.5, 25) normal, [25, 30) overweight, >= 30 obese.
# case_when() uses the first matching branch, so each later branch only needs
# its upper bound. Rows with a missing bmi match no branch and stay NA.
th3 <- mutate(
  th2,
  bmi_cut = case_when(
    bmi < 18.5 ~ "underweight",
    bmi < 25 ~ "normal",
    bmi < 30 ~ "overweight",
    bmi >= 30 ~ "obese"
  )
)

Mô tả đặc điểm các biến

# Descriptive statistics for the demographic and clinical variables,
# stratified by stroke status (the rendered table also has an Overall column).
table1(
  ~ age + gender + hypertension +
    heart_disease + ever_married +
    smoking_status + Residence_type +
    bmi + sex + work_type | stroke,
  data = th3
)
0
(N=4861)
1
(N=249)
Overall
(N=5110)
age
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
gender
Female 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
Male 2007 (41.3%) 108 (43.4%) 2115 (41.4%)
Other 1 (0.0%) 0 (0%) 1 (0.0%)
hypertension
0 4429 (91.1%) 183 (73.5%) 4612 (90.3%)
1 432 (8.9%) 66 (26.5%) 498 (9.7%)
heart_disease
0 4632 (95.3%) 202 (81.1%) 4834 (94.6%)
1 229 (4.7%) 47 (18.9%) 276 (5.4%)
ever_married
No 1728 (35.5%) 29 (11.6%) 1757 (34.4%)
Yes 3133 (64.5%) 220 (88.4%) 3353 (65.6%)
smoking_status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)
Residence_type
Rural 2400 (49.4%) 114 (45.8%) 2514 (49.2%)
Urban 2461 (50.6%) 135 (54.2%) 2596 (50.8%)
bmi
Mean (SD) 28.8 (7.91) 30.5 (6.33) 28.9 (7.85)
Median [Min, Max] 28.0 [10.3, 97.6] 29.7 [16.9, 56.6] 28.1 [10.3, 97.6]
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)
sex
0 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
1 2007 (41.3%) 108 (43.4%) 2115 (41.4%)
2 1 (0.0%) 0 (0%) 1 (0.0%)
work_type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
# Risk-factor summary by stroke status, including the binned BMI categories.
# The table object is stored in `a` and then displayed.
a <- table1(
  ~ hypertension + heart_disease + smoking_status + bmi + bmi_cut | stroke,
  data = th3
)
a
0
(N=4861)
1
(N=249)
Overall
(N=5110)
hypertension
0 4429 (91.1%) 183 (73.5%) 4612 (90.3%)
1 432 (8.9%) 66 (26.5%) 498 (9.7%)
heart_disease
0 4632 (95.3%) 202 (81.1%) 4834 (94.6%)
1 229 (4.7%) 47 (18.9%) 276 (5.4%)
smoking_status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)
bmi
Mean (SD) 28.8 (7.91) 30.5 (6.33) 28.9 (7.85)
Median [Min, Max] 28.0 [10.3, 97.6] 29.7 [16.9, 56.6] 28.1 [10.3, 97.6]
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)
bmi_cut
normal 1208 (24.9%) 35 (14.1%) 1243 (24.3%)
obese 1822 (37.5%) 98 (39.4%) 1920 (37.6%)
overweight 1334 (27.4%) 75 (30.1%) 1409 (27.6%)
underweight 336 (6.9%) 1 (0.4%) 337 (6.6%)
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)