NHẬP DỮ LIỆU VÀO R

df <- read.csv("E:/Prof.TuanNguyen/BV.ThongNhatDec2025/Stroke.Data.csv")

head(df)
##    id gender age hypertension heart.disease ever.married work.type
## 1  67 Female  17            0             0           No   Private
## 2  77 Female  13            0             0           No  children
## 3  84   Male  55            0             0          Yes   Private
## 4  91 Female  42            0             0           No   Private
## 5  99 Female  31            0             0           No   Private
## 6 121 Female  38            0             0          Yes   Private
##   Residence.type glucose.level  bmi         smoking stroke
## 1          Urban         92.97   NA formerly smoked      0
## 2          Rural         85.81 18.6         Unknown      0
## 3          Urban         89.17 31.5    never smoked      0
## 4          Urban         98.53 18.5    never smoked      0
## 5          Urban        108.89 52.3         Unknown      0
## 6          Urban         91.44   NA         Unknown      0

TẠO CỘT MỚI STROKE1 DỄ ĐỌC HƠN

df$stroke1 = ifelse(df$stroke == 1,"Yes", "No")

head(df)
##    id gender age hypertension heart.disease ever.married work.type
## 1  67 Female  17            0             0           No   Private
## 2  77 Female  13            0             0           No  children
## 3  84   Male  55            0             0          Yes   Private
## 4  91 Female  42            0             0           No   Private
## 5  99 Female  31            0             0           No   Private
## 6 121 Female  38            0             0          Yes   Private
##   Residence.type glucose.level  bmi         smoking stroke stroke1
## 1          Urban         92.97   NA formerly smoked      0      No
## 2          Rural         85.81 18.6         Unknown      0      No
## 3          Urban         89.17 31.5    never smoked      0      No
## 4          Urban         98.53 18.5    never smoked      0      No
## 5          Urban        108.89 52.3         Unknown      0      No
## 6          Urban         91.44   NA         Unknown      0      No

Biểu đồ Histogram theo BMI

library(lessR)
## 
## lessR 4.5                            feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is the default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including modern time series forecasting
##   and many, new Plotly interactive visualizations output. Most
##   visualization functions are now reorganized to three functions:
##      Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
##      X(): type="histogram", "density", "vbs" and more
##      XY(): type="scatter" for a scatterplot, or "contour", "smooth"
##    Most previous function calls still work, such as:
##      BarChart(), Histogram, and Plot().
##   Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
## 
## Interactive data analysis for constructing visualizations.
##   Enter: interact()
Histogram(bmi, fill="green", xlab="Body mass index (g)", ylab="Frequency", data=df)
## lessR visualizations are now unified over just three core functions:
##   - Chart() for pivot tables, such as bar charts. More info: ?Chart
##   - X() for a single variable x, such as histograms. More info: ?X
##   - XY() for scatterplots of two variables, x and y. More info: ?XY
## 
## Histogram() is deprecated, though still working for now.
## Please use X(..., type = "histogram") going forward.
## [Interactive plot from the Plotly R package (Sievert, 2020)]

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## X(bmi, type="density")  # smoothed curve + histogram 
## X(bmi, type="vbs")  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bmi --- 
##  
##        n     miss     mean       sd      min      mdn      max 
##      4909      201    28.89     7.85    10.30    28.10    97.60 
##  
## 
##   
## --- Outliers ---     from the box plot: 110 
##  
## Small      Large 
## -----      ----- 
##             97.6 
##             92.0 
##             78.0 
##             71.9 
##             66.8 
##             64.8 
##             64.4 
##             63.3 
##             61.6 
##             61.2 
##             60.9 
##             60.9 
##             60.2 
##             59.7 
##             58.1 
##             57.9 
##             57.7 
##             57.5 
## 
## + 92 more outliers 
## 
## 
## Bin Width: 5 
## Number of Bins: 18 
##  
##        Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## --------------------------------------------------- 
##   10 >  15    12.5     44    0.01       44     0.01 
##   15 >  20    17.5    493    0.10      537     0.11 
##   20 >  25    22.5   1070    0.22     1607     0.33 
##   25 >  30    27.5   1409    0.29     3016     0.61 
##   30 >  35    32.5    985    0.20     4001     0.82 
##   35 >  40    37.5    500    0.10     4501     0.92 
##   40 >  45    42.5    253    0.05     4754     0.97 
##   45 >  50    47.5     76    0.02     4830     0.98 
##   50 >  55    52.5     46    0.01     4876     0.99 
##   55 >  60    57.5     20    0.00     4896     1.00 
##   60 >  65    62.5      8    0.00     4904     1.00 
##   65 >  70    67.5      1    0.00     4905     1.00 
##   70 >  75    72.5      1    0.00     4906     1.00 
##   75 >  80    77.5      1    0.00     4907     1.00 
##   80 >  85    82.5      0    0.00     4907     1.00 
##   85 >  90    87.5      0    0.00     4907     1.00 
##   90 >  95    92.5      1    0.00     4908     1.00 
##   95 > 100    97.5      1    0.00     4909     1.00 
## 

Biểu đồ thanh

library(lessR)
BarChart(work.type, data=df)
## lessR visualizations are now unified over just three core functions:
##   - Chart() for pivot tables, such as bar charts. More info: ?Chart
##   - X() for a single variable x, such as histograms. More info: ?X
##   - XY() for scatterplots of two variables, x and y. More info: ?XY
## 
## BarChart() is deprecated, though still working for now.
## Please use Chart(..., type = "bar") going forward.
## [Interactive chart from the Plotly R package (Sievert, 2020)] 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Chart(work.type type="radar")  # Plotly radar chart
## Chart(work.type type="treemap")  # Plotly treemap chart
## Chart(work.type type="pie")  # Plotly pie/sunburst chart
## Chart(work.type type="icicle")  # Plotly icicle chart
## Chart(work.type type="bubble")  # Plotly bubble chart
## 

## --- work.type --- 
## 
## Missing Values: 0 
## 
##     work.type  Count   Prop 
## ---------------------------- 
##      children    687   0.134 
##      Govt_job    657   0.129 
##  Never_worked     22   0.004 
##       Private   2925   0.572 
## Self-employed    819   0.160 
## ---------------------------- 
##         Total   5110   1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 4802.415, df = 4, p-value = 0.000

Biểu đồ tương quan

Plot(age, bmi, fit="lm", data=df)
## lessR visualizations are now unified over just three core functions:
##   - Chart() for pivot tables, such as bar charts. More info: ?Chart
##   - X() for a single variable x, such as histograms. More info: ?X
##   - XY() for scatterplots of two variables, x and y. More info: ?XY
## 
## Plot() is deprecated, though still working for now.
## Please use X(...) or XY(...) going forward.
## [Interactive chart from the Plotly R package (Sievert, 2020)]

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## XY(age, bmi, enhance=TRUE)  # many options
## XY(age, bmi, color="red")  # exterior edge color of points
## XY(age, bmi, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 4909 
## Sample Correlation of age and bmi: r = 0.333 
##   
## Hypothesis Test of 0 Correlation:  t = 24.772,  df = 4907,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.308 to 0.358 
##   
## 
##   Line: b0 = 23.917    b1 = 0.116
##   Linear Model MSE = 54.8408   Rsq = 0.111
## 

Biểu đồ tương quan bằng ggplot2

library(ggplot2)

ggplot(df, aes(x = age, y = bmi)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Relationship between Age and BMI",
    x = "Age (years)",
    y = "Body Mass Index (kg/m²)"
  ) +
  theme_bw(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 201 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 201 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Biểu đồ tương quan tô màu theo giới tính

ggplot(df, aes(x = age, y = bmi, color = gender)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Age vs BMI by Gender",
    x = "Age (years)",
    y = "BMI (kg/m²)",
    color = "Gender"
  ) +
  theme_bw(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 201 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 201 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Mô hình hồi quy logistic (Thầy)

fit = Logit(stroke ~ gender + age, data=df)
## 
## >>> Note:  gender is not a numeric variable.
##            Indicator variables are created and analyzed.
## 
## Response Variable:   stroke
## Predictor Variable 1:  genderMale
## Predictor Variable 2:  age
## 
## Number of cases (rows) of data:  5110 
## Number of cases retained for analysis:  5110 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of stroke for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -7.2834     0.3418  -21.308    0.000     -7.9533     -6.6134 
##  genderMale    0.1153     0.1374    0.839    0.401     -0.1539      0.3845 
##         age    0.0748     0.0049   15.167    0.000      0.0651      0.0844 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.0007      0.0004      0.0013 
##  genderMale      1.1222      0.8573      1.4689 
##         age      1.0776      1.0673      1.0881 
## 
## 
## -- Model Fit
## 
##     Null deviance: 1990.373 on 5109 degrees of freedom
## Residual deviance: 1615.595 on 5107 degrees of freedom
## 
## AIC: 1621.595 
## 
## Number of iterations to convergence: 7 
## 
## 
## Collinearity
## 
##            Tolerance       VIF
## genderMale     0.999     1.001
## age            0.999     1.001
## 
##    ANALYSIS OF RESIDUALS AND INFLUENCE 
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
##    [sorted by Cook's Distance]
##    [res_rows = 20 out of 5110 cases (rows) of data]
## --------------------------------------------------------------------
##      genderMale age stroke    P(Y=1) residual rstudent  dffits    cooks
## 4897          0   1      1 0.0007396   0.9993    3.812 0.06215 0.037826
## 3451          0  14      1 0.0019529   0.9980    3.543 0.07678 0.025275
## 2764          0  32      1 0.0074615   0.9925    3.136 0.09328 0.012387
## 2190          0  38      1 0.0116372   0.9884    2.989 0.09620 0.009255
## 2205          0  38      1 0.0116372   0.9884    2.989 0.09620 0.009255
## 2159          0  39      1 0.0125294   0.9875    2.964 0.09651 0.008790
## 2349          0  39      1 0.0125294   0.9875    2.964 0.09651 0.008790
## 2507          0  39      1 0.0125294   0.9875    2.964 0.09651 0.008790
## 2345          1  42      1 0.0175082   0.9825    2.849 0.10540 0.008086
## 2922          1  43      1 0.0188421   0.9812    2.822 0.10574 0.007692
## 3818          1  45      1 0.0218153   0.9782    2.770 0.10630 0.006953
## 1387          0  45      1 0.0194856   0.9805    2.810 0.09724 0.006341
## 2848          0  45      1 0.0194856   0.9805    2.810 0.09724 0.006341
## 1051          1  47      1 0.0252456   0.9748    2.716 0.10675 0.006278
## 3827          0  46      1 0.0209669   0.9790    2.783 0.09719 0.005989
## 1029          1  48      1 0.0271526   0.9728    2.689 0.10693 0.005964
## 659           1  49      1 0.0291993   0.9708    2.662 0.10710 0.005667
## 4688          0  48      1 0.0242671   0.9757    2.730 0.09696 0.005336
## 4636          1  51      1 0.0337507   0.9662    2.606 0.10742 0.005119
## 4227          0  49      1 0.0261021   0.9739    2.703 0.09679 0.005033
## 
## 
##    PREDICTION 
## 
## Probability threshold for classification : 0.5
## 
## 
## Data, Fitted Values, Standard Errors
##    [sorted by fitted value]
##    [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
##      genderMale age stroke label    fitted   std.err
## 136           0   0      0     0 0.0006864 0.0002345
## 556           0   0      0     0 0.0006864 0.0002345
## 1135          0   0      0     0 0.0006864 0.0002345
## 1597          0   0      0     0 0.0006864 0.0002345
## 
## ... for the rows of data where fitted is close to 0.5 ...
## 
##      genderMale age stroke label fitted std.err
## 4624          1  81      0     0 0.2477 0.02353
## 4885          1  81      0     0 0.2477 0.02353
## 337           1  82      0     0 0.2619 0.02497
## 546           1  82      0     0 0.2619 0.02497
## 1367          1  82      0     0 0.2619 0.02497
## 
## ... for the last 4 rows of sorted data ...
## 
##      genderMale age stroke label fitted std.err
## 4533          1  82      1     0 0.2619 0.02497
## 4799          1  82      0     0 0.2619 0.02497
## 4921          1  82      0     0 0.2619 0.02497
## 4925          1  82      0     0 0.2619 0.02497
## --------------------------------------------------------------------
## 
## 
## ----------------------------
## Specified confusion matrices
## ----------------------------
## 
## Probability threshold for predicting : 0.5
## 
##                Baseline         Predicted 
## ---------------------------------------------------
##               Total  %Tot        0      1  %Correct 
## ---------------------------------------------------
##          1      249   4.9      249      0     0.0 
## stroke   0     4861  95.1     4861      0     100.0 
## ---------------------------------------------------
##        Total   5110                           95.1 
## 
## Accuracy: 95.13 
## Sensitivity: 0.00 
## Precision: NaN

# PHÂN TÍCH MÔ TẢ DÙNG TABLE1

library(table1)
## 
## Attaching package: 'table1'
## The following object is masked from 'package:lessR':
## 
##     label
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1 (~ gender + age + hypertension + heart.disease + ever.married + work.type + glucose.level + bmi + smoking | stroke1, data = df)
No
(N=4861)
Yes
(N=249)
Overall
(N=5110)
gender
Female 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
Male 2008 (41.3%) 108 (43.4%) 2116 (41.4%)
age
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
heart.disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
ever.married
No 1728 (35.5%) 29 (11.6%) 1757 (34.4%)
Yes 3133 (64.5%) 220 (88.4%) 3353 (65.6%)
work.type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
glucose.level
Mean (SD) 105 (43.8) 133 (61.9) 106 (45.3)
Median [Min, Max] 91.5 [55.1, 268] 105 [56.1, 272] 91.9 [55.1, 272]
bmi
Mean (SD) 28.8 (7.91) 30.5 (6.33) 28.9 (7.85)
Median [Min, Max] 28.0 [10.3, 97.6] 29.7 [16.9, 56.6] 28.1 [10.3, 97.6]
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)
smoking
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)

Vẽ mối tương quan giữa Độtquỵ và thuốc lá với giới tính

library(ggplot2)

ggplot(df, aes(x = stroke1, y = smoking, color = gender)) +
  geom_jitter(width = 0.2, height = 0.2, alpha = 0.6) +
  labs(
    title = "Stroke vs Smoking by Gender",
    x = "Stroke",
    y = "Smoking"
  ) +
  theme_bw()

PHÂN TÍCH HỒI QUY logistic (self)

Chuẩn hóa biến (rất quan trọng)

df$stroke1  <- as.factor(df$stroke1)   # outcome nhị phân
df$smoking  <- as.factor(df$smoking)
df$gender   <- as.factor(df$gender)

Logistic regression

model <- glm(
  stroke1 ~ smoking + gender,
  data = df,
  family = binomial
)

Xem kết quả

summary(model)
## 
## Call:
## glm(formula = stroke1 ~ smoking + gender, family = binomial, 
##     data = df)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -2.4935     0.1397 -17.850  < 2e-16 ***
## smokingnever smoked  -0.5330     0.1655  -3.220  0.00128 ** 
## smokingsmokes        -0.4209     0.2017  -2.087  0.03690 *  
## smokingUnknown       -1.0063     0.1935  -5.199    2e-07 ***
## genderMale            0.0826     0.1322   0.625  0.53217    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1990.4  on 5109  degrees of freedom
## Residual deviance: 1961.9  on 5105  degrees of freedom
## AIC: 1971.9
## 
## Number of Fisher Scoring iterations: 6

Odds Ratio + 95% CI

exp(cbind(
  OR = coef(model),
  confint(model)
))
## Waiting for profiling to be done...
##                            OR      2.5 %    97.5 %
## (Intercept)         0.0826214 0.06231184 0.1078079
## smokingnever smoked 0.5868373 0.42480778 0.8137105
## smokingsmokes       0.6564317 0.43906807 0.9702071
## smokingUnknown      0.3655850 0.24885439 0.5324817
## genderMale          1.0861056 0.83680627 1.4059910

Vẽ biểu đồ diễn giải kết quả

library(broom)
library(ggplot2)

or_df <- tidy(model, exponentiate = TRUE, conf.int = TRUE)

or_df <- or_df[or_df$term != "(Intercept)", ]

ggplot(or_df, aes(
  x = term,
  y = estimate,
  ymin = conf.low,
  ymax = conf.high
)) +
  geom_pointrange() +
  geom_hline(yintercept = 1, linetype = "dashed") +
  coord_flip() +
  labs(
    title = "Odds Ratios for Stroke",
    y = "Odds Ratio (95% CI)",
    x = ""
  ) +
  theme_bw()