df <- read.csv("E:/Prof.TuanNguyen/BV.ThongNhatDec2025/Stroke.Data.csv")
head(df)
## id gender age hypertension heart.disease ever.married work.type
## 1 67 Female 17 0 0 No Private
## 2 77 Female 13 0 0 No children
## 3 84 Male 55 0 0 Yes Private
## 4 91 Female 42 0 0 No Private
## 5 99 Female 31 0 0 No Private
## 6 121 Female 38 0 0 Yes Private
## Residence.type glucose.level bmi smoking stroke
## 1 Urban 92.97 NA formerly smoked 0
## 2 Rural 85.81 18.6 Unknown 0
## 3 Urban 89.17 31.5 never smoked 0
## 4 Urban 98.53 18.5 never smoked 0
## 5 Urban 108.89 52.3 Unknown 0
## 6 Urban 91.44 NA Unknown 0
df$stroke1 = ifelse(df$stroke == 1,"Yes", "No")
head(df)
## id gender age hypertension heart.disease ever.married work.type
## 1 67 Female 17 0 0 No Private
## 2 77 Female 13 0 0 No children
## 3 84 Male 55 0 0 Yes Private
## 4 91 Female 42 0 0 No Private
## 5 99 Female 31 0 0 No Private
## 6 121 Female 38 0 0 Yes Private
## Residence.type glucose.level bmi smoking stroke stroke1
## 1 Urban 92.97 NA formerly smoked 0 No
## 2 Rural 85.81 18.6 Unknown 0 No
## 3 Urban 89.17 31.5 never smoked 0 No
## 4 Urban 98.53 18.5 never smoked 0 No
## 5 Urban 108.89 52.3 Unknown 0 No
## 6 Urban 91.44 NA Unknown 0 No
library(lessR)
##
## lessR 4.5 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is the default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including modern time series forecasting
## and many, new Plotly interactive visualizations output. Most
## visualization functions are now reorganized to three functions:
## Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
## X(): type="histogram", "density", "vbs" and more
## XY(): type="scatter" for a scatterplot, or "contour", "smooth"
## Most previous function calls still work, such as:
## BarChart(), Histogram, and Plot().
## Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
##
## Interactive data analysis for constructing visualizations.
## Enter: interact()
Histogram(bmi, fill="green", xlab="Body mass index (g)", ylab="Frequency", data=df)
## lessR visualizations are now unified over just three core functions:
## - Chart() for pivot tables, such as bar charts. More info: ?Chart
## - X() for a single variable x, such as histograms. More info: ?X
## - XY() for scatterplots of two variables, x and y. More info: ?XY
##
## Histogram() is deprecated, though still working for now.
## Please use X(..., type = "histogram") going forward.
## [Interactive plot from the Plotly R package (Sievert, 2020)]
## >>> Suggestions
## bin_width: set the width of each bin
## bin_start: set the start of the first bin
## bin_end: set the end of the last bin
## X(bmi, type="density") # smoothed curve + histogram
## X(bmi, type="vbs") # Violin/Box/Scatterplot (VBS) plot
##
## --- bmi ---
##
## n miss mean sd min mdn max
## 4909 201 28.89 7.85 10.30 28.10 97.60
##
##
##
## --- Outliers --- from the box plot: 110
##
## Small Large
## ----- -----
## 97.6
## 92.0
## 78.0
## 71.9
## 66.8
## 64.8
## 64.4
## 63.3
## 61.6
## 61.2
## 60.9
## 60.9
## 60.2
## 59.7
## 58.1
## 57.9
## 57.7
## 57.5
##
## + 92 more outliers
##
##
## Bin Width: 5
## Number of Bins: 18
##
## Bin Midpnt Count Prop Cumul.c Cumul.p
## ---------------------------------------------------
## 10 > 15 12.5 44 0.01 44 0.01
## 15 > 20 17.5 493 0.10 537 0.11
## 20 > 25 22.5 1070 0.22 1607 0.33
## 25 > 30 27.5 1409 0.29 3016 0.61
## 30 > 35 32.5 985 0.20 4001 0.82
## 35 > 40 37.5 500 0.10 4501 0.92
## 40 > 45 42.5 253 0.05 4754 0.97
## 45 > 50 47.5 76 0.02 4830 0.98
## 50 > 55 52.5 46 0.01 4876 0.99
## 55 > 60 57.5 20 0.00 4896 1.00
## 60 > 65 62.5 8 0.00 4904 1.00
## 65 > 70 67.5 1 0.00 4905 1.00
## 70 > 75 72.5 1 0.00 4906 1.00
## 75 > 80 77.5 1 0.00 4907 1.00
## 80 > 85 82.5 0 0.00 4907 1.00
## 85 > 90 87.5 0 0.00 4907 1.00
## 90 > 95 92.5 1 0.00 4908 1.00
## 95 > 100 97.5 1 0.00 4909 1.00
##
library(lessR)
BarChart(work.type, data=df)
## lessR visualizations are now unified over just three core functions:
## - Chart() for pivot tables, such as bar charts. More info: ?Chart
## - X() for a single variable x, such as histograms. More info: ?X
## - XY() for scatterplots of two variables, x and y. More info: ?XY
##
## BarChart() is deprecated, though still working for now.
## Please use Chart(..., type = "bar") going forward.
## [Interactive chart from the Plotly R package (Sievert, 2020)]
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Chart(work.type type="radar") # Plotly radar chart
## Chart(work.type type="treemap") # Plotly treemap chart
## Chart(work.type type="pie") # Plotly pie/sunburst chart
## Chart(work.type type="icicle") # Plotly icicle chart
## Chart(work.type type="bubble") # Plotly bubble chart
##
## --- work.type ---
##
## Missing Values: 0
##
## work.type Count Prop
## ----------------------------
## children 687 0.134
## Govt_job 657 0.129
## Never_worked 22 0.004
## Private 2925 0.572
## Self-employed 819 0.160
## ----------------------------
## Total 5110 1.000
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 4802.415, df = 4, p-value = 0.000
Plot(age, bmi, fit="lm", data=df)
## lessR visualizations are now unified over just three core functions:
## - Chart() for pivot tables, such as bar charts. More info: ?Chart
## - X() for a single variable x, such as histograms. More info: ?X
## - XY() for scatterplots of two variables, x and y. More info: ?XY
##
## Plot() is deprecated, though still working for now.
## Please use X(...) or XY(...) going forward.
## [Interactive chart from the Plotly R package (Sievert, 2020)]
##
##
## >>> Suggestions or enter: style(suggest=FALSE)
## XY(age, bmi, enhance=TRUE) # many options
## XY(age, bmi, color="red") # exterior edge color of points
## XY(age, bmi, out_cut=.10) # label top 10% from center as outliers
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 4909
## Sample Correlation of age and bmi: r = 0.333
##
## Hypothesis Test of 0 Correlation: t = 24.772, df = 4907, p-value = 0.000
## 95% Confidence Interval for Correlation: 0.308 to 0.358
##
##
## Line: b0 = 23.917 b1 = 0.116
## Linear Model MSE = 54.8408 Rsq = 0.111
##
library(ggplot2)
ggplot(df, aes(x = age, y = bmi)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Relationship between Age and BMI",
x = "Age (years)",
y = "Body Mass Index (kg/m²)"
) +
theme_bw(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 201 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 201 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Biểu đồ tương quan tô màu theo giới tính
ggplot(df, aes(x = age, y = bmi, color = gender)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Age vs BMI by Gender",
x = "Age (years)",
y = "BMI (kg/m²)",
color = "Gender"
) +
theme_bw(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 201 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 201 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Mô hình hồi quy logistic (Thầy)
fit = Logit(stroke ~ gender + age, data=df)
##
## >>> Note: gender is not a numeric variable.
## Indicator variables are created and analyzed.
##
## Response Variable: stroke
## Predictor Variable 1: genderMale
## Predictor Variable 2: age
##
## Number of cases (rows) of data: 5110
## Number of cases retained for analysis: 5110
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of stroke for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) -7.2834 0.3418 -21.308 0.000 -7.9533 -6.6134
## genderMale 0.1153 0.1374 0.839 0.401 -0.1539 0.3845
## age 0.0748 0.0049 15.167 0.000 0.0651 0.0844
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 0.0007 0.0004 0.0013
## genderMale 1.1222 0.8573 1.4689
## age 1.0776 1.0673 1.0881
##
##
## -- Model Fit
##
## Null deviance: 1990.373 on 5109 degrees of freedom
## Residual deviance: 1615.595 on 5107 degrees of freedom
##
## AIC: 1621.595
##
## Number of iterations to convergence: 7
##
##
## Collinearity
##
## Tolerance VIF
## genderMale 0.999 1.001
## age 0.999 1.001
##
## ANALYSIS OF RESIDUALS AND INFLUENCE
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [res_rows = 20 out of 5110 cases (rows) of data]
## --------------------------------------------------------------------
## genderMale age stroke P(Y=1) residual rstudent dffits cooks
## 4897 0 1 1 0.0007396 0.9993 3.812 0.06215 0.037826
## 3451 0 14 1 0.0019529 0.9980 3.543 0.07678 0.025275
## 2764 0 32 1 0.0074615 0.9925 3.136 0.09328 0.012387
## 2190 0 38 1 0.0116372 0.9884 2.989 0.09620 0.009255
## 2205 0 38 1 0.0116372 0.9884 2.989 0.09620 0.009255
## 2159 0 39 1 0.0125294 0.9875 2.964 0.09651 0.008790
## 2349 0 39 1 0.0125294 0.9875 2.964 0.09651 0.008790
## 2507 0 39 1 0.0125294 0.9875 2.964 0.09651 0.008790
## 2345 1 42 1 0.0175082 0.9825 2.849 0.10540 0.008086
## 2922 1 43 1 0.0188421 0.9812 2.822 0.10574 0.007692
## 3818 1 45 1 0.0218153 0.9782 2.770 0.10630 0.006953
## 1387 0 45 1 0.0194856 0.9805 2.810 0.09724 0.006341
## 2848 0 45 1 0.0194856 0.9805 2.810 0.09724 0.006341
## 1051 1 47 1 0.0252456 0.9748 2.716 0.10675 0.006278
## 3827 0 46 1 0.0209669 0.9790 2.783 0.09719 0.005989
## 1029 1 48 1 0.0271526 0.9728 2.689 0.10693 0.005964
## 659 1 49 1 0.0291993 0.9708 2.662 0.10710 0.005667
## 4688 0 48 1 0.0242671 0.9757 2.730 0.09696 0.005336
## 4636 1 51 1 0.0337507 0.9662 2.606 0.10742 0.005119
## 4227 0 49 1 0.0261021 0.9739 2.703 0.09679 0.005033
##
##
## PREDICTION
##
## Probability threshold for classification : 0.5
##
##
## Data, Fitted Values, Standard Errors
## [sorted by fitted value]
## [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
## genderMale age stroke label fitted std.err
## 136 0 0 0 0 0.0006864 0.0002345
## 556 0 0 0 0 0.0006864 0.0002345
## 1135 0 0 0 0 0.0006864 0.0002345
## 1597 0 0 0 0 0.0006864 0.0002345
##
## ... for the rows of data where fitted is close to 0.5 ...
##
## genderMale age stroke label fitted std.err
## 4624 1 81 0 0 0.2477 0.02353
## 4885 1 81 0 0 0.2477 0.02353
## 337 1 82 0 0 0.2619 0.02497
## 546 1 82 0 0 0.2619 0.02497
## 1367 1 82 0 0 0.2619 0.02497
##
## ... for the last 4 rows of sorted data ...
##
## genderMale age stroke label fitted std.err
## 4533 1 82 1 0 0.2619 0.02497
## 4799 1 82 0 0 0.2619 0.02497
## 4921 1 82 0 0 0.2619 0.02497
## 4925 1 82 0 0 0.2619 0.02497
## --------------------------------------------------------------------
##
##
## ----------------------------
## Specified confusion matrices
## ----------------------------
##
## Probability threshold for predicting : 0.5
##
## Baseline Predicted
## ---------------------------------------------------
## Total %Tot 0 1 %Correct
## ---------------------------------------------------
## 1 249 4.9 249 0 0.0
## stroke 0 4861 95.1 4861 0 100.0
## ---------------------------------------------------
## Total 5110 95.1
##
## Accuracy: 95.13
## Sensitivity: 0.00
## Precision: NaN
# PHÂN TÍCH MÔ TẢ DÙNG TABLE1
library(table1)
##
## Attaching package: 'table1'
## The following object is masked from 'package:lessR':
##
## label
## The following objects are masked from 'package:base':
##
## units, units<-
table1 (~ gender + age + hypertension + heart.disease + ever.married + work.type + glucose.level + bmi + smoking | stroke1, data = df)
| No (N=4861) |
Yes (N=249) |
Overall (N=5110) |
|
|---|---|---|---|
| gender | |||
| Female | 2853 (58.7%) | 141 (56.6%) | 2994 (58.6%) |
| Male | 2008 (41.3%) | 108 (43.4%) | 2116 (41.4%) |
| age | |||
| Mean (SD) | 42.0 (22.3) | 67.7 (12.7) | 43.2 (22.6) |
| Median [Min, Max] | 43.0 [0.0800, 82.0] | 71.0 [1.32, 82.0] | 45.0 [0.0800, 82.0] |
| hypertension | |||
| Mean (SD) | 0.0889 (0.285) | 0.265 (0.442) | 0.0975 (0.297) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| heart.disease | |||
| Mean (SD) | 0.0471 (0.212) | 0.189 (0.392) | 0.0540 (0.226) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| ever.married | |||
| No | 1728 (35.5%) | 29 (11.6%) | 1757 (34.4%) |
| Yes | 3133 (64.5%) | 220 (88.4%) | 3353 (65.6%) |
| work.type | |||
| children | 685 (14.1%) | 2 (0.8%) | 687 (13.4%) |
| Govt_job | 624 (12.8%) | 33 (13.3%) | 657 (12.9%) |
| Never_worked | 22 (0.5%) | 0 (0%) | 22 (0.4%) |
| Private | 2776 (57.1%) | 149 (59.8%) | 2925 (57.2%) |
| Self-employed | 754 (15.5%) | 65 (26.1%) | 819 (16.0%) |
| glucose.level | |||
| Mean (SD) | 105 (43.8) | 133 (61.9) | 106 (45.3) |
| Median [Min, Max] | 91.5 [55.1, 268] | 105 [56.1, 272] | 91.9 [55.1, 272] |
| bmi | |||
| Mean (SD) | 28.8 (7.91) | 30.5 (6.33) | 28.9 (7.85) |
| Median [Min, Max] | 28.0 [10.3, 97.6] | 29.7 [16.9, 56.6] | 28.1 [10.3, 97.6] |
| Missing | 161 (3.3%) | 40 (16.1%) | 201 (3.9%) |
| smoking | |||
| formerly smoked | 815 (16.8%) | 70 (28.1%) | 885 (17.3%) |
| never smoked | 1802 (37.1%) | 90 (36.1%) | 1892 (37.0%) |
| smokes | 747 (15.4%) | 42 (16.9%) | 789 (15.4%) |
| Unknown | 1497 (30.8%) | 47 (18.9%) | 1544 (30.2%) |
library(ggplot2)
ggplot(df, aes(x = stroke1, y = smoking, color = gender)) +
geom_jitter(width = 0.2, height = 0.2, alpha = 0.6) +
labs(
title = "Stroke vs Smoking by Gender",
x = "Stroke",
y = "Smoking"
) +
theme_bw()
df$stroke1 <- as.factor(df$stroke1) # outcome nhị phân
df$smoking <- as.factor(df$smoking)
df$gender <- as.factor(df$gender)
model <- glm(
stroke1 ~ smoking + gender,
data = df,
family = binomial
)
summary(model)
##
## Call:
## glm(formula = stroke1 ~ smoking + gender, family = binomial,
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.4935 0.1397 -17.850 < 2e-16 ***
## smokingnever smoked -0.5330 0.1655 -3.220 0.00128 **
## smokingsmokes -0.4209 0.2017 -2.087 0.03690 *
## smokingUnknown -1.0063 0.1935 -5.199 2e-07 ***
## genderMale 0.0826 0.1322 0.625 0.53217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1990.4 on 5109 degrees of freedom
## Residual deviance: 1961.9 on 5105 degrees of freedom
## AIC: 1971.9
##
## Number of Fisher Scoring iterations: 6
exp(cbind(
OR = coef(model),
confint(model)
))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.0826214 0.06231184 0.1078079
## smokingnever smoked 0.5868373 0.42480778 0.8137105
## smokingsmokes 0.6564317 0.43906807 0.9702071
## smokingUnknown 0.3655850 0.24885439 0.5324817
## genderMale 1.0861056 0.83680627 1.4059910
library(broom)
library(ggplot2)
or_df <- tidy(model, exponentiate = TRUE, conf.int = TRUE)
or_df <- or_df[or_df$term != "(Intercept)", ]
ggplot(or_df, aes(
x = term,
y = estimate,
ymin = conf.low,
ymax = conf.high
)) +
geom_pointrange() +
geom_hline(yintercept = 1, linetype = "dashed") +
coord_flip() +
labs(
title = "Odds Ratios for Stroke",
y = "Odds Ratio (95% CI)",
x = ""
) +
theme_bw()