R Markdown

Đây là một phiên thực hành R trên R Markdown. File này sẽ đọc dữ liệu vào R và thực hiện một số phân tích căn bản.

Bắt đầu đọc dữ liệu vào R

library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Read the stroke dataset from a local CSV file into the data frame `df`.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this file on another computer.
df <- read.csv("/Users/121493/Dropbox/_Conferences and Workshops/BV Thong Nhat/Tháng 12-2025/Dataset/Stroke Data.csv")

# Show the first 10 rows
head(df, 10)
##     id gender age hypertension heart.disease ever.married work.type
## 1   67 Female  17            0             0           No   Private
## 2   77 Female  13            0             0           No  children
## 3   84   Male  55            0             0          Yes   Private
## 4   91 Female  42            0             0           No   Private
## 5   99 Female  31            0             0           No   Private
## 6  121 Female  38            0             0          Yes   Private
## 7  129 Female  24            0             0           No   Private
## 8  132 Female  80            0             0          Yes  Govt_job
## 9  156 Female  33            0             0          Yes   Private
## 10 163 Female  20            0             0           No   Private
##    Residence.type glucose.level  bmi         smoking stroke
## 1           Urban         92.97   NA formerly smoked      0
## 2           Rural         85.81 18.6         Unknown      0
## 3           Urban         89.17 31.5    never smoked      0
## 4           Urban         98.53 18.5    never smoked      0
## 5           Urban        108.89 52.3         Unknown      0
## 6           Urban         91.44   NA         Unknown      0
## 7           Urban         97.55 26.2    never smoked      0
## 8           Urban         84.86   NA         Unknown      0
## 9           Rural         86.97 42.2    never smoked      0
## 10          Rural         94.67 28.8         Unknown      0
# Mã hoá 
library(lessR)
## 
## lessR 4.3.9                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## Attaching package: 'lessR'
## The following objects are masked from 'package:Hmisc':
## 
##     label, Merge
## The following object is masked from 'package:base':
## 
##     sort_by
# Recode the 0/1 indicator columns as No/Yes factors
yes_no_vars <- c("stroke", "hypertension", "heart.disease")
df[yes_no_vars] <- lapply(
  df[yes_no_vars],
  factor,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

# Convert the remaining categorical columns to factors
category_vars <- c(
  "gender", "ever.married", "work.type", "Residence.type", "smoking"
)
df[category_vars] <- lapply(df[category_vars], as.factor)

# Attach a display label to each variable (column name -> label text)
display_labels <- c(
  gender = "Gender",
  age = "Age (years)",
  hypertension = "Hypertension",
  heart.disease = "Heart Disease",
  ever.married = "Ever Married",
  work.type = "Work Type",
  Residence.type = "Residence Type",
  glucose.level = "Glucose Level (mg/dL)",
  bmi = "BMI (kg/m²)",
  smoking = "Smoking Status"
)
for (var_name in names(display_labels)) {
  # `[[` drops the name so the label is stored as a plain string
  label(df[[var_name]]) <- display_labels[[var_name]]
}

# Check the recoded data
head(df)
##    id gender age hypertension heart.disease ever.married work.type
## 1  67 Female  17           No            No           No   Private
## 2  77 Female  13           No            No           No  children
## 3  84   Male  55           No            No          Yes   Private
## 4  91 Female  42           No            No           No   Private
## 5  99 Female  31           No            No           No   Private
## 6 121 Female  38           No            No          Yes   Private
##   Residence.type glucose.level  bmi         smoking stroke
## 1          Urban         92.97   NA formerly smoked     No
## 2          Rural         85.81 18.6         Unknown     No
## 3          Urban         89.17 31.5    never smoked     No
## 4          Urban         98.53 18.5    never smoked     No
## 5          Urban        108.89 52.3         Unknown     No
## 6          Urban         91.44   NA         Unknown     No

Phân tích mô tả dùng table1

library(table1)
## 
## Attaching package: 'table1'
## The following object is masked from 'package:lessR':
## 
##     label
## The following objects are masked from 'package:Hmisc':
## 
##     label, label<-, units
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive summary of the whole dataset
table1(
  ~ age + gender + hypertension + heart.disease + ever.married + work.type +
    Residence.type + glucose.level + bmi + smoking,
  data = df
)
Overall
(N=5110)
Age (years)
Mean (SD) 43.2 (22.6)
Median [Min, Max] 45.0 [0.0800, 82.0]
Gender
Female 2994 (58.6%)
Male 2116 (41.4%)
Hypertension
No 4612 (90.3%)
Yes 498 (9.7%)
Heart Disease
No 4834 (94.6%)
Yes 276 (5.4%)
Ever Married
No 1757 (34.4%)
Yes 3353 (65.6%)
Work Type
children 687 (13.4%)
Govt_job 657 (12.9%)
Never_worked 22 (0.4%)
Private 2925 (57.2%)
Self-employed 819 (16.0%)
Residence Type
Rural 2514 (49.2%)
Urban 2596 (50.8%)
Glucose Level (mg/dL)
Mean (SD) 106 (45.3)
Median [Min, Max] 91.9 [55.1, 272]
BMI (kg/m²)
Mean (SD) 28.9 (7.85)
Median [Min, Max] 28.1 [10.3, 97.6]
Missing 201 (3.9%)
Smoking Status
formerly smoked 885 (17.3%)
never smoked 1892 (37.0%)
smokes 789 (15.4%)
Unknown 1544 (30.2%)
# Descriptive summary stratified by stroke status
table1(
  ~ age + gender + hypertension + heart.disease + ever.married + work.type +
    Residence.type + glucose.level + bmi + smoking | stroke,
  data = df
)
No
(N=4861)
Yes
(N=249)
Overall
(N=5110)
Age (years)
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
Gender
Female 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
Male 2008 (41.3%) 108 (43.4%) 2116 (41.4%)
Hypertension
No 4429 (91.1%) 183 (73.5%) 4612 (90.3%)
Yes 432 (8.9%) 66 (26.5%) 498 (9.7%)
Heart Disease
No 4632 (95.3%) 202 (81.1%) 4834 (94.6%)
Yes 229 (4.7%) 47 (18.9%) 276 (5.4%)
Ever Married
No 1728 (35.5%) 29 (11.6%) 1757 (34.4%)
Yes 3133 (64.5%) 220 (88.4%) 3353 (65.6%)
Work Type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
Residence Type
Rural 2400 (49.4%) 114 (45.8%) 2514 (49.2%)
Urban 2461 (50.6%) 135 (54.2%) 2596 (50.8%)
Glucose Level (mg/dL)
Mean (SD) 105 (43.8) 133 (61.9) 106 (45.3)
Median [Min, Max] 91.5 [55.1, 268] 105 [56.1, 272] 91.9 [55.1, 272]
BMI (kg/m²)
Mean (SD) 28.8 (7.91) 30.5 (6.33) 28.9 (7.85)
Median [Min, Max] 28.0 [10.3, 97.6] 29.7 [16.9, 56.6] 28.1 [10.3, 97.6]
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)
Smoking Status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)

Hiển thị dữ liệu dùng lessR

# Histogram of the bmi distribution
Histogram(bmi, data = df)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(bmi, density=TRUE)  # smoothed curve + histogram 
## Plot(bmi)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bmi --- 
##  
##        n     miss     mean       sd      min      mdn      max 
##      4909      201    28.89     7.85    10.30    28.10    97.60 
## 
##   
## --- Outliers ---     from the box plot: 110 
##  
## Small      Large 
## -----      ----- 
##             97.6 
##             92.0 
##             78.0 
##             71.9 
##             66.8 
##             64.8 
##             64.4 
##             63.3 
##             61.6 
##             61.2 
##             60.9 
##             60.9 
##             60.2 
##             59.7 
##             58.1 
##             57.9 
##             57.7 
##             57.5 
## 
## + 92 more outliers 
## 
## 
## Bin Width: 5 
## Number of Bins: 18 
##  
##        Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## --------------------------------------------------- 
##   10 >  15    12.5     44    0.01       44     0.01 
##   15 >  20    17.5    493    0.10      537     0.11 
##   20 >  25    22.5   1070    0.21     1607     0.31 
##   25 >  30    27.5   1409    0.28     3016     0.59 
##   30 >  35    32.5    985    0.19     4001     0.78 
##   35 >  40    37.5    500    0.10     4501     0.88 
##   40 >  45    42.5    253    0.05     4754     0.93 
##   45 >  50    47.5     76    0.01     4830     0.95 
##   50 >  55    52.5     46    0.01     4876     0.95 
##   55 >  60    57.5     20    0.00     4896     0.96 
##   60 >  65    62.5      8    0.00     4904     0.96 
##   65 >  70    67.5      1    0.00     4905     0.96 
##   70 >  75    72.5      1    0.00     4906     0.96 
##   75 >  80    77.5      1    0.00     4907     0.96 
##   80 >  85    82.5      0    0.00     4907     0.96 
##   85 >  90    87.5      0    0.00     4907     0.96 
##   90 >  95    92.5      1    0.00     4908     0.96 
##   95 > 100    97.5      1    0.00     4909     0.96
# Same histogram with blue bars and explicit axis titles.
# BMI is measured in kg/m² (not grams), matching the variable label set earlier.
Histogram(
  bmi,
  fill = "blue",
  xlab = "Body mass index (kg/m²)",
  ylab = "Frequency",
  data = df
)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(bmi, density=TRUE)  # smoothed curve + histogram 
## Plot(bmi)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- bmi --- 
##  
##        n     miss     mean       sd      min      mdn      max 
##      4909      201    28.89     7.85    10.30    28.10    97.60 
## 
##   
## --- Outliers ---     from the box plot: 110 
##  
## Small      Large 
## -----      ----- 
##             97.6 
##             92.0 
##             78.0 
##             71.9 
##             66.8 
##             64.8 
##             64.4 
##             63.3 
##             61.6 
##             61.2 
##             60.9 
##             60.9 
##             60.2 
##             59.7 
##             58.1 
##             57.9 
##             57.7 
##             57.5 
## 
## + 92 more outliers 
## 
## 
## Bin Width: 5 
## Number of Bins: 18 
##  
##        Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## --------------------------------------------------- 
##   10 >  15    12.5     44    0.01       44     0.01 
##   15 >  20    17.5    493    0.10      537     0.11 
##   20 >  25    22.5   1070    0.21     1607     0.31 
##   25 >  30    27.5   1409    0.28     3016     0.59 
##   30 >  35    32.5    985    0.19     4001     0.78 
##   35 >  40    37.5    500    0.10     4501     0.88 
##   40 >  45    42.5    253    0.05     4754     0.93 
##   45 >  50    47.5     76    0.01     4830     0.95 
##   50 >  55    52.5     46    0.01     4876     0.95 
##   55 >  60    57.5     20    0.00     4896     0.96 
##   60 >  65    62.5      8    0.00     4904     0.96 
##   65 >  70    67.5      1    0.00     4905     0.96 
##   70 >  75    72.5      1    0.00     4906     0.96 
##   75 >  80    77.5      1    0.00     4907     0.96 
##   80 >  85    82.5      0    0.00     4907     0.96 
##   85 >  90    87.5      0    0.00     4907     0.96 
##   90 >  95    92.5      1    0.00     4908     0.96 
##   95 > 100    97.5      1    0.00     4909     0.96
# Bar chart of work.type
BarChart(work.type, data = df)

## >>> Suggestions
## BarChart(work.type, horiz=TRUE)  # horizontal bar chart
## BarChart(work.type, fill="reds")  # red bars of varying lightness
## PieChart(work.type)  # doughnut (ring) chart
## Plot(work.type)  # bubble plot
## Plot(work.type, stat="count")  # lollipop plot 
## 
## --- work.type --- 
## 
## Missing Values: 0 
## 
##     work.type  Count   Prop 
## ---------------------------- 
##      children    687   0.134 
##      Govt_job    657   0.129 
##  Never_worked     22   0.004 
##       Private   2925   0.572 
## Self-employed    819   0.160 
## ---------------------------- 
##         Total   5110   1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 4802.415, df = 4, p-value = 0.000
# Scatter plot of bmi against age with a fitted linear ("lm") line
Plot(age, bmi, fit = "lm", data = df)

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(age, bmi, enhance=TRUE)  # many options
## Plot(age, bmi, color="red")  # exterior edge color of points
## Plot(age, bmi, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 4909 
## Sample Correlation of age and bmi: r = 0.333 
##   
## Hypothesis Test of 0 Correlation:  t = 24.772,  df = 4907,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.308 to 0.358 
##   
## 
##  Line: b0 = 23.92   b1 = 0.12    Fit: MSE = 54.841   Rsq = 0.111
## 

Mô hình hồi qui logistic

# Logistic regression of stroke on gender and age; the result is kept in `fit`.
# NOTE(review): in the output below, the 0.5 probability threshold classifies
# no case as "Yes" (sensitivity 0), so the 95% accuracy only reflects the
# share of non-stroke cases. A lower threshold may be more informative;
# check the lessR Logit documentation for the relevant argument.
fit <- Logit(stroke ~ gender + age, data = df)
## 
## >>> Note:  gender is not a numeric variable.
##            Indicator variables are created and analyzed.
## 
## Response Variable:   stroke
## Predictor Variable 1:  genderMale
## Predictor Variable 2:  age
## 
## Number of cases (rows) of data:  5110 
## Number of cases retained for analysis:  5110 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of stroke for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -7.2834     0.3418  -21.308    0.000     -7.9533     -6.6134 
##  genderMale    0.1153     0.1374    0.839    0.401     -0.1539      0.3845 
##         age    0.0748     0.0049   15.167    0.000      0.0651      0.0844 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.0007      0.0004      0.0013 
##  genderMale      1.1222      0.8573      1.4689 
##         age      1.0776      1.0673      1.0881 
## 
## 
## -- Model Fit
## 
##     Null deviance: 1990.373 on 5109 degrees of freedom
## Residual deviance: 1615.595 on 5107 degrees of freedom
## 
## AIC: 1621.595 
## 
## Number of iterations to convergence: 7 
## 
## 
## Collinearity
## 
##            Tolerance       VIF
## genderMale     0.999     1.001
## age            0.999     1.001
## 
##    ANALYSIS OF RESIDUALS AND INFLUENCE 
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
##    [sorted by Cook's Distance]
##    [res_rows = 20 out of 5110 cases (rows) of data]
## --------------------------------------------------------------------
##      genderMale age stroke    P(Y=1) residual rstudent  dffits    cooks
## 4897          0   1    Yes 0.0007396   0.9993    3.812 0.06215 0.037826
## 3451          0  14    Yes 0.0019529   0.9980    3.543 0.07678 0.025275
## 2764          0  32    Yes 0.0074615   0.9925    3.136 0.09328 0.012387
## 2190          0  38    Yes 0.0116372   0.9884    2.989 0.09620 0.009255
## 2205          0  38    Yes 0.0116372   0.9884    2.989 0.09620 0.009255
## 2159          0  39    Yes 0.0125294   0.9875    2.964 0.09651 0.008790
## 2349          0  39    Yes 0.0125294   0.9875    2.964 0.09651 0.008790
## 2507          0  39    Yes 0.0125294   0.9875    2.964 0.09651 0.008790
## 2345          1  42    Yes 0.0175082   0.9825    2.849 0.10540 0.008086
## 2922          1  43    Yes 0.0188421   0.9812    2.822 0.10574 0.007692
## 3818          1  45    Yes 0.0218153   0.9782    2.770 0.10630 0.006953
## 1387          0  45    Yes 0.0194856   0.9805    2.810 0.09724 0.006341
## 2848          0  45    Yes 0.0194856   0.9805    2.810 0.09724 0.006341
## 1051          1  47    Yes 0.0252456   0.9748    2.716 0.10675 0.006278
## 3827          0  46    Yes 0.0209669   0.9790    2.783 0.09719 0.005989
## 1029          1  48    Yes 0.0271526   0.9728    2.689 0.10693 0.005964
## 659           1  49    Yes 0.0291993   0.9708    2.662 0.10710 0.005667
## 4688          0  48    Yes 0.0242671   0.9757    2.730 0.09696 0.005336
## 4636          1  51    Yes 0.0337507   0.9662    2.606 0.10742 0.005119
## 4227          0  49    Yes 0.0261021   0.9739    2.703 0.09679 0.005033
## 
## 
##    PREDICTION 
## 
## Probability threshold for classification Yes: 0.5
## 
##  0: No
##  1: Yes
## 
## Data, Fitted Values, Standard Errors
##    [sorted by fitted value]
##    [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
##      genderMale age stroke label    fitted   std.err
## 136           0   0     No     0 0.0006864 0.0002345
## 556           0   0     No     0 0.0006864 0.0002345
## 1135          0   0     No     0 0.0006864 0.0002345
## 1597          0   0     No     0 0.0006864 0.0002345
## 
## ... for the rows of data where fitted is close to 0.5 ...
## 
##      genderMale age stroke label fitted std.err
## 4624          1  81     No     0 0.2477 0.02353
## 4885          1  81     No     0 0.2477 0.02353
## 337           1  82     No     0 0.2619 0.02497
## 546           1  82     No     0 0.2619 0.02497
## 1367          1  82     No     0 0.2619 0.02497
## 
## ... for the last 4 rows of sorted data ...
## 
##      genderMale age stroke label fitted std.err
## 4533          1  82    Yes     0 0.2619 0.02497
## 4799          1  82     No     0 0.2619 0.02497
## 4921          1  82     No     0 0.2619 0.02497
## 4925          1  82     No     0 0.2619 0.02497
## --------------------------------------------------------------------
## 
## 
## ----------------------------
## Specified confusion matrices
## ----------------------------
## 
## Probability threshold for predicting Yes: 0.5
## 
##                Baseline         Predicted 
## ---------------------------------------------------
##               Total  %Tot        0      1  %Correct 
## ---------------------------------------------------
##          1      249   4.9      249      0     0.0 
## stroke   0     4861  95.1     4861      0     100.0 
## ---------------------------------------------------
##        Total   5110                           95.1 
## 
## Accuracy: 95.13 
## Sensitivity: 0.00 
## Precision: NaN