Đọc dữ liệu vào R

# Load the stroke dataset from its CSV file into the data frame `df`.
df <- read.csv(file = "C:\\Data-training\\Stroke Data.csv")

4. Thông tin về dữ liệu

4.1 Có bao nhiêu biến số và quan sát

# Report the size of df as (number of rows, number of columns).
c(nrow(df), ncol(df))
## [1] 5110   12

4.2 Đọc 15 dòng đầu

# Show the first 15 rows of df.
head(x = df, n = 15L)
##       id gender age hypertension heart_disease ever_married     work_type
## 1   9046   Male  67            0             1          Yes       Private
## 2  51676 Female  61            0             0          Yes Self-employed
## 3  31112   Male  80            0             1          Yes       Private
## 4  60182 Female  49            0             0          Yes       Private
## 5   1665 Female  79            1             0          Yes Self-employed
## 6  56669   Male  81            0             0          Yes       Private
## 7  53882   Male  74            1             1          Yes       Private
## 8  10434 Female  69            0             0           No       Private
## 9  27419 Female  59            0             0          Yes       Private
## 10 60491 Female  78            0             0          Yes       Private
## 11 12109 Female  81            1             0          Yes       Private
## 12 12095 Female  61            0             1          Yes      Govt_job
## 13 12175 Female  54            0             0          Yes       Private
## 14  8213   Male  78            0             1          Yes       Private
## 15  5317 Female  79            0             1          Yes       Private
##    Residence_type avg_glucose_level  bmi  smoking_status stroke
## 1           Urban            228.69 36.6 formerly smoked      1
## 2           Rural            202.21   NA    never smoked      1
## 3           Rural            105.92 32.5    never smoked      1
## 4           Urban            171.23 34.4          smokes      1
## 5           Rural            174.12 24.0    never smoked      1
## 6           Urban            186.21 29.0 formerly smoked      1
## 7           Rural             70.09 27.4    never smoked      1
## 8           Urban             94.39 22.8    never smoked      1
## 9           Rural             76.15   NA         Unknown      1
## 10          Urban             58.57 24.2         Unknown      1
## 11          Rural             80.43 29.7    never smoked      1
## 12          Rural            120.46 36.8          smokes      1
## 13          Urban            104.51 27.3          smokes      1
## 14          Urban            219.84   NA         Unknown      1
## 15          Urban            214.09 28.2    never smoked      1

4.3 Đọc 10 dòng cuối

# Show the last 10 rows of df.
tail(x = df, n = 10L)
##         id gender age hypertension heart_disease ever_married     work_type
## 5101 68398   Male  82            1             0          Yes Self-employed
## 5102 36901 Female  45            0             0          Yes       Private
## 5103 45010 Female  57            0             0          Yes       Private
## 5104 22127 Female  18            0             0           No       Private
## 5105 14180 Female  13            0             0           No      children
## 5106 18234 Female  80            1             0          Yes       Private
## 5107 44873 Female  81            0             0          Yes Self-employed
## 5108 19723 Female  35            0             0          Yes Self-employed
## 5109 37544   Male  51            0             0          Yes       Private
## 5110 44679 Female  44            0             0          Yes      Govt_job
##      Residence_type avg_glucose_level  bmi  smoking_status stroke
## 5101          Rural             71.97 28.3    never smoked      0
## 5102          Urban             97.95 24.5         Unknown      0
## 5103          Rural             77.93 21.7    never smoked      0
## 5104          Urban             82.85 46.9         Unknown      0
## 5105          Rural            103.08 18.6         Unknown      0
## 5106          Urban             83.75   NA    never smoked      0
## 5107          Urban            125.20 40.0    never smoked      0
## 5108          Rural             82.99 30.6    never smoked      0
## 5109          Rural            166.29 25.6 formerly smoked      0
## 5110          Urban             85.28 26.2         Unknown      0

4.4 Mô tả dữ liệu

# Per-column summary of df: quantiles/mean for numeric columns,
# length/class/mode for character columns, plus NA counts.
summary(object = df)
##        id           gender               age         hypertension    
##  Min.   :   67   Length:5110        Min.   : 0.08   Min.   :0.00000  
##  1st Qu.:17741   Class :character   1st Qu.:25.00   1st Qu.:0.00000  
##  Median :36932   Mode  :character   Median :45.00   Median :0.00000  
##  Mean   :36518                      Mean   :43.23   Mean   :0.09746  
##  3rd Qu.:54682                      3rd Qu.:61.00   3rd Qu.:0.00000  
##  Max.   :72940                      Max.   :82.00   Max.   :1.00000  
##                                                                      
##  heart_disease     ever_married        work_type         Residence_type    
##  Min.   :0.00000   Length:5110        Length:5110        Length:5110       
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05401                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##                                                                            
##  avg_glucose_level      bmi        smoking_status         stroke       
##  Min.   : 55.12    Min.   :10.30   Length:5110        Min.   :0.00000  
##  1st Qu.: 77.25    1st Qu.:23.50   Class :character   1st Qu.:0.00000  
##  Median : 91.89    Median :28.10   Mode  :character   Median :0.00000  
##  Mean   :106.15    Mean   :28.89                      Mean   :0.04873  
##  3rd Qu.:114.09    3rd Qu.:33.10                      3rd Qu.:0.00000  
##  Max.   :271.74    Max.   :97.60                      Max.   :1.00000  
##                    NA's   :201

5. Biên tập dữ liệu

5.1 Mã hoá biến gender thành sex

# Recode gender into a numeric column `sex`:
#   "Female" -> 0, "Male" -> 1, "Other" -> 2.
# match() returns the 1-based position of each gender in the lookup vector,
# so subtracting 1 yields the codes; values not in the lookup (or NA) stay NA.
df[["sex"]] <- match(df[["gender"]], c("Female", "Male", "Other")) - 1

# Show the first 6 rows to check the newly added `sex` column.
head(x = df, n = 6L)
##      id gender age hypertension heart_disease ever_married     work_type
## 1  9046   Male  67            0             1          Yes       Private
## 2 51676 Female  61            0             0          Yes Self-employed
## 3 31112   Male  80            0             1          Yes       Private
## 4 60182 Female  49            0             0          Yes       Private
## 5  1665 Female  79            1             0          Yes Self-employed
## 6 56669   Male  81            0             0          Yes       Private
##   Residence_type avg_glucose_level  bmi  smoking_status stroke sex
## 1          Urban            228.69 36.6 formerly smoked      1   1
## 2          Rural            202.21   NA    never smoked      1   0
## 3          Rural            105.92 32.5    never smoked      1   1
## 4          Urban            171.23 34.4          smokes      1   0
## 5          Rural            174.12 24.0    never smoked      1   0
## 6          Urban            186.21 29.0 formerly smoked      1   1
# Cross-tabulate the original gender labels against the numeric sex codes
# to verify the recoding (each label should map to exactly one code).
table(df[["gender"]], df[["sex"]])
##         
##             0    1    2
##   Female 2994    0    0
##   Male      0 2115    0
##   Other     0    0    1

5.2 Mã hoá biến bmi

# Bin bmi into a character column `bmi_cat` using left-closed intervals:
#   bmi < 18.5        -> "Underweight"
#   18.5 <= bmi < 25  -> "Normal"
#   25 <= bmi < 30    -> "Overweight"
#   bmi >= 30         -> "Obese"
# findInterval() returns 0..3 for these bins (NA for a missing bmi), so adding
# 1 turns it into an index into the label vector; a missing bmi stays NA.
df[["bmi_cat"]] <- c("Underweight", "Normal", "Overweight", "Obese")[
  findInterval(df[["bmi"]], c(18.5, 25, 30)) + 1L
]

# Frequency of each BMI category (rows with NA are not counted).
table(df[["bmi_cat"]])
## 
##      Normal       Obese  Overweight Underweight 
##        1243        1920        1409         337

5.3 Biến stroke

# Keep the numeric `stroke` column and add a factor copy `stroke1`,
# which is used later as the stratification variable in table1().
df[["stroke1"]] <- factor(df[["stroke"]])

# Count of observations in each stroke level.
table(df[["stroke1"]])
## 
##    0    1 
## 4861  249
# Show the first 6 rows, now including sex, bmi_cat and stroke1.
head(x = df, n = 6L)
##      id gender age hypertension heart_disease ever_married     work_type
## 1  9046   Male  67            0             1          Yes       Private
## 2 51676 Female  61            0             0          Yes Self-employed
## 3 31112   Male  80            0             1          Yes       Private
## 4 60182 Female  49            0             0          Yes       Private
## 5  1665 Female  79            1             0          Yes Self-employed
## 6 56669   Male  81            0             0          Yes       Private
##   Residence_type avg_glucose_level  bmi  smoking_status stroke sex    bmi_cat
## 1          Urban            228.69 36.6 formerly smoked      1   1      Obese
## 2          Rural            202.21   NA    never smoked      1   0       <NA>
## 3          Rural            105.92 32.5    never smoked      1   1      Obese
## 4          Urban            171.23 34.4          smokes      1   0      Obese
## 5          Rural            174.12 24.0    never smoked      1   0     Normal
## 6          Urban            186.21 29.0 formerly smoked      1   1 Overweight
##   stroke1
## 1       1
## 2       1
## 3       1
## 4       1
## 5       1
## 6       1
# Per-column summary of df after the derived columns have been added.
summary(object = df)
##        id           gender               age         hypertension    
##  Min.   :   67   Length:5110        Min.   : 0.08   Min.   :0.00000  
##  1st Qu.:17741   Class :character   1st Qu.:25.00   1st Qu.:0.00000  
##  Median :36932   Mode  :character   Median :45.00   Median :0.00000  
##  Mean   :36518                      Mean   :43.23   Mean   :0.09746  
##  3rd Qu.:54682                      3rd Qu.:61.00   3rd Qu.:0.00000  
##  Max.   :72940                      Max.   :82.00   Max.   :1.00000  
##                                                                      
##  heart_disease     ever_married        work_type         Residence_type    
##  Min.   :0.00000   Length:5110        Length:5110        Length:5110       
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05401                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##                                                                            
##  avg_glucose_level      bmi        smoking_status         stroke       
##  Min.   : 55.12    Min.   :10.30   Length:5110        Min.   :0.00000  
##  1st Qu.: 77.25    1st Qu.:23.50   Class :character   1st Qu.:0.00000  
##  Median : 91.89    Median :28.10   Mode  :character   Median :0.00000  
##  Mean   :106.15    Mean   :28.89                      Mean   :0.04873  
##  3rd Qu.:114.09    3rd Qu.:33.10                      3rd Qu.:0.00000  
##  Max.   :271.74    Max.   :97.60                      Max.   :1.00000  
##                    NA's   :201                                         
##       sex           bmi_cat          stroke1 
##  Min.   :0.0000   Length:5110        0:4861  
##  1st Qu.:0.0000   Class :character   1: 249  
##  Median :0.0000   Mode  :character           
##  Mean   :0.4143                              
##  3rd Qu.:1.0000                              
##  Max.   :2.0000                              
## 

Việc 6. Phân tích mô tả

6.1. Mô tả đặc điểm

6.1 Mô tả đặc điểm tuổi (age), giới tính (gender), bệnh cao huyết áp (hypertension), bệnh tim (heart_disease), tình trạng gia đình (ever_married), việc làm (work_type), nơi ở (Residence_type), nồng độ đường huyết (avg_glucose_level), chỉ số khối cơ thể (bmi), và tình trạng hút thuốc (smoking_status) theo tình trạng đột quị (stroke)

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table of all covariates, stratified by stroke status (stroke1).
# hypertension and heart_disease are still numeric 0/1 columns here, so
# table1 summarises them with mean/SD rather than counts.
table1(
  ~ age + gender + hypertension + heart_disease + ever_married +
    work_type + Residence_type + avg_glucose_level + bmi +
    smoking_status | stroke1,
  data = df
)
0
(N=4861)
1
(N=249)
Overall
(N=5110)
age
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
gender
Female 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
Male 2007 (41.3%) 108 (43.4%) 2115 (41.4%)
Other 1 (0.0%) 0 (0%) 1 (0.0%)
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
heart_disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
ever_married
No 1728 (35.5%) 29 (11.6%) 1757 (34.4%)
Yes 3133 (64.5%) 220 (88.4%) 3353 (65.6%)
work_type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
Residence_type
Rural 2400 (49.4%) 114 (45.8%) 2514 (49.2%)
Urban 2461 (50.6%) 135 (54.2%) 2596 (50.8%)
avg_glucose_level
Mean (SD) 105 (43.8) 133 (61.9) 106 (45.3)
Median [Min, Max] 91.5 [55.1, 268] 105 [56.1, 272] 91.9 [55.1, 272]
bmi
Mean (SD) 28.8 (7.91) 30.5 (6.33) 28.9 (7.85)
Median [Min, Max] 28.0 [10.3, 97.6] 29.7 [16.9, 56.6] 28.1 [10.3, 97.6]
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)
smoking_status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)
# Descriptive table of all covariates, stratified by stroke status (stroke1).
# hypertension and heart_disease are still numeric 0/1 columns here, so
# table1 summarises them with mean/SD rather than counts.
table1(
  ~ age + gender + hypertension + heart_disease + ever_married +
    work_type + Residence_type + avg_glucose_level + bmi +
    smoking_status | stroke1,
  data = df
)
0
(N=4861)
1
(N=249)
Overall
(N=5110)
age
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
gender
Female 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
Male 2007 (41.3%) 108 (43.4%) 2115 (41.4%)
Other 1 (0.0%) 0 (0%) 1 (0.0%)
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
heart_disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
ever_married
No 1728 (35.5%) 29 (11.6%) 1757 (34.4%)
Yes 3133 (64.5%) 220 (88.4%) 3353 (65.6%)
work_type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
Residence_type
Rural 2400 (49.4%) 114 (45.8%) 2514 (49.2%)
Urban 2461 (50.6%) 135 (54.2%) 2596 (50.8%)
avg_glucose_level
Mean (SD) 105 (43.8) 133 (61.9) 106 (45.3)
Median [Min, Max] 91.5 [55.1, 268] 105 [56.1, 272] 91.9 [55.1, 272]
bmi
Mean (SD) 28.8 (7.91) 30.5 (6.33) 28.9 (7.85)
Median [Min, Max] 28.0 [10.3, 97.6] 29.7 [16.9, 56.6] 28.1 [10.3, 97.6]
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)
smoking_status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)
# Descriptive table of all covariates, stratified by stroke status (stroke1).
# hypertension and heart_disease are still numeric 0/1 columns here, so
# table1 summarises them with mean/SD rather than counts.
table1(
  ~ age + gender + hypertension + heart_disease + ever_married +
    work_type + Residence_type + avg_glucose_level + bmi +
    smoking_status | stroke1,
  data = df
)
0
(N=4861)
1
(N=249)
Overall
(N=5110)
age
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
gender
Female 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
Male 2007 (41.3%) 108 (43.4%) 2115 (41.4%)
Other 1 (0.0%) 0 (0%) 1 (0.0%)
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
heart_disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
ever_married
No 1728 (35.5%) 29 (11.6%) 1757 (34.4%)
Yes 3133 (64.5%) 220 (88.4%) 3353 (65.6%)
work_type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
Residence_type
Rural 2400 (49.4%) 114 (45.8%) 2514 (49.2%)
Urban 2461 (50.6%) 135 (54.2%) 2596 (50.8%)
avg_glucose_level
Mean (SD) 105 (43.8) 133 (61.9) 106 (45.3)
Median [Min, Max] 91.5 [55.1, 268] 105 [56.1, 272] 91.9 [55.1, 272]
bmi
Mean (SD) 28.8 (7.91) 30.5 (6.33) 28.9 (7.85)
Median [Min, Max] 28.0 [10.3, 97.6] 29.7 [16.9, 56.6] 28.1 [10.3, 97.6]
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)
smoking_status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)
# Side-by-side comparison, stratified by stroke1: each 0/1 indicator is shown
# once as a numeric variable (mean/SD) and once as a factor (counts and
# percentages), to illustrate how the variable type changes the summary.
table1(
  ~ hypertension + as.factor(hypertension) +
    heart_disease + as.factor(heart_disease) | stroke1,
  data = df
)
0
(N=4861)
1
(N=249)
Overall
(N=5110)
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
as.factor(hypertension)
0 4429 (91.1%) 183 (73.5%) 4612 (90.3%)
1 432 (8.9%) 66 (26.5%) 498 (9.7%)
heart_disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
as.factor(heart_disease)
0 4632 (95.3%) 202 (81.1%) 4834 (94.6%)
1 229 (4.7%) 47 (18.9%) 276 (5.4%)