# Read the stroke CSV into `df`, then report its dimensions (rows, columns).
df <- read.csv(file = "C:\\Data-training\\Stroke Data.csv")
dim(df)
## [1] 5110 12
# Preview the first 15 rows of the data frame.
head(df, n = 15)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## 7 53882 Male 74 1 1 Yes Private
## 8 10434 Female 69 0 0 No Private
## 9 27419 Female 59 0 0 Yes Private
## 10 60491 Female 78 0 0 Yes Private
## 11 12109 Female 81 1 0 Yes Private
## 12 12095 Female 61 0 1 Yes Govt_job
## 13 12175 Female 54 0 0 Yes Private
## 14 8213 Male 78 0 1 Yes Private
## 15 5317 Female 79 0 1 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 2 Rural 202.21 NA never smoked 1
## 3 Rural 105.92 32.5 never smoked 1
## 4 Urban 171.23 34.4 smokes 1
## 5 Rural 174.12 24.0 never smoked 1
## 6 Urban 186.21 29.0 formerly smoked 1
## 7 Rural 70.09 27.4 never smoked 1
## 8 Urban 94.39 22.8 never smoked 1
## 9 Rural 76.15 NA Unknown 1
## 10 Urban 58.57 24.2 Unknown 1
## 11 Rural 80.43 29.7 never smoked 1
## 12 Rural 120.46 36.8 smokes 1
## 13 Urban 104.51 27.3 smokes 1
## 14 Urban 219.84 NA Unknown 1
## 15 Urban 214.09 28.2 never smoked 1
# Preview the last 10 rows of the data frame.
tail(df, n = 10)
## id gender age hypertension heart_disease ever_married work_type
## 5101 68398 Male 82 1 0 Yes Self-employed
## 5102 36901 Female 45 0 0 Yes Private
## 5103 45010 Female 57 0 0 Yes Private
## 5104 22127 Female 18 0 0 No Private
## 5105 14180 Female 13 0 0 No children
## 5106 18234 Female 80 1 0 Yes Private
## 5107 44873 Female 81 0 0 Yes Self-employed
## 5108 19723 Female 35 0 0 Yes Self-employed
## 5109 37544 Male 51 0 0 Yes Private
## 5110 44679 Female 44 0 0 Yes Govt_job
## Residence_type avg_glucose_level bmi smoking_status stroke
## 5101 Rural 71.97 28.3 never smoked 0
## 5102 Urban 97.95 24.5 Unknown 0
## 5103 Rural 77.93 21.7 never smoked 0
## 5104 Urban 82.85 46.9 Unknown 0
## 5105 Rural 103.08 18.6 Unknown 0
## 5106 Urban 83.75 NA never smoked 0
## 5107 Urban 125.20 40.0 never smoked 0
## 5108 Rural 82.99 30.6 never smoked 0
## 5109 Rural 166.29 25.6 formerly smoked 0
## 5110 Urban 85.28 26.2 Unknown 0
# Per-column summary of the data as loaded (before any derived columns).
summary(object = df)
## id gender age hypertension
## Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000
## 1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000
## Median :36932 Mode :character Median :45.00 Median :0.00000
## Mean :36518 Mean :43.23 Mean :0.09746
## 3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000
## Max. :72940 Max. :82.00 Max. :1.00000
##
## heart_disease ever_married work_type Residence_type
## Min. :0.00000 Length:5110 Length:5110 Length:5110
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.05401
## 3rd Qu.:0.00000
## Max. :1.00000
##
## avg_glucose_level bmi smoking_status stroke
## Min. : 55.12 Min. :10.30 Length:5110 Min. :0.00000
## 1st Qu.: 77.25 1st Qu.:23.50 Class :character 1st Qu.:0.00000
## Median : 91.89 Median :28.10 Mode :character Median :0.00000
## Mean :106.15 Mean :28.89 Mean :0.04873
## 3rd Qu.:114.09 3rd Qu.:33.10 3rd Qu.:0.00000
## Max. :271.74 Max. :97.60 Max. :1.00000
## NA's :201
# Numeric code for gender: Female = 0, Male = 1, Other = 2.
# Looking the character `gender` values up in a named vector leaves any
# other value (including NA) as NA; unname() drops the lookup names so the
# new column is a plain numeric vector.
df$sex <- unname(c(Female = 0, Male = 1, Other = 2)[df$gender])
head(df, n = 6L)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke sex
## 1 Urban 228.69 36.6 formerly smoked 1 1
## 2 Rural 202.21 NA never smoked 1 0
## 3 Rural 105.92 32.5 never smoked 1 1
## 4 Urban 171.23 34.4 smokes 1 0
## 5 Rural 174.12 24.0 never smoked 1 0
## 6 Urban 186.21 29.0 formerly smoked 1 1
# Cross-tabulate the original labels against the numeric codes to confirm
# that each gender maps to exactly one code.
table(df[["gender"]], df[["sex"]])
##
## 0 1 2
## Female 2994 0 0
## Male 0 2115 0
## Other 0 0 1
# Band BMI into four categories, lower bound inclusive:
#   < 18.5 Underweight, [18.5, 25) Normal, [25, 30) Overweight, >= 30 Obese.
# Rows with a missing bmi stay NA. The result is kept as a character column
# (not a factor), so table() lists the categories alphabetically.
df$bmi_cat <- as.character(cut(
  df$bmi,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("Underweight", "Normal", "Overweight", "Obese"),
  right = FALSE,
  include.lowest = TRUE
))
table(df[["bmi_cat"]])
##
## Normal Obese Overweight Underweight
## 1243 1920 1409 337
# Factor copy of the 0/1 stroke indicator, used below as the grouping
# variable; the numeric `stroke` column is left in place.
df[["stroke1"]] <- as.factor(df[["stroke"]])
table(df[["stroke1"]])
##
## 0 1
## 4861 249
# Preview the first rows again, now including the derived columns.
head(df, n = 6L)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke sex bmi_cat
## 1 Urban 228.69 36.6 formerly smoked 1 1 Obese
## 2 Rural 202.21 NA never smoked 1 0 <NA>
## 3 Rural 105.92 32.5 never smoked 1 1 Obese
## 4 Urban 171.23 34.4 smokes 1 0 Obese
## 5 Rural 174.12 24.0 never smoked 1 0 Normal
## 6 Urban 186.21 29.0 formerly smoked 1 1 Overweight
## stroke1
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Per-column summary again, now covering sex, bmi_cat and stroke1 as well.
summary(object = df)
## id gender age hypertension
## Min. : 67 Length:5110 Min. : 0.08 Min. :0.00000
## 1st Qu.:17741 Class :character 1st Qu.:25.00 1st Qu.:0.00000
## Median :36932 Mode :character Median :45.00 Median :0.00000
## Mean :36518 Mean :43.23 Mean :0.09746
## 3rd Qu.:54682 3rd Qu.:61.00 3rd Qu.:0.00000
## Max. :72940 Max. :82.00 Max. :1.00000
##
## heart_disease ever_married work_type Residence_type
## Min. :0.00000 Length:5110 Length:5110 Length:5110
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.05401
## 3rd Qu.:0.00000
## Max. :1.00000
##
## avg_glucose_level bmi smoking_status stroke
## Min. : 55.12 Min. :10.30 Length:5110 Min. :0.00000
## 1st Qu.: 77.25 1st Qu.:23.50 Class :character 1st Qu.:0.00000
## Median : 91.89 Median :28.10 Mode :character Median :0.00000
## Mean :106.15 Mean :28.89 Mean :0.04873
## 3rd Qu.:114.09 3rd Qu.:33.10 3rd Qu.:0.00000
## Max. :271.74 Max. :97.60 Max. :1.00000
## NA's :201
## sex bmi_cat stroke1
## Min. :0.0000 Length:5110 0:4861
## 1st Qu.:0.0000 Class :character 1: 249
## Median :0.0000 Mode :character
## Mean :0.4143
## 3rd Qu.:1.0000
## Max. :2.0000
##
6.1 Mô tả đặc điểm tuổi (age), giới tính (gender), bệnh cao huyết áp (hypertension), bệnh tim (heart_disease), tình trạng gia đình (ever_married), việc làm (work_type), nơi ở (Residence_type), nồng độ đường huyết (avg_glucose_level), chỉ số khối cơ thể (bmi), và tình trạng hút thuốc (smoking_status) theo tình trạng đột quỵ (stroke)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive table of every covariate, stratified by stroke status
# (stroke1) with an overall column.
# hypertension and heart_disease are numeric 0/1 columns here, so they are
# summarised as Mean (SD) / Median rather than as counts.
table1(
  ~ age + gender + hypertension + heart_disease + ever_married +
    work_type + Residence_type + avg_glucose_level + bmi +
    smoking_status | stroke1,
  data = df
)
| 0 (N=4861) |
1 (N=249) |
Overall (N=5110) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 42.0 (22.3) | 67.7 (12.7) | 43.2 (22.6) |
| Median [Min, Max] | 43.0 [0.0800, 82.0] | 71.0 [1.32, 82.0] | 45.0 [0.0800, 82.0] |
| gender | |||
| Female | 2853 (58.7%) | 141 (56.6%) | 2994 (58.6%) |
| Male | 2007 (41.3%) | 108 (43.4%) | 2115 (41.4%) |
| Other | 1 (0.0%) | 0 (0%) | 1 (0.0%) |
| hypertension | |||
| Mean (SD) | 0.0889 (0.285) | 0.265 (0.442) | 0.0975 (0.297) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| heart_disease | |||
| Mean (SD) | 0.0471 (0.212) | 0.189 (0.392) | 0.0540 (0.226) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| ever_married | |||
| No | 1728 (35.5%) | 29 (11.6%) | 1757 (34.4%) |
| Yes | 3133 (64.5%) | 220 (88.4%) | 3353 (65.6%) |
| work_type | |||
| children | 685 (14.1%) | 2 (0.8%) | 687 (13.4%) |
| Govt_job | 624 (12.8%) | 33 (13.3%) | 657 (12.9%) |
| Never_worked | 22 (0.5%) | 0 (0%) | 22 (0.4%) |
| Private | 2776 (57.1%) | 149 (59.8%) | 2925 (57.2%) |
| Self-employed | 754 (15.5%) | 65 (26.1%) | 819 (16.0%) |
| Residence_type | |||
| Rural | 2400 (49.4%) | 114 (45.8%) | 2514 (49.2%) |
| Urban | 2461 (50.6%) | 135 (54.2%) | 2596 (50.8%) |
| avg_glucose_level | |||
| Mean (SD) | 105 (43.8) | 133 (61.9) | 106 (45.3) |
| Median [Min, Max] | 91.5 [55.1, 268] | 105 [56.1, 272] | 91.9 [55.1, 272] |
| bmi | |||
| Mean (SD) | 28.8 (7.91) | 30.5 (6.33) | 28.9 (7.85) |
| Median [Min, Max] | 28.0 [10.3, 97.6] | 29.7 [16.9, 56.6] | 28.1 [10.3, 97.6] |
| Missing | 161 (3.3%) | 40 (16.1%) | 201 (3.9%) |
| smoking_status | |||
| formerly smoked | 815 (16.8%) | 70 (28.1%) | 885 (17.3%) |
| never smoked | 1802 (37.1%) | 90 (36.1%) | 1892 (37.0%) |
| smokes | 747 (15.4%) | 42 (16.9%) | 789 (15.4%) |
| Unknown | 1497 (30.8%) | 47 (18.9%) | 1544 (30.2%) |
# Descriptive table of every covariate, stratified by stroke status
# (stroke1). This repeats the preceding table1() call; only the whitespace
# inside the formula differs, so the resulting table is the same.
table1(
  ~ age + gender + hypertension + heart_disease + ever_married +
    work_type + Residence_type + avg_glucose_level + bmi +
    smoking_status | stroke1,
  data = df
)
| 0 (N=4861) |
1 (N=249) |
Overall (N=5110) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 42.0 (22.3) | 67.7 (12.7) | 43.2 (22.6) |
| Median [Min, Max] | 43.0 [0.0800, 82.0] | 71.0 [1.32, 82.0] | 45.0 [0.0800, 82.0] |
| gender | |||
| Female | 2853 (58.7%) | 141 (56.6%) | 2994 (58.6%) |
| Male | 2007 (41.3%) | 108 (43.4%) | 2115 (41.4%) |
| Other | 1 (0.0%) | 0 (0%) | 1 (0.0%) |
| hypertension | |||
| Mean (SD) | 0.0889 (0.285) | 0.265 (0.442) | 0.0975 (0.297) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| heart_disease | |||
| Mean (SD) | 0.0471 (0.212) | 0.189 (0.392) | 0.0540 (0.226) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| ever_married | |||
| No | 1728 (35.5%) | 29 (11.6%) | 1757 (34.4%) |
| Yes | 3133 (64.5%) | 220 (88.4%) | 3353 (65.6%) |
| work_type | |||
| children | 685 (14.1%) | 2 (0.8%) | 687 (13.4%) |
| Govt_job | 624 (12.8%) | 33 (13.3%) | 657 (12.9%) |
| Never_worked | 22 (0.5%) | 0 (0%) | 22 (0.4%) |
| Private | 2776 (57.1%) | 149 (59.8%) | 2925 (57.2%) |
| Self-employed | 754 (15.5%) | 65 (26.1%) | 819 (16.0%) |
| Residence_type | |||
| Rural | 2400 (49.4%) | 114 (45.8%) | 2514 (49.2%) |
| Urban | 2461 (50.6%) | 135 (54.2%) | 2596 (50.8%) |
| avg_glucose_level | |||
| Mean (SD) | 105 (43.8) | 133 (61.9) | 106 (45.3) |
| Median [Min, Max] | 91.5 [55.1, 268] | 105 [56.1, 272] | 91.9 [55.1, 272] |
| bmi | |||
| Mean (SD) | 28.8 (7.91) | 30.5 (6.33) | 28.9 (7.85) |
| Median [Min, Max] | 28.0 [10.3, 97.6] | 29.7 [16.9, 56.6] | 28.1 [10.3, 97.6] |
| Missing | 161 (3.3%) | 40 (16.1%) | 201 (3.9%) |
| smoking_status | |||
| formerly smoked | 815 (16.8%) | 70 (28.1%) | 885 (17.3%) |
| never smoked | 1802 (37.1%) | 90 (36.1%) | 1892 (37.0%) |
| smokes | 747 (15.4%) | 42 (16.9%) | 789 (15.4%) |
| Unknown | 1497 (30.8%) | 47 (18.9%) | 1544 (30.2%) |
# Descriptive table of every covariate, stratified by stroke status
# (stroke1). This repeats the preceding table1() call; only the whitespace
# inside the formula differs, so the resulting table is the same.
table1(
  ~ age + gender + hypertension + heart_disease + ever_married +
    work_type + Residence_type + avg_glucose_level + bmi +
    smoking_status | stroke1,
  data = df
)
| 0 (N=4861) |
1 (N=249) |
Overall (N=5110) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 42.0 (22.3) | 67.7 (12.7) | 43.2 (22.6) |
| Median [Min, Max] | 43.0 [0.0800, 82.0] | 71.0 [1.32, 82.0] | 45.0 [0.0800, 82.0] |
| gender | |||
| Female | 2853 (58.7%) | 141 (56.6%) | 2994 (58.6%) |
| Male | 2007 (41.3%) | 108 (43.4%) | 2115 (41.4%) |
| Other | 1 (0.0%) | 0 (0%) | 1 (0.0%) |
| hypertension | |||
| Mean (SD) | 0.0889 (0.285) | 0.265 (0.442) | 0.0975 (0.297) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| heart_disease | |||
| Mean (SD) | 0.0471 (0.212) | 0.189 (0.392) | 0.0540 (0.226) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| ever_married | |||
| No | 1728 (35.5%) | 29 (11.6%) | 1757 (34.4%) |
| Yes | 3133 (64.5%) | 220 (88.4%) | 3353 (65.6%) |
| work_type | |||
| children | 685 (14.1%) | 2 (0.8%) | 687 (13.4%) |
| Govt_job | 624 (12.8%) | 33 (13.3%) | 657 (12.9%) |
| Never_worked | 22 (0.5%) | 0 (0%) | 22 (0.4%) |
| Private | 2776 (57.1%) | 149 (59.8%) | 2925 (57.2%) |
| Self-employed | 754 (15.5%) | 65 (26.1%) | 819 (16.0%) |
| Residence_type | |||
| Rural | 2400 (49.4%) | 114 (45.8%) | 2514 (49.2%) |
| Urban | 2461 (50.6%) | 135 (54.2%) | 2596 (50.8%) |
| avg_glucose_level | |||
| Mean (SD) | 105 (43.8) | 133 (61.9) | 106 (45.3) |
| Median [Min, Max] | 91.5 [55.1, 268] | 105 [56.1, 272] | 91.9 [55.1, 272] |
| bmi | |||
| Mean (SD) | 28.8 (7.91) | 30.5 (6.33) | 28.9 (7.85) |
| Median [Min, Max] | 28.0 [10.3, 97.6] | 29.7 [16.9, 56.6] | 28.1 [10.3, 97.6] |
| Missing | 161 (3.3%) | 40 (16.1%) | 201 (3.9%) |
| smoking_status | |||
| formerly smoked | 815 (16.8%) | 70 (28.1%) | 885 (17.3%) |
| never smoked | 1802 (37.1%) | 90 (36.1%) | 1892 (37.0%) |
| smokes | 747 (15.4%) | 42 (16.9%) | 789 (15.4%) |
| Unknown | 1497 (30.8%) | 47 (18.9%) | 1544 (30.2%) |
# Show each 0/1 indicator twice, as the raw numeric column and wrapped in
# as.factor(), to contrast the Mean (SD) summary with the count (%) summary.
table1(
  ~ hypertension + as.factor(hypertension) +
    heart_disease + as.factor(heart_disease) | stroke1,
  data = df
)
| 0 (N=4861) |
1 (N=249) |
Overall (N=5110) |
|
|---|---|---|---|
| hypertension | |||
| Mean (SD) | 0.0889 (0.285) | 0.265 (0.442) | 0.0975 (0.297) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| as.factor(hypertension) | |||
| 0 | 4429 (91.1%) | 183 (73.5%) | 4612 (90.3%) |
| 1 | 432 (8.9%) | 66 (26.5%) | 498 (9.7%) |
| heart_disease | |||
| Mean (SD) | 0.0471 (0.212) | 0.189 (0.392) | 0.0540 (0.226) |
| Median [Min, Max] | 0 [0, 1.00] | 0 [0, 1.00] | 0 [0, 1.00] |
| as.factor(heart_disease) | |||
| 0 | 4632 (95.3%) | 202 (81.1%) | 4834 (94.6%) |
| 1 | 229 (4.7%) | 47 (18.9%) | 276 (5.4%) |