Ngày 1: Giới thiệu R

Việc 1: Tải R

Việc 2: Cài đặt Packages

library(lessR)
## 
## lessR 4.5                            feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is the default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including modern time series forecasting
##   and many, new Plotly interactive visualizations output. Most
##   visualization functions are now reorganized to three functions:
##      Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
##      X(): type="histogram", "density", "vbs" and more
##      XY(): type="scatter" for a scatterplot, or "contour", "smooth"
##    Most previous function calls still work, such as:
##      BarChart(), Histogram, and Plot().
##   Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
## 
## Interactive data analysis for constructing visualizations.
##   Enter: interact()
library(table1)
## 
## Attaching package: 'table1'
## The following object is masked from 'package:lessR':
## 
##     label
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(ggplot2)

Việc 3: Đọc dữ liệu “Stroke Data.csv” vào R và gọi dữ liệu là “df”

# Load the stroke data set (CSV in the working directory) into data frame df.
df <- read.csv(file = "Stroke Data.csv")

Việc 4: Thông tin về dữ liệu df này

4.1 Có bao nhiêu biến số (variable) và quan sát (observation)

# Size of the data: number of rows (observations) first, then number of
# columns (variables).
c(nrow(df), ncol(df))
## [1] 5110   12

4.2 Liệt kê 10 quan sát đầu tiên của dữ liệu.

# Show the first 10 observations.
head(x = df, n = 10L)
##       id gender age hypertension heart_disease ever_married     work_type
## 1   9046   Male  67            0             1          Yes       Private
## 2  51676 Female  61            0             0          Yes Self-employed
## 3  31112   Male  80            0             1          Yes       Private
## 4  60182 Female  49            0             0          Yes       Private
## 5   1665 Female  79            1             0          Yes Self-employed
## 6  56669   Male  81            0             0          Yes       Private
## 7  53882   Male  74            1             1          Yes       Private
## 8  10434 Female  69            0             0           No       Private
## 9  27419 Female  59            0             0          Yes       Private
## 10 60491 Female  78            0             0          Yes       Private
##    Residence_type avg_glucose_level  bmi  smoking_status stroke
## 1           Urban            228.69 36.6 formerly smoked      1
## 2           Rural            202.21   NA    never smoked      1
## 3           Rural            105.92 32.5    never smoked      1
## 4           Urban            171.23 34.4          smokes      1
## 5           Rural            174.12 24.0    never smoked      1
## 6           Urban            186.21 29.0 formerly smoked      1
## 7           Rural             70.09 27.4    never smoked      1
## 8           Urban             94.39 22.8    never smoked      1
## 9           Rural             76.15   NA         Unknown      1
## 10          Urban             58.57 24.2         Unknown      1

4.3 Liệt kê 6 quan sát cuối cùng của dữ liệu

# Show the last 6 observations.
tail(x = df, n = 6L)
##         id gender age hypertension heart_disease ever_married     work_type
## 5105 14180 Female  13            0             0           No      children
## 5106 18234 Female  80            1             0          Yes       Private
## 5107 44873 Female  81            0             0          Yes Self-employed
## 5108 19723 Female  35            0             0          Yes Self-employed
## 5109 37544   Male  51            0             0          Yes       Private
## 5110 44679 Female  44            0             0          Yes      Govt_job
##      Residence_type avg_glucose_level  bmi  smoking_status stroke
## 5105          Rural            103.08 18.6         Unknown      0
## 5106          Urban             83.75   NA    never smoked      0
## 5107          Urban            125.20 40.0    never smoked      0
## 5108          Rural             82.99 30.6    never smoked      0
## 5109          Rural            166.29 25.6 formerly smoked      0
## 5110          Urban             85.28 26.2         Unknown      0

4.4 Tóm tắt dữ liệu bằng hàm summary

# Column-by-column summary: quantiles and mean for numeric columns,
# length/class/mode for character columns, plus the NA count where present.
summary(object = df)
##        id           gender               age         hypertension    
##  Min.   :   67   Length:5110        Min.   : 0.08   Min.   :0.00000  
##  1st Qu.:17741   Class :character   1st Qu.:25.00   1st Qu.:0.00000  
##  Median :36932   Mode  :character   Median :45.00   Median :0.00000  
##  Mean   :36518                      Mean   :43.23   Mean   :0.09746  
##  3rd Qu.:54682                      3rd Qu.:61.00   3rd Qu.:0.00000  
##  Max.   :72940                      Max.   :82.00   Max.   :1.00000  
##                                                                      
##  heart_disease     ever_married        work_type         Residence_type    
##  Min.   :0.00000   Length:5110        Length:5110        Length:5110       
##  1st Qu.:0.00000   Class :character   Class :character   Class :character  
##  Median :0.00000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :0.05401                                                           
##  3rd Qu.:0.00000                                                           
##  Max.   :1.00000                                                           
##                                                                            
##  avg_glucose_level      bmi        smoking_status         stroke       
##  Min.   : 55.12    Min.   :10.30   Length:5110        Min.   :0.00000  
##  1st Qu.: 77.25    1st Qu.:23.50   Class :character   1st Qu.:0.00000  
##  Median : 91.89    Median :28.10   Mode  :character   Median :0.00000  
##  Mean   :106.15    Mean   :28.89                      Mean   :0.04873  
##  3rd Qu.:114.09    3rd Qu.:33.10                      3rd Qu.:0.00000  
##  Max.   :271.74    Max.   :97.60                      Max.   :1.00000  
##                    NA's   :201

Việc 5: Biên tập dữ liệu

5.1 Mã hoá biến sex

5.1 Mã hóa biến gender (Female/Male/Other) thành biến sex với giá trị 0/1/2 (0= Male; 1= Female; 2= Other)

# Recode gender into a numeric code, then into a labelled factor.
# Coding follows the task statement: 0 = Male, 1 = Female, 2 = Other.
# The numeric codes must agree with the label order used in factor() below;
# previously Female was coded 0 and Male 1 while the labels stayed in
# Male/Female order, so every woman was labelled "Male" and vice versa.
df$sex <- NA
df$sex[df$gender == "Male"] <- 0
df$sex[df$gender == "Female"] <- 1
df$sex[df$gender == "Other"] <- 2

# levels and labels are paired by position: 0 -> Male, 1 -> Female, 2 -> Other.
df$sex <- factor(df$sex, levels = c(0, 1, 2),
                 labels = c("Male", "Female", "Other"))

# Sanity check: the counts must match the distribution of df$gender.
table(df$sex)
## 
##   Male Female  Other 
##   2115   2994      1

5.2 Mã hoá biến bmi

Nếu bmi < 18.5 thì bmi_cat = “Underweight”; nếu 18.5 ≤ bmi < 25.0 thì bmi_cat = “Normal”;
nếu 25.0 ≤ bmi < 30.0 thì bmi_cat = “Overweight”; nếu bmi ≥ 30.0 thì bmi_cat = “Obese”.

# Classify bmi into four weight classes, stored as plain character text.
# findInterval() returns how many cut points are <= bmi (0 to 3), with
# left-closed intervals, so adding 1 gives the position of the class name:
#   bmi < 18.5 -> "Underweight", 18.5 <= bmi < 25 -> "Normal",
#   25 <= bmi < 30 -> "Overweight", bmi >= 30 -> "Obese".
# A missing bmi yields NA, so those rows stay unclassified.
df$bmi_cat <- c("Underweight", "Normal", "Overweight", "Obese")[
  findInterval(df$bmi, c(18.5, 25.0, 30.0)) + 1L
]

table(df$bmi_cat)
## 
##      Normal       Obese  Overweight Underweight 
##        1243        1920        1409         337

5.3 Biến Stroke 1

5.3 Chuyển biến số stroke thành biến factor với giá trị mới (0= No; 1= Yes)

# Turn the 0/1 stroke indicator into a factor with readable labels, as the
# task statement asks (0 = No, 1 = Yes). A bare as.factor() would keep the
# uninformative level names "0" and "1".
df$stroke1 <- factor(df$stroke, levels = c(0, 1), labels = c("No", "Yes"))
# stroke2 remains the plain, unlabelled factor copy of stroke; it is kept so
# that the set of columns in df does not change.
df$stroke2 <- as.factor(df$stroke)

# Cross-check the recode against the source column: every count must fall
# in the No/0 or Yes/1 cell.
table(df$stroke1, df$stroke)
##      
##          0    1
##   No  4861    0
##   Yes    0  249
head(df)
##      id gender age hypertension heart_disease ever_married     work_type
## 1  9046   Male  67            0             1          Yes       Private
## 2 51676 Female  61            0             0          Yes Self-employed
## 3 31112   Male  80            0             1          Yes       Private
## 4 60182 Female  49            0             0          Yes       Private
## 5  1665 Female  79            1             0          Yes Self-employed
## 6 56669   Male  81            0             0          Yes       Private
##   Residence_type avg_glucose_level  bmi  smoking_status stroke    sex
## 1          Urban            228.69 36.6 formerly smoked      1 Female
## 2          Rural            202.21   NA    never smoked      1   Male
## 3          Rural            105.92 32.5    never smoked      1 Female
## 4          Urban            171.23 34.4          smokes      1   Male
## 5          Rural            174.12 24.0    never smoked      1   Male
## 6          Urban            186.21 29.0 formerly smoked      1 Female
##      bmi_cat stroke1 stroke2
## 1      Obese     Yes       1
## 2       <NA>     Yes       1
## 3      Obese     Yes       1
## 4      Obese     Yes       1
## 5     Normal     Yes       1
## 6 Overweight     Yes       1

Việc 6: Phân tích mô tả

6.1 Mô tả đặc điểm tuổi (age), giới tính (gender), bệnh cao huyết áp (hypertension), bệnh tim (heart_disease), tình trạng gia đình (ever_married), việc làm (work_type), nơi ở (Residence_type), nồng độ đường huyết (avg_glucose_level), chỉ số khối cơ thể (bmi), và tình trạng hút thuốc (smoking_status) theo tình trạng đột quị (stroke)

library(table1)
# Descriptive table of patient characteristics, one column per stroke status.
# - Stratify by stroke1 (the factor copy of stroke) rather than the numeric
#   stroke column: table1() warns when the term to the right of '|' is not
#   a factor.
# - ever_married is included because the task statement lists it among the
#   characteristics to describe; it was missing from the formula.
# - hypertension and heart_disease are still numeric 0/1 here, so they are
#   summarised as mean (SD) rather than as counts and percentages.
table1(~ age + gender + hypertension + heart_disease + ever_married +
         work_type + Residence_type + avg_glucose_level + bmi +
         smoking_status | stroke1,
       data = df)
0
(N=4861)
1
(N=249)
Overall
(N=5110)
age
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
gender
Female 2853 (58.7%) 141 (56.6%) 2994 (58.6%)
Male 2007 (41.3%) 108 (43.4%) 2115 (41.4%)
Other 1 (0.0%) 0 (0%) 1 (0.0%)
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
heart_disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
work_type
children 685 (14.1%) 2 (0.8%) 687 (13.4%)
Govt_job 624 (12.8%) 33 (13.3%) 657 (12.9%)
Never_worked 22 (0.5%) 0 (0%) 22 (0.4%)
Private 2776 (57.1%) 149 (59.8%) 2925 (57.2%)
Self-employed 754 (15.5%) 65 (26.1%) 819 (16.0%)
Residence_type
Rural 2400 (49.4%) 114 (45.8%) 2514 (49.2%)
Urban 2461 (50.6%) 135 (54.2%) 2596 (50.8%)
avg_glucose_level
Mean (SD) 105 (43.8) 133 (61.9) 106 (45.3)
Median [Min, Max] 91.5 [55.1, 268] 105 [56.1, 272] 91.9 [55.1, 272]
bmi
Mean (SD) 28.8 (7.91) 30.5 (6.33) 28.9 (7.85)
Median [Min, Max] 28.0 [10.3, 97.6] 29.7 [16.9, 56.6] 28.1 [10.3, 97.6]
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)
smoking_status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)
# Shorter descriptive table (age, the two comorbidities, bmi and smoking
# status), one column per stroke status. Stratifying by the factor stroke1
# instead of the numeric stroke column avoids the table1() warning about
# non-factor terms to the right of '|'.
table1(~ age + hypertension + heart_disease + bmi + smoking_status | stroke1,
       data = df)
0
(N=4861)
1
(N=249)
Overall
(N=5110)
age
Mean (SD) 42.0 (22.3) 67.7 (12.7) 43.2 (22.6)
Median [Min, Max] 43.0 [0.0800, 82.0] 71.0 [1.32, 82.0] 45.0 [0.0800, 82.0]
hypertension
Mean (SD) 0.0889 (0.285) 0.265 (0.442) 0.0975 (0.297)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
heart_disease
Mean (SD) 0.0471 (0.212) 0.189 (0.392) 0.0540 (0.226)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
bmi
Mean (SD) 28.8 (7.91) 30.5 (6.33) 28.9 (7.85)
Median [Min, Max] 28.0 [10.3, 97.6] 29.7 [16.9, 56.6] 28.1 [10.3, 97.6]
Missing 161 (3.3%) 40 (16.1%) 201 (3.9%)
smoking_status
formerly smoked 815 (16.8%) 70 (28.1%) 885 (17.3%)
never smoked 1802 (37.1%) 90 (36.1%) 1892 (37.0%)
smokes 747 (15.4%) 42 (16.9%) 789 (15.4%)
Unknown 1497 (30.8%) 47 (18.9%) 1544 (30.2%)

6.2 Bạn nhận xét như thế nào về kết quả của bệnh cao huyết áp và bệnh tim. Làm cách nào để trình bày kết quả tốt hơn?

library(ggplot2)

# Proportion bar chart: one bar per stroke status, split by hypertension
# status. position = "fill" rescales every bar to the same total height, so
# the segments read as within-group proportions rather than raw counts.
ggplot(data = df) +
  geom_bar(
    mapping = aes(x = factor(stroke), fill = factor(hypertension)),
    position = "fill"
  ) +
  labs(fill = "Hypertension", x = "Stroke", y = "Tỷ lệ")